Sha256: f7ed4a3b573fecd208588689dbd7ffd49096749c118869e7393ef8693fb0c45f

Contents?: true

Size: 1.26 KB

Versions: 4

Compression:

Stored size: 1.26 KB

Contents

# BasicTokenizer breaks a given string into a list of tokens. The words and the sequences of
# digits the string contains are regarded as tokens.
module Company
  module Mapping
    # Splits text into tokens.
    #
    # A token is either a maximal run of word characters (whatever the regexp
    # class \w matches) or, when punctuation is not ignored, a single character
    # that is neither a word character nor whitespace. Whitespace only
    # separates tokens and is never emitted.
    class BasicTokenizer
      # A run of word characters.
      WORD_PATTERN = /\w+/
      # A run of word characters, or one non-word, non-whitespace character.
      WORD_OR_PUNCTUATION_PATTERN = /\w+|[^\w\s]/

      # @param ignorePunctuation [Boolean] when truthy, punctuation characters
      #   are dropped from the token list
      # @param ignoreCase [Boolean] when truthy, the text is downcased before
      #   it is tokenized
      def initialize(ignorePunctuation = true, ignoreCase = true)
        @doIgnorePunctuation = ignorePunctuation
        @doIgnoreCase = ignoreCase
      end

      # @return [String] a description of the tokenizer and its two settings
      def to_s
        "{BasicTokenizer: (IgnoresPunctuation: #{@doIgnorePunctuation}, IgnoresCase: #{@doIgnoreCase})}"
      end

      # Breaks +text+ into tokens, in order of appearance.
      #
      # Scanning with a pattern visits every character exactly once, so a
      # punctuation character directly following a word is kept (the former
      # index-based loop advanced one position too far after each word and
      # lost it).
      #
      # @param text [#to_s] the text to tokenize; it is converted with +to_s+,
      #   so +nil+ yields an empty list
      # @return [Array<String>] the tokens found
      def tokenize(text)
        pattern = @doIgnorePunctuation ? WORD_PATTERN : WORD_OR_PUNCTUATION_PATTERN
        tranform(text).scan(pattern)
      end

      private

      # Converts +text+ to a String and downcases it when case is ignored.
      def tranform(text)
        string = text.to_s
        @doIgnoreCase ? string.downcase : string
      end
    end
  end
end

Version data entries

4 entries across 4 versions & 1 rubygems

Version Path
company-mapping-0.2.3 lib/company/mapping/document_utils/basic_tokenizer.rb
company-mapping-0.2.2 lib/company/mapping/document_utils/basic_tokenizer.rb
company-mapping-0.2.1 lib/company/mapping/document_utils/basic_tokenizer.rb
company-mapping-0.2.0 lib/company/mapping/document_utils/basic_tokenizer.rb