Sha256: 04dfcb3d6ce83d690ba8d3c0c282aa0d8c3d19943ae67dc9d31576f50043e463

Contents?: true

Size: 1.43 KB

Versions: 1

Compression:

Stored size: 1.43 KB

Contents

# BasicTokenizer breaks a given string into a list of tokens. Tokens are the runs of word characters
# (letters, digits and underscores) the string contains; punctuation marks can optionally be kept as tokens too.
module Company
  module Mapping
    # Splits text into tokens.
    #
    # A token is either a maximal run of word characters (anything matched by
    # +\w+, i.e. ASCII letters, digits and underscore) or, when punctuation is
    # not ignored, a single character that is neither a word character nor
    # whitespace. Whitespace is never emitted and only separates tokens.
    class BasicTokenizer
      # Runs of word characters only (punctuation and whitespace are skipped).
      WORDS = /\w+/

      # Runs of word characters, or any one non-word, non-whitespace character.
      WORDS_AND_PUNCTUATION = /\w+|[^\w\s]/

      private_constant :WORDS, :WORDS_AND_PUNCTUATION

      # @param ignorePunctuation [Boolean] when truthy, characters that are
      #   neither word characters nor whitespace are dropped; otherwise each
      #   one becomes its own single-character token
      # @param ignoreCase [Boolean] when truthy, the text is downcased before
      #   it is split into tokens
      def initialize(ignorePunctuation = true, ignoreCase = true)
        # Instance variable names are kept as-is so existing subclasses that
        # read them keep working.
        @doIgnorePunctuation = ignorePunctuation
        @doIgnoreCase = ignoreCase
      end

      # @return [String] a description of this tokenizer and its two settings
      def to_s
        "{BasicTokenizer: (IgnoresPunctuation: #{@doIgnorePunctuation}, IgnoresCase: #{@doIgnoreCase})}"
      end

      # Breaks +text+ into an ordered list of tokens.
      #
      # @param text [#to_s] the input; it is converted with +to_s+, so +nil+
      #   yields an empty list
      # @return [Array<String>] the tokens in the order they appear in +text+
      def tokenize(text)
        # A single scan replaces the former character-by-character loop, which
        # also depended on StringIO without requiring it.
        pattern = @doIgnorePunctuation ? WORDS : WORDS_AND_PUNCTUATION
        transform(text).scan(pattern)
      end

      private

      # Converts +text+ to a String, downcasing it when case is ignored.
      #
      # @param text [#to_s]
      # @return [String]
      def transform(text)
        string = text.to_s
        @doIgnoreCase ? string.downcase : string
      end
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
company-mapping-0.1.0 lib/company/mapping/document_utils/basic_tokenizer.rb