# BasicTokenizer breaks a given string into an ordered list of tokens. A token is a run of
# word characters (letters, digits, underscore); punctuation can optionally be kept as single-character tokens.
module Company
  module Mapping
    # Splits text into an ordered list of tokens.
    #
    # A token is a maximal run of word characters (+\w+: ASCII letters, digits
    # and underscore). Whitespace only separates tokens and is never returned.
    # Any other character is either dropped or returned as a one-character
    # token, depending on how the tokenizer was configured.
    #
    # NOTE: +\w+ is ASCII-only in Ruby regexps, so non-ASCII letters (e.g. "é")
    # are handled as punctuation rather than as part of a word.
    class BasicTokenizer
      # Matches runs of word characters only; everything else is skipped.
      WORDS = /\w+/.freeze
      # Matches runs of word characters, or any single character that is
      # neither a word character nor whitespace.
      WORDS_AND_PUNCTUATION = /\w+|[^\w\s]/.freeze
      private_constant :WORDS, :WORDS_AND_PUNCTUATION

      # @param ignorePunctuation [Boolean] when truthy, characters that are
      #   neither word characters nor whitespace are dropped; otherwise each
      #   one becomes its own token.
      # @param ignoreCase [Boolean] when truthy, the text is downcased before
      #   it is split, so all tokens come back in lower case.
      def initialize(ignorePunctuation = true, ignoreCase = true)
        @ignore_punctuation = ignorePunctuation
        @ignore_case = ignoreCase
      end

      # @return [String] a description of this tokenizer and its two settings.
      def to_s
        "{BasicTokenizer: (IgnoresPunctuation: #{@ignore_punctuation}, IgnoresCase: #{@ignore_case})}"
      end

      # Breaks +text+ into tokens, in the order they appear.
      #
      # @param text [#to_s] the input; it is converted with +to_s+, so +nil+
      #   yields an empty list.
      # @return [Array<String>] the tokens found, possibly empty.
      def tokenize(text)
        # A single scan replaces the manual character loop; it needs no
        # StringIO (which this file never required).
        pattern = @ignore_punctuation ? WORDS : WORDS_AND_PUNCTUATION
        transform(text).scan(pattern)
      end

      private

      # Converts +text+ to a String, downcasing it when case is ignored.
      def transform(text)
        string = text.to_s
        @ignore_case ? string.downcase : string
      end
    end
  end
end