lib/company/mapping/document_utils/basic_tokenizer.rb in company-mapping-0.1.0 vs lib/company/mapping/document_utils/basic_tokenizer.rb in company-mapping-0.2.0

- old
+ new

# Human-readable summary of the tokenizer's two configuration flags.
#
# @return [String]
def to_s
  "{BasicTokenizer: (IgnoresPunctuation: #{@doIgnorePunctuation}, IgnoresCase: #{@doIgnoreCase})}"
end

# Splits +text+ into an ordered list of tokens.
#
# Whitespace (+\s+) only separates tokens and is never emitted. Each maximal
# run of word characters (+\w+) becomes one token. Every other character
# becomes its own single-character token, unless @doIgnorePunctuation is
# truthy, in which case those characters are dropped.
#
# The input is first passed through #tranform, so it is stringified with
# +to_s+ (nil yields no tokens) and lower-cased when @doIgnoreCase is truthy.
#
# @param text [#to_s] the text to tokenize
# @return [Array<String>] tokens in the order they appear in +text+
def tokenize(text)
  # A single regex scan replaces the hand-rolled index loop. That loop
  # advanced the index once more after consuming a word, which silently
  # swallowed the character right after every word ("ab,cd" lost its ",").
  pattern = @doIgnorePunctuation ? /\w+/ : /\w+|[^\w\s]/
  tranform(text).scan(pattern)
end

private

# Normalizes the raw input before tokenizing: always converts it to a String
# and lower-cases it when @doIgnoreCase is truthy.
#
# @param text [#to_s]
# @return [String]
def tranform(text)
  normalized = text.to_s
  @doIgnoreCase ? normalized.downcase : normalized
end