Sha256: a44bc901196a5f37ef48da64046e16ef35ef79595cc049c5a44826f87694e23e
Contents?: true
Size: 1.9 KB
Versions: 2
Compression:
Stored size: 1.9 KB
Contents
# encoding: UTF-8

class TextNlp
  # A dictionary of (possibly multi-word) expressions stored as a token trie,
  # used to locate known expressions inside a text.
  #
  # Strings are processed with String#normalize and String#tokenize. Those
  # methods are not defined in this file and are expected to be provided
  # elsewhere; #tokenize must return an Array of String tokens.
  class Expressions
    # Normalized form of every expression added so far, in insertion order.
    attr_accessor :values

    # @param expressions [Enumerable<String>] expressions to register up front
    def initialize(expressions = [])
      @root = {}
      @values = []
      expressions.each { |expr| self << expr }
    end

    # Registers an expression: it is normalized, recorded in #values, and its
    # tokens are inserted into the trie.
    #
    # @param expression [String]
    # @return [Expressions] self, so that calls can be chained
    def <<(expression)
      expression = expression.normalize
      @values << expression

      tokens = expression.tokenize
      node = tokens.inject(@root) { |current, token| current[token] ||= {} }
      # Trie children are keyed by String tokens, so the Symbol :leaf marker
      # cannot clash with a token. An expression without tokens must not turn
      # the root into a leaf.
      node[:leaf] = 1 unless tokens.empty?
      self
    end

    # @param text [String]
    # @return [Boolean] true when at least one registered expression occurs
    def any?(text)
      !find(text).empty?
    end

    # Tokenizes +text+ and merges every run of tokens that forms an expression
    # found in it into a single element.
    #
    # Longer expressions are merged first. Matching is done on whole tokens, so
    # an expression never matches inside longer tokens.
    #
    # @param text [String]
    # @return [Array<String>] tokens, with matched expressions as one element
    def expressionize(text)
      longest_first(find(text)).inject(text.tokenize) do |pieces, expr|
        merge_occurrences(pieces, expr)
      end
    end

    # Scans the normalized text from left to right and returns each occurrence
    # of a registered expression, always preferring the longest expression that
    # starts at the current position. Occurrences do not overlap.
    #
    # @param text [String]
    # @return [Array<String>] matched expressions (tokens joined by a space),
    #   in order of appearance; an expression occurring twice is listed twice
    def find(text)
      find_expressions(0, text.normalize.tokenize)
    end

    private

    # Iterative scan behind #find. The step size varies (one token on a miss,
    # the match length on a hit), hence the explicit index. Being iterative,
    # its stack depth does not grow with the number of tokens.
    def find_expressions(start_index, tokens, expressions = [])
      index = start_index
      while index < tokens.size
        length = longest_match_length(tokens, index)
        if length > 0
          expressions << tokens[index, length].join(' ')
          index += length
        else
          index += 1
        end
      end
      expressions
    end

    # Walks the trie from tokens[start] and returns the token count of the
    # longest registered expression starting there (0 when there is none).
    # Remembering the deepest leaf seen makes the result correct both when a
    # token mismatches and when the input ends in the middle of a trie path.
    def longest_match_length(tokens, start)
      node = @root
      best = 0
      start.upto(tokens.size - 1) do |i|
        node = node[tokens[i]]
        break unless node
        best = i - start + 1 if node.key?(:leaf)
      end
      best
    end

    # Orders distinct expressions by decreasing token count; expressions of
    # equal length keep their order of first appearance.
    def longest_first(expressions)
      ranked = expressions.uniq.each_with_index.sort_by do |expr, position|
        [-expr.tokenize.size, position]
      end
      ranked.map(&:first)
    end

    # Returns a copy of +pieces+ in which every run of elements equal to the
    # tokens of +expression+ is replaced by +expression+ itself.
    def merge_occurrences(pieces, expression)
      words = expression.tokenize
      return pieces if words.empty?

      merged = []
      index = 0
      while index < pieces.size
        if pieces[index, words.size] == words
          merged << expression
          index += words.size
        else
          merged << pieces[index]
          index += 1
        end
      end
      merged
    end
  end
end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
text_nlp-0.0.2 | lib/text_nlp/expressions.rb |
text_nlp-0.0.1 | lib/text_nlp/expressions.rb |