Sha256: e8ad78b8866df16aad09d4068aba46186cc458777e40d301299abefcecce797b
Contents?: true
Size: 1.88 KB
Versions: 1
Compression:
Stored size: 1.88 KB
Contents
# encoding: UTF-8

class TextNlp
  # A collection of (possibly multi-word) expressions stored in a token trie,
  # used to locate those expressions inside arbitrary text.
  #
  # NOTE(review): this class calls String#normalize, String#normalize! and
  # String#tokenize, none of which are defined in this file; they are
  # presumably String extensions provided elsewhere in the gem.
  class Expressions
    # Trie key marking a node at which a complete expression ends. Child
    # nodes are keyed by token (Strings), so a Symbol cannot collide.
    LEAF = :leaf

    # Expressions registered so far, in insertion order (as left by
    # +normalize!+).
    attr_accessor :values

    # @param expressions [Array<String>] expressions to register immediately
    def initialize(expressions = [])
      @root = {}
      @values = []
      expressions.each { |expression| self << expression }
    end

    # Registers an expression.
    #
    # The argument has +normalize!+ called on it before being stored, so the
    # caller's object is presumably modified in place.
    #
    # @param expression [String]
    # @return [self] so that calls can be chained (+exprs << a << b+)
    def <<(expression)
      expression.normalize!
      @values << expression
      tokens = expression.tokenize
      unless tokens.empty?
        last_node = tokens.reduce(@root) { |node, token| node[token] ||= {} }
        last_node[LEAF] = true
      end
      self
    end

    # @param text [String]
    # @return [Boolean] true when at least one registered expression is found
    def any?(text)
      !find(text).empty?
    end

    # Tokenizes +text+ and merges every run of tokens that spells a found
    # expression into a single element.
    #
    # Merging works on whole tokens, so an expression can never match across
    # the middle of an unrelated token.
    #
    # @param text [String]
    # @return [Array<String>] tokens of +text+, with expressions merged
    def expressionize(text)
      # Longest expressions first; ties keep discovery order so the result is
      # deterministic (sort_by alone is not guaranteed to be stable).
      ordered = find(text).uniq.each_with_index
                          .sort_by { |expression, position| [-expression.tokenize.size, position] }
                          .map(&:first)
      ordered.reduce(text.tokenize) { |tokens, expression| merge_expression(tokens, expression) }
    end

    # @param text [String]
    # @return [Array<String>] every expression occurrence found in the
    #   normalized text, left to right, each as its tokens joined by a space
    def find(text)
      find_expressions(0, text.normalize.tokenize)
    end

    private

    # Scans +tokens+ from +start_index+, greedily taking the longest
    # registered expression at each position. Positions covered by a match
    # are skipped; otherwise the scan advances by one token.
    #
    # Implemented as a loop rather than recursion so that long inputs cannot
    # exhaust the call stack.
    #
    # @param start_index [Integer] index of the first token to examine
    # @param tokens [Array<String>]
    # @param expressions [Array<String>] accumulator, appended to in place
    # @return [Array<String>] the accumulator
    def find_expressions(start_index, tokens, expressions = [])
      index = start_index
      while index < tokens.size
        length = longest_match_length(tokens, index)
        if length.zero?
          index += 1
        else
          expressions << tokens[index, length].join(' ')
          index += length
        end
      end
      expressions
    end

    # Walks the trie along +tokens+ starting at +from+.
    #
    # The length of the last complete expression seen is remembered, so a
    # shorter expression is still reported when a longer candidate fails,
    # whether through a mismatching token or the end of the input.
    #
    # @return [Integer] token count of the longest expression starting at
    #   +from+, or 0 when none starts there
    def longest_match_length(tokens, from)
      node = @root
      longest = 0
      (from...tokens.size).each do |position|
        node = node[tokens[position]]
        break unless node

        longest = position - from + 1 if node.key?(LEAF)
      end
      longest
    end

    # Returns a copy of +tokens+ in which each run of consecutive tokens
    # equal to the tokens of +expression+ is replaced by +expression+.
    def merge_expression(tokens, expression)
      parts = expression.tokenize
      return tokens if parts.empty? # nothing to match; also avoids a zero-step loop

      merged = []
      index = 0
      while index < tokens.size
        if tokens[index, parts.size] == parts
          merged << expression
          index += parts.size
        else
          merged << tokens[index]
          index += 1
        end
      end
      merged
    end
  end
end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
text_nlp-0.0.3 | lib/text_nlp/expressions.rb |