Sha256: a44bc901196a5f37ef48da64046e16ef35ef79595cc049c5a44826f87694e23e

Contents?: true

Size: 1.9 KB

Versions: 2

Compression:

Stored size: 1.9 KB

Contents

# encoding: UTF-8

class TextNlp
  # A set of multi-word expressions stored as a prefix tree of tokens
  # (nested hashes keyed by token), used to detect those expressions in text.
  #
  # Relies on String#normalize and String#tokenize, which are not defined in
  # this file (presumably String extensions provided elsewhere in the project).
  class Expressions

    # Normalized form of every expression added so far, in insertion order.
    attr_accessor :values

    # expressions - optional list of strings to register immediately.
    def initialize(expressions = [])
      @root, @values = {}, []
      expressions.each { |expr| self << expr }
    end

    # Registers an expression: normalizes it, records it in #values and
    # inserts its tokens into the prefix tree. The node reached by the last
    # token is flagged with :leaf.
    #
    # Returns the token list of the normalized expression (the historical
    # return value of this method, kept for backward compatibility).
    def <<(expression)
      expression = expression.normalize
      @values << expression
      tokens = expression.tokenize
      node = @root
      tokens.each { |token| node = (node[token] ||= {}) }
      # An expression without tokens must not flag the root as a match.
      node[:leaf] = 1 unless tokens.empty?
      tokens
    end

    # Returns true when at least one registered expression occurs in text.
    def any?(text)
      !find(text).empty?
    end

    # Tokenizes text and merges every run of tokens that forms a detected
    # expression into a single element (tokens joined by a space).
    # Longer expressions are merged before shorter ones.
    #
    # Matching is done token by token, so an expression can never match the
    # tail of a longer token (e.g. "new york" inside "anew york").
    #
    # NOTE(review): detection goes through #find, which normalizes text, while
    # the merge compares against the un-normalized tokens of text. If
    # String#normalize alters tokens, detected expressions will not be merged;
    # confirm whether callers are expected to pass normalized text.
    #
    # Returns an Array of strings.
    def expressionize(text)
      tokens = text.tokenize
      candidates = find(text).uniq.map { |expr| [expr, expr.tokenize] }
      candidates = candidates.sort_by { |_expr, expr_tokens| -expr_tokens.size }
      candidates.each do |expr, expr_tokens|
        tokens = merge_expression(tokens, expr, expr_tokens)
      end
      tokens
    end

    # Returns the registered expressions occurring in text, in order of
    # appearance (an expression occurring twice is listed twice). At each
    # position the longest registered expression wins, and scanning resumes
    # right after it.
    def find(text)
      find_expressions(0, text.normalize.tokenize)
    end

    private

    # Scans tokens from start_index and appends each match to expressions.
    # Iterative on purpose: one recursive call per position could exhaust the
    # stack on long texts.
    def find_expressions(start_index, tokens, expressions = [])
      index = start_index
      while index < tokens.size
        length = longest_match_length(tokens, index)
        if length > 0
          expressions << tokens[index, length].join(' ')
          index += length
        else
          index += 1
        end
      end
      expressions
    end

    # Walks the prefix tree from tokens[index] and returns the token count of
    # the longest registered expression starting there (0 when none).
    # Remembering the last leaf seen covers both a mismatching token and the
    # end of input, so "a" is still found in "a b" when only "a" and "a b c"
    # are registered.
    def longest_match_length(tokens, index)
      node = @root
      best = 0
      offset = 0
      while (index + offset) < tokens.size && node.key?(tokens[index + offset])
        node = node[tokens[index + offset]]
        offset += 1
        best = offset if node.key?(:leaf)
      end
      best
    end

    # Returns a copy of tokens in which every occurrence of the sequence
    # expr_tokens is replaced by the single element expr.
    def merge_expression(tokens, expr, expr_tokens)
      return tokens if expr_tokens.empty?
      merged = []
      index = 0
      while index < tokens.size
        if tokens[index, expr_tokens.size] == expr_tokens
          merged << expr
          index += expr_tokens.size
        else
          merged << tokens[index]
          index += 1
        end
      end
      merged
    end

  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
text_nlp-0.0.2 lib/text_nlp/expressions.rb
text_nlp-0.0.1 lib/text_nlp/expressions.rb