Sha256: b23b704ec2f59af37f52943096b30d13646c8b22b48b8870cbeaca0d1e1220f1

Contents?: true

Size: 1.1 KB

Versions: 1

Compression:

Stored size: 1.1 KB

Contents

# A set of (possibly multi-word) strings that can be matched against free
# text. Text is split on runs of non-word characters, expanded into word
# n-grams up to the longest entry in the set, and intersected with the set.
#
# When the +:stem+ option is truthy, every token is passed through
# +String#stem+ before comparison.
class StringSet
  # Library error class. Nothing in this class raises it.
  class Error < ::RuntimeError; end

  # Separator pattern: any run of non-word characters.
  TOKENIZER = /\W+/

  # @return [Array<String>] the entries of the set (stemmed when stemming is on)
  # @return [Integer] token count of the longest entry, 0 for an empty set
  attr_reader :strings, :max_token_size

  # @return [Boolean] whether tokens are stemmed before comparison
  def stemming?
    !!@stemming
  end

  # @param strings [Array<String>, String, #to_s] entries of the set; anything
  #   that is not an Array is converted with +tokenize+ into single-word entries
  # @param options [Hash] +:stem+ => truthy enables stemming
  def initialize(strings = [], options = {})
    @stemming = options[:stem]
    # Copy so that the in-place stemming below never rewrites the caller's array.
    @strings = tokenize(strings).dup
    @max_token_size = @strings.map { |str| split_tokens(str).length }.max.to_i
    @strings.map! { |str| stem(split_tokens(str)).join(" ") } if stemming?
  end

  # Finds which entries of the set occur in +strings+.
  #
  # @param strings [Array<String>, String, #to_s] text (or pre-split tokens)
  # @return [Array<String>] matching entries, without duplicates, ordered by
  #   first appearance in the expanded token list (single tokens first)
  def substrings_in(strings)
    tokenize(strings, true) & @strings
  end

  # Turns the input into a list of tokens.
  #
  # An Array is taken as already tokenized and returned unchanged; a String
  # is split on TOKENIZER and stemmed (when stemming is on); anything else is
  # converted with +to_s+ first.
  #
  # @param strings [Array<String>, String, #to_s]
  # @param ngramize [Boolean] also append word n-grams (see #ngramize)
  # @return [Array<String>]
  def tokenize(strings, ngramize = false)
    tokens =
      case strings
      when Array then strings
      when String then stem(split_tokens(strings))
      # Recurse WITHOUT n-gram expansion: expansion happens exactly once below.
      else tokenize(strings.to_s)
      end
    ngramize ? ngramize(tokens) : tokens
  end

  # Appends every run of 2..size consecutive tokens, joined by a single
  # space, to the token list.
  #
  # @param tokens [Array<String>]
  # @param size [Integer] largest n-gram length to generate
  # @return [Array<String>] a new array: the tokens followed by the n-grams
  def ngramize(tokens, size = @max_token_size)
    buffer = []
    2.upto(size) do |n|
      # each_cons yields nothing when n exceeds tokens.length.
      tokens.each_cons(n) { |gram| buffer << gram.join(" ") }
    end
    tokens + buffer
  end

  # Stems each token when stemming is enabled; otherwise returns +tokens+
  # untouched.
  #
  # @param tokens [Array<String>]
  # @return [Array<String>]
  def stem(tokens)
    return tokens unless stemming?

    # Lazy-load the stemmer only when nothing has provided String#stem yet.
    require "stemmer" unless "".respond_to?(:stem)
    tokens.map(&:stem)
  end

  private

  # Splits text on TOKENIZER, dropping the empty token that String#split
  # yields when the text begins with a separator.
  def split_tokens(text)
    text.split(TOKENIZER).reject(&:empty?)
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
fizx-stringset-0.2.2 lib/stringset.rb