require 'set'

# Strabo assists full text search indexing by generating term-frequency maps
# for an object's attributes. The term-frequency map may be flattened into
# an index for the entire object.
#
# Strabo was written with MongoDB in mind. The idea is that a document will
# store its own embedded keyword index that MongoDB can use for full text
# search.
#
# @example Using Strabo
#   class Book < Hash
#     include Strabo::Indexer
#   end
#
#   book = Book.new
#   book['title'] = 'Learn to Program'
#   book['author'] = 'Chris Pine'
#   book.keywords
#   # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
#
# @author Jon Morton
#
module Strabo
  # Stemming configuration. By default, Strabo performs no stemming.
  #
  # @example Configuring stemming
  #   require 'rubygems'
  #   require 'lingua/stemmer'
  #   Strabo::Stemmer.stemmer = lambda { |term| Lingua.stemmer(term) }
  #
  # @see http://github.com/aurelian/ruby-stemmer Ruby-Stemmer on github
  #
  module Stemmer
    # Set the stemmer used during tokenization.
    #
    # @param [#call, nil] stemmer callable invoked with one token at a time;
    #   assigning nil disables stemming again
    #
    # @see Strabo::Stemmer.stem
    def self.stemmer=(stemmer)
      @stemmer = stemmer
    end

    # Invokes the configured stemmer on each token. If no stemmer has been
    # configured, the tokens are returned unchanged.
    #
    # Tokens may be given as separate arguments or as (nested) arrays; they
    # are flattened first so the stemmer always receives a single token, never
    # an Array of tokens.
    #
    # @param [Array] tokens one or more tokens, or arrays of tokens
    #
    # @return [Array] flat list of stemmed tokens
    #
    # @see Strabo::Stemmer.stemmer=
    def self.stem(*tokens)
      flat_tokens = tokens.flatten
      return flat_tokens if @stemmer.nil?

      # The trailing flatten keeps the result flat even if the stemmer
      # returns an array for a token.
      flat_tokens.map { |token| @stemmer.call(token) }.flatten
    end
  end

  # Defines how a single string is divided into multiple strings.
  module Tokenizer
    # Break a value into a list of lowercase terms. Every character other than
    # ASCII letters, digits and whitespace is removed before splitting.
    #
    # @param [#to_s] value text to convert into a list; non-String values
    #   (including nil) are converted with to_s first
    # @param [Regexp] delimiter pattern whose matches become the terms
    #
    # @return [Array<String>] list of terms (not stemmed)
    #
    # @private
    def self.tokenize(value, delimiter = /\S+/)
      value.to_s.downcase.gsub(/[^a-z0-9\s]/i, '').scan(delimiter)
    end
  end

  # Mixin that adds keyword indexing to any object whose #each yields
  # attribute/value pairs (for example a Hash).
  module Indexer
    # Get the attribute-term-frequency map. When flattened, the result is a
    # term-frequency map without the context of the attribute.
    #
    # @param [TrueClass, FalseClass, Object] flatten false returns the
    #   per-attribute map, true returns the flattened map, and any other
    #   value returns only the list of terms
    #
    # @return [Hash, Array] { attribute => { term => frequency } },
    #   { term => frequency }, or an Array of terms
    def keywords(flatten = false)
      term_map = {}
      each do |key, value|
        term_map[key] = frequency(Stemmer.stem(Tokenizer.tokenize(value)))
      end

      case flatten
      when false then term_map
      when true then flatten_keyword_map(term_map)
      else flatten_keyword_map(term_map).keys
      end
    end

    private

    # Tally the number of occurrences of each value in a list.
    #
    # @param [Array] values list of terms to count
    #
    # @return [Hash] term-frequency map
    #
    # @private
    def frequency(values)
      values.each_with_object({}) do |term, counts|
        counts[term] = counts.fetch(term, 0) + 1
      end
    end

    # Merge the per-attribute term-frequency maps into a single map by
    # summing the frequency of each term across all attributes.
    #
    # @param [Hash] map { attribute => { term => frequency } }
    #
    # @return [Hash] { term => frequency }
    #
    # @see Strabo::Indexer#keywords
    #
    # @private
    def flatten_keyword_map(map)
      map.each_with_object({}) do |(_attribute, terms), totals|
        terms.each do |term, count|
          totals[term] = totals.fetch(term, 0) + count
        end
      end
    end
  end
end