require 'set'

# Strabo assists full text search indexing by generating term-frequency maps
# for an object's attributes.  The term-frequency map may be flattened into
# an index for the entire object.
#
# Strabo was written with MongoDB in mind.  The idea is that a document will
# store its own embedded keyword index that MongoDB can use for full text
# search.
#
# @example: Using strabo
#   class Book < Hash
#     include Strabo::Indexer
#   end
#
#   book = Book.new
#   book['title'] = 'Learn to Program'
#   book['author'] = 'Chris Pine'
#   book.keywords # => {"title"=>{"learn"=>1, "to"=>1, "program"=>1}, "author"=>{"chris"=>1, "pine"=>1}}
#
# @author: Jon Morton
#
module Strabo
  
  # Stemming configuration.  By default, Strabo performs no stemming.
  #
  # @example: Configuring stemming
  #  require 'rubygems'
  #  require 'lingua/stemmer'
  #  Strabo::Stemmer.stemmer = lambda { |term| Lingua.stemmer(term) }
  #
  # @see http://github.com/aurelian/ruby-stemmer Ruby-Stemmer on github
  #
  module Stemmer

    # Set the stemmer used during tokenization.  Pass nil to disable
    # stemming again.
    #
    # @param [#call, nil] stemmer callable invoked with one token at a time;
    #   it may return a single term or an Array of terms
    #
    # @see Strabo::Stemmer.stem
    def self.stemmer=(stemmer)
      @stemmer = stemmer
    end

    # Invokes the configured stemmer on every token.  If no stemmer has been
    # configured, the tokens are returned unchanged (as one flat list).
    #
    # Tokens may be given as separate arguments, as a single Array, or as any
    # mix of the two; nested Arrays are flattened first so the stemmer always
    # receives an individual token rather than a whole list.
    #
    # @param [Array] tokens one or more tokens, or Arrays of tokens
    #
    # @return [Array] flat list of stemmed tokens
    #
    # @see Strabo::Stemmer.stemmer=
    def self.stem(*tokens)
      flat_tokens = tokens.flatten
      return flat_tokens if @stemmer.nil?

      # A stemmer may expand one token into several terms, so flatten the
      # results as well.
      flat_tokens.map { |token| @stemmer.call(token) }.flatten
    end
  end
  
  # Defines how a single string is divided into multiple strings.
  module Tokenizer

    # Break a value into a list of lowercase strings.
    #
    # The value is converted with +to_s+ first, so non-String attribute
    # values (nil, numbers, symbols) can be tokenized instead of raising.
    # Every character other than ASCII letters, digits and whitespace is
    # removed (not replaced by a space) before scanning, so "foo-bar"
    # becomes the single token "foobar".
    #
    # @param [#to_s] value text to convert into a list
    # @param [Regexp] delimiter pattern passed to String#scan; each match
    #   becomes one token (the default matches runs of non-whitespace)
    #
    # @return [Array] list of tokens (not stemmed)
    #
    # @private
    def self.tokenize(value, delimiter = /\S+/)
      normalized = value.to_s.downcase.gsub(/[^a-z0-9\s]/i,'')
      normalized.scan(delimiter)
    end

  end
  
  # Mixin that builds keyword indexes for the including object.  The includer
  # must respond to +each+ and yield attribute/value pairs, as Hash does.
  module Indexer

    # Get attribute-term-frequency map. If flattened, a term-frequency map
    # without the context of the attribute.
    #
    # The map is rebuilt on every call and is not stored on the receiver.
    #
    # @param [TrueClass, FalseClass, Object] flatten false (the default)
    #   keeps one term-frequency map per attribute; true merges them into a
    #   single term-frequency map; any other value (including nil) returns
    #   only the distinct terms
    #
    # @return [Hash, Array] { attribute => { term => frequency } },
    #                       { term => frequency }, or a list of terms.
    def keywords(flatten = false)
      term_map = {}
      each do |key, value|
        term_map[key] = frequency(Stemmer.stem(Tokenizer.tokenize(value)))
      end

      # `when x then y` is used because the older `when x : y` form is a
      # syntax error on Ruby 1.9 and later.
      case flatten
      when false then term_map
      when true  then flatten_keyword_map(term_map)
      else            flatten_keyword_map(term_map).keys
      end
    end

  private

    # Tally the number of occurrences of a value in a list.
    #
    # @param [Array] values list of terms to count
    #
    # @return [Hash] term-frequency map
    #
    # @private
    def frequency(values)
      values.inject({}) do |counts, term|
        counts[term] = (counts[term] || 0) + 1
        counts
      end
    end

    # Merge the per-attribute maps into one map, summing the frequency of a
    # term that appears under more than one attribute.
    #
    # @param [Hash] map { attribute => { term => frequency } }
    #
    # @return [Hash] { term => frequency }
    #
    # @see Strabo::Indexer#keywords
    #
    # @private
    def flatten_keyword_map(map)
      totals = {}
      map.each_value do |terms|
        terms.each do |term, count|
          totals[term] = (totals[term] || 0) + count
        end
      end
      totals
    end

  end
end