mongoid_fulltext.rb in mongoid_fulltext-0.5.2

- old
+ new

@@ -1,7 +1,8 @@
 require 'mongoid_indexes'
 require 'unicode_utils'
+require 'cgi'
 
 module Mongoid::FullTextSearch
   extend ActiveSupport::Concern
 
   included do
@@ -178,55 +179,60 @@
       else
         results.map { |result| instantiate_mapreduce_result(result) }.compact
       end
     end
 
-    # returns an [ngram, score] [ngram, position] pair
     def all_ngrams(str, config, bound_number_returned = true)
-      return {} if str.nil? or str.length < config[:ngram_width]
+      return {} if str.nil?
 
-      filtered_str = String.new(str)
       if config[:remove_accents]
-        if str.encoding.name == "ASCII-8BIT"
-          filtered_str = CGI.unescape(filtered_str)
-        end
-        filtered_str = UnicodeUtils.nfkd(filtered_str).gsub(/[^\x00-\x7F]/,'')
+        str = UnicodeUtils.nfkd(CGI.unescape(str)).gsub(/[^\x00-\x7F]/,'')
       end
 
-      filtered_str = filtered_str.mb_chars.downcase.to_s.split('').map{ |ch| config[:alphabet][ch] }.compact.join('')
+      # Remove any characters that aren't in the alphabet
+      filtered_str = str.mb_chars.to_s.downcase.split('').find_all{ |ch| config[:alphabet][ch] }.join('')
       
+      # Figure out how many ngrams to extract from the string. If we can't afford to extract all ngrams,
+      # step over the string in evenly spaced strides to extract ngrams. For example, to extract 3 3-letter
+      # ngrams from 'abcdefghijk', we'd want to extract 'abc', 'efg', and 'ijk'.
       if bound_number_returned
         step_size = [((filtered_str.length - config[:ngram_width]).to_f / config[:max_ngrams_to_search]).ceil, 1].max
       else
         step_size = 1
       end
       
-      # Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the input string
-      ngram_ary = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
+      # Create an array of records of the form {:ngram => x, :score => y} for all ngrams that occur in the 
+      # input string using the step size that we just computed. Let score(x,y) be the score of string x
+      # compared with string y - assigning scores to ngrams with the square root-based scoring function
+      # below and multiplying scores of matching ngrams together yields a score function that has the
+      # property that score(x,y) > score(x,z) for any string z containing y and score(x,y) > score(x,z)
+      # for any string z contained in y.
+      ngram_array = (0..filtered_str.length - config[:ngram_width]).step(step_size).map do |i|
         if i == 0 or (config[:apply_prefix_scoring_to_all_words] and \
                       config[:word_separators].has_key?(filtered_str[i-1].chr))
           score = Math.sqrt(1 + 1.0/filtered_str.length)
         else
           score = Math.sqrt(2.0/filtered_str.length)
         end
         {:ngram => filtered_str[i..i+config[:ngram_width]-1], :score => score}
       end
 
       # If an ngram appears multiple times in the query string, keep the max score
-      ngram_ary = ngram_ary.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
+      ngram_array = ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| {:ngram => key, :score => values.map{ |v| v[:score] }.max} }
       
+      # Add records to the array of ngrams for each full word in the string that isn't a stop word
       if (config[:index_full_words])
         full_words_seen = {}
         filtered_str.split(Regexp.compile(config[:word_separators].keys.join)).each do |word|
-          if word.length >= config[:ngram_width] and full_words_seen[word].nil? and config[:stop_words][word].nil?
-            ngram_ary << {:ngram => word, :score => 1}
+          if word.length > 1 and full_words_seen[word].nil? and config[:stop_words][word].nil?
+            ngram_array << {:ngram => word, :score => 1}
             full_words_seen[word] = true
           end
         end
       end
 
       # If an ngram appears as a full word and an ngram, keep the sum of the two scores
-      Hash[ngram_ary.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
+      Hash[ngram_array.group_by{ |h| h[:ngram] }.map{ |key, values| [key, values.map{ |v| v[:score] }.sum] }]
     end
     
     def remove_from_ngram_index
       self.mongoid_fulltext_config.each_pair do |index_name, fulltext_config|
         coll = collection.db.collection(index_name)