lib/mongoid_fulltext.rb in mongoid_fulltext-0.3.6 vs lib/mongoid_fulltext.rb in mongoid_fulltext-0.4.0

- removed (present only in 0.3.6)
+ added (present only in 0.4.0)

@@ -6,11 +6,11 @@
 end
 
 class UnspecifiedIndexError < StandardError; end
 
 module ClassMethods
-    
+
     def fulltext_search_in(*args)
       self.mongoid_fulltext_config = {} if self.mongoid_fulltext_config.nil?
       options = args.last.is_a?(Hash) ? args.pop : {}
       if options.has_key?(:index_name)
         index_name = options[:index_name]
@@ -22,92 +22,143 @@
         :alphabet => 'abcdefghijklmnopqrstuvwxyz0123456789 ',
         :word_separators => ' ',
         :ngram_width => 3,
         :max_ngrams_to_search => 6,
         :apply_prefix_scoring_to_all_words => true,
-        :index_full_words => true
+        :index_full_words => true,
+        :max_candidate_set_size => 1000
       }
       config.update(options)
       args = [:to_s] if args.empty?
       config[:ngram_fields] = args
       config[:alphabet] = Hash[config[:alphabet].split('').map{ |ch| [ch,ch] }]
       config[:word_separators] = Hash[config[:word_separators].split('').map{ |ch| [ch,ch] }]
       self.mongoid_fulltext_config[index_name] = config
-      coll = collection.db.collection(index_name)
-      coll.ensure_index([['ngram', Mongo::ASCENDING]])
-      coll.ensure_index([['document_id', Mongo::ASCENDING]])
+      ensure_indexes(index_name, config)
       before_save :update_ngram_index
       before_destroy :remove_from_ngram_index
     end
+    def ensure_indexes(index_name, config)
+      db = collection.db
+      coll = db.collection(index_name)
+
+      filter_indexes = (config[:filters] || []).map do |key,value|
+        ["filter_values.#{key}", Mongo::ASCENDING]
+      end
+      index_definition = [['ngram', Mongo::ASCENDING], ['score', Mongo::DESCENDING]].concat(filter_indexes)
+
+      # Since the definition of the index could have changed, we'll clean up by
+      # removing any indexes that aren't on the exact keys we're about to define
+      correct_keys = index_definition.map{ |field_def| field_def[0] }
+      all_filter_keys = filter_indexes.map{ |field_def| field_def[0] }
+      coll.index_information.each do |name, definition|
+        keys = definition['key'].keys
+        next if !keys.member?('ngram')
+        all_filter_keys |= keys.find_all{ |key| key.starts_with?('filter_values.') }
+        coll.drop_index(name) if keys & correct_keys != correct_keys
+      end
+
+      if all_filter_keys.length > filter_indexes.length
+        filter_indexes = all_filter_keys.map { |key| [key, Mongo::ASCENDING] }
+        index_definition = [['ngram', Mongo::ASCENDING], ['score', Mongo::DESCENDING]].concat(filter_indexes)
+      end
+
+      coll.ensure_index(index_definition, name: 'fts_index')
+      coll.ensure_index([['document_id', Mongo::ASCENDING]]) # to make removes fast
+    end
+
     def fulltext_search(query_string, options={})
       max_results = options.has_key?(:max_results) ? options.delete(:max_results) : 10
       return_scores = options.has_key?(:return_scores) ? options.delete(:return_scores) : false
       if self.mongoid_fulltext_config.count > 1 and !options.has_key?(:index)
         error_message = '%s is indexed by multiple full-text indexes. You must specify one by passing an :index_name parameter'
         raise UnspecifiedIndexError, error_message % self.name, caller
       end
       index_name = options.has_key?(:index) ? options.delete(:index) : self.mongoid_fulltext_config.keys.first
-
-      # options hash should only contain filters after this point
+
+      # Options hash should only contain filters after this point
+
       ngrams = all_ngrams(query_string, self.mongoid_fulltext_config[index_name])
       return [] if ngrams.empty?
-
-      query = {'ngram' => {'$in' => ngrams.keys}}
-      query.update(Hash[options.map { |key,value| [ 'filter_values.%s' % key, { '$all' => [ value ].flatten } ] }])
-      map = <<-EOS
-        function() {
-          emit(this['document_id'], {'class': this['class'], 'score': this['score']*ngrams[this['ngram']] })
-        }
-      EOS
-      reduce = <<-EOS
-        function(key, values) {
-          score = 0.0
-          for (i in values) {
-            score += values[i]['score']
-          }
-          return({'class': values[0]['class'], 'score': score})
-        }
-      EOS
-      mr_options = {:scope => {:ngrams => ngrams }, :query => query, :raw => true}
-      rc_options = { :return_scores => return_scores }
+
+      # For each ngram, construct the query we'll use to pull index documents and
+      # get a count of the number of index documents containing that n-gram
+      ordering = [['score', Mongo::DESCENDING]]
+      limit = self.mongoid_fulltext_config[index_name][:max_candidate_set_size]
       coll = collection.db.collection(index_name)
-      if collection.db.connection.server_version >= '1.7.4'
-        mr_options[:out] = {:inline => 1}
-        results = coll.map_reduce(map, reduce, mr_options)['results'].sort_by{ |x| -x['value']['score'] }
-        max_results = results.count if max_results.nil?
-        instantiate_mapreduce_results(results.first(max_results), rc_options)
-      else
-        result_collection = coll.map_reduce(map, reduce, mr_options)['result']
-        results = collection.db.collection(result_collection).find.sort(['value.score',-1])
-        results = results.limit(max_results) if !max_results.nil?
-        models = instantiate_mapreduce_results(results, rc_options)
-        collection.db.collection(result_collection).drop
-        models
+      cursors = ngrams.map do |ngram|
+        query = {'ngram' => ngram[0]}
+        query.update(Hash[options.map { |key,value| [ 'filter_values.%s' % key, { '$all' => [ value ].flatten } ] }])
+        count = coll.find(query).count
+        {:ngram => ngram, :count => count, :query => query}
+      end.sort_by!{ |record| record[:count] }
+
+      # Using the queries we just constructed and the n-gram frequency counts we
+      # just computed, pull in about *:max_candidate_set_size* candidates by
+      # considering the n-grams in order of increasing frequency. When we've
+      # spent all *:max_candidate_set_size* candidates, pull the top-scoring
+      # *max_results* candidates for each remaining n-gram.
+      results_so_far = 0
+      candidates_list = cursors.map do |doc|
+        next if doc[:count] == 0
+        query_options = {}
+        if results_so_far >= limit
+          query_options = {:sort => ordering, :limit => max_results}
+        elsif doc[:count] > limit - results_so_far
+          query_options = {:sort => ordering, :limit => limit - results_so_far}
+        end
+        results_so_far += doc[:count]
+        ngram_score = ngrams[doc[:ngram][0]]
+        Hash[coll.find(doc[:query], query_options).map do |candidate|
+          [candidate['document_id'],
+           {clazz: candidate['class'], score: candidate['score'] * ngram_score}]
+        end]
+      end.compact
+
+      # Finally, score all candidates by matching them up with other candidates that are
+      # associated with the same document. This is similar to how you might process a
+      # boolean AND query, except that with an AND query, you'd stop after considering
+      # the first candidate list and matching its candidates up with candidates from other
+      # lists, whereas here we want the search to be a little fuzzier so we'll run through
+      # all candidate lists, removing candidates as we match them up.
+      all_scores = []
+      while !candidates_list.empty?
+        candidates = candidates_list.pop
+        scores = candidates.map do |candidate_id, data|
+          {:id => candidate_id,
+           :clazz => data[:clazz],
+           :score => data[:score] + candidates_list.map{ |others| (others.delete(candidate_id) || {score: 0})[:score] }.sum
+          }
+        end
+        all_scores.concat(scores)
       end
+      all_scores.sort_by!{ |document| -document[:score] }
+
+      instantiate_mapreduce_results(all_scores[0..max_results-1], { :return_scores => return_scores })
     end
 
     def instantiate_mapreduce_result(result)
-      Object::const_get(result['value']['class']).find(:first, :conditions => {:id => result['_id']})
+      result[:clazz].constantize.find(:first, :conditions => {'_id' => result[:id]})
     end
 
     def instantiate_mapreduce_results(results, options)
       if (options[:return_scores])
-        results.map { |result| [ instantiate_mapreduce_result(result), result['value']['score'] ] }.find_all { |result| ! result[0].nil? }
+        results.map { |result| [ instantiate_mapreduce_result(result), result[:score] ] }.find_all { |result| ! result[0].nil? }
       else
-        results.map { |result| instantiate_mapreduce_result(result) }.find_all { |result| ! result.nil? }
+        results.map { |result| instantiate_mapreduce_result(result) }.compact
       end
     end
 
     # returns a hash mapping each ngram to its score
     def all_ngrams(str, config, bound_number_returned = true)
       return {} if str.nil? or str.length < config[:ngram_width]
-      filtered_str = str.downcase.split('').map{ |ch| config[:alphabet][ch] }.find_all{ |ch| !ch.nil? }.join('')
+      filtered_str = str.downcase.split('').map{ |ch| config[:alphabet][ch] }.compact.join('')
       if bound_number_returned
         step_size = [((filtered_str.length - config[:ngram_width]).to_f / config[:max_ngrams_to_search]).ceil, 1].max
       else
        step_size = 1
@@ -139,11 +190,24 @@
         ngram_hash[ngram] = [ngram_hash[ngram] || 0, score].max
       end
       ngram_hash
     end
-    
+
+    def remove_from_ngram_index
+      self.mongoid_fulltext_config.each_pair do |index_name, fulltext_config|
+        coll = collection.db.collection(index_name)
+        coll.remove({'class' => self.name})
+      end
+    end
+
+    def update_ngram_index
+      self.all.each do |model|
+        model.update_ngram_index
+      end
+    end
+
   end
 
   def update_ngram_index
     self.mongoid_fulltext_config.each_pair do |index_name, fulltext_config|
       # remove existing ngrams from external index
@@ -160,10 +224,10 @@
           begin
             [key, value.call(self)]
           rescue
             # Suppress any exceptions caused by filters
           end
-        end.find_all{ |x| !x.nil? }]
+        end.compact]
       end
       # insert new ngrams in external index
       ngrams.each_pair do |ngram, score|
         index_document = {'ngram' => ngram, 'document_id' => self._id, 'score' => score, 'class' => self.class.name}
         index_document['filter_values'] = filter_values if fulltext_config.has_key?(:filters)
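
The headline change in 0.4.0 is that fulltext_search no longer runs a MongoDB map-reduce: it pulls a bounded candidate set per n-gram and scores candidates in Ruby, with the new :max_candidate_set_size option (default 1000) capping how many index documents are considered. A minimal usage sketch against the new API, using a hypothetical Artwork model (the field, index name, and filter are illustrative assumptions, not taken from this diff):

    class Artwork
      include Mongoid::Document
      include Mongoid::FullTextSearch

      field :title

      fulltext_search_in :title,
                         :index_name => 'mongoid_fulltext.artworks',
                         :max_candidate_set_size => 500,  # new in 0.4.0
                         :filters => { :is_public => lambda { |art| art.public? } }  # art.public? is assumed
    end

    Artwork.fulltext_search('monet water lilies', :max_results => 5)
    Artwork.fulltext_search('monet', :return_scores => true)  # => [[artwork, score], ...]
    Artwork.fulltext_search('monet', :is_public => true)      # leftover options are treated as filters

Lowering :max_candidate_set_size trades recall for speed on large indexes; raising it does the reverse.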
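The step-size logic in all_ngrams is what keeps long queries cheap: instead of searching every n-gram in the query string, it samples at most :max_ngrams_to_search of them from the filtered string. A simplified sketch of just that sampling, under the default config and with the scoring logic omitted:

    ngram_width = 3
    max_ngrams_to_search = 6
    alphabet = Hash[('a'..'z').to_a.concat(('0'..'9').to_a).push(' ').map { |ch| [ch, ch] }]

    str = 'Claude Monet!'
    filtered = str.downcase.split('').map { |ch| alphabet[ch] }.compact.join('')
    # => "claude monet"  ('!' is not in the alphabet, so it is dropped)

    step_size = [((filtered.length - ngram_width).to_f / max_ngrams_to_search).ceil, 1].max
    # => 2 here, so only every other trigram is searched
    ngrams = (0..filtered.length - ngram_width).step(step_size).map { |i| filtered[i, ngram_width] }
    # => ["cla", "aud", "de ", " mo", "one"]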
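The final scoring loop is easier to follow with Mongo stripped away. Each element of candidates_list maps a document_id to its partial score for one n-gram; popping a list and draining matching ids from the remaining lists sums each document's contributions exactly once, like a boolean AND made fuzzy. A standalone sketch of the same merge, with made-up data, using reduce instead of ActiveSupport's sum so it runs in plain Ruby:

    candidates_list = [
      { 'doc1' => { :clazz => 'Artwork', :score => 0.5 },
        'doc2' => { :clazz => 'Artwork', :score => 0.4 } },
      { 'doc1' => { :clazz => 'Artwork', :score => 0.7 } }
    ]

    all_scores = []
    until candidates_list.empty?
      candidates = candidates_list.pop
      candidates.each do |id, data|
        # delete(id) pulls this document out of every other list, so each
        # contribution is counted once; lists without it contribute zero.
        rest = candidates_list.map { |others| (others.delete(id) || { :score => 0 })[:score] }.reduce(0, :+)
        all_scores << { :id => id, :clazz => data[:clazz], :score => data[:score] + rest }
      end
    end
    all_scores.sort_by! { |doc| -doc[:score] }
    # => doc1 scores 1.2 (matched both n-grams); doc2 scores 0.4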