module ActsAsIndexed #:nodoc: class SearchIndex # fields:: Fields or instance methods of ActiveRecord model to be indexed. # config:: ActsAsIndexed::Configuration instance. def initialize(fields, config) @storage = Storage.new(config) @fields = fields @atoms = ActiveSupport::OrderedHash.new @min_word_size = config.min_word_size @records_size = @storage.record_count @case_sensitive = config.case_sensitive @if_proc = config.if_proc end # Adds +record+ to the index. def add_record(record) return unless allow_indexing?(record) condensed_record = condense_record(record) atoms = add_occurences(condensed_record, record.id) @storage.add(atoms) end # Adds multiple records to the index. Accepts an array of +records+. def add_records(records) atoms = ActiveSupport::OrderedHash.new records_count = 0 records.each do |record| next unless allow_indexing?(record) records_count += 1 condensed_record = condense_record(record) atoms = add_occurences(condensed_record, record.id, atoms) end @storage.add(atoms, records_count) end # Removes +record+ from the index. def remove_record(record) condensed_record = condense_record(record) atoms = add_occurences(condensed_record,record.id) @storage.remove(atoms) end def update_record(record_new, record_old) if !record_unchanged?(record_new, record_old) remove_record(record_old) add_record(record_new) # Always try to remove the record if it is non-indexable, in case proc # makes use of any methods or attributes exteral of the record. elsif !allow_indexing?(record_new) remove_record(record_old) end end # Returns an array of IDs for records matching +query+. def search(query) return [] if query.nil? @atoms = @storage.fetch(cleanup_atoms(query), query[/\^/]) queries = parse_query(query.dup) positive = run_queries(queries[:positive]) positive_quoted = run_quoted_queries(queries[:positive_quoted]) negative = run_queries(queries[:negative]) negative_quoted = run_quoted_queries(queries[:negative_quoted]) starts_with = run_queries(queries[:starts_with], true) start_quoted = run_quoted_queries(queries[:start_quoted], true) results = ActiveSupport::OrderedHash.new if queries[:start_quoted].any? results = merge_query_results(results, start_quoted) end if queries[:starts_with].any? results = merge_query_results(results, starts_with) end if queries[:positive_quoted].any? results = merge_query_results(results, positive_quoted) end if queries[:positive].any? results = merge_query_results(results, positive) end negative_results = (negative.keys + negative_quoted.keys) results.delete_if { |r_id, w| negative_results.include?(r_id) } results end private # The record is unchanged for our purposes if all the fields are the same # and the if_proc returns the same result for both. def record_unchanged?(record_new, record_old) # NOTE: Using the dirty state would be great here, but it doesn't keep track of # in-place changes. allow_indexing?(record_old) == allow_indexing?(record_new) && !@fields.map { |field| record_old.send(field) == record_new.send(field) }.include?(false) end def allow_indexing?(record) @if_proc.call(record) end def merge_query_results(results1, results2) # Return the other if one is empty. return results1 if results2.empty? return results2 if results1.empty? # Delete any records from results 1 that are not in results 2. r1 = results1.delete_if{ |r_id,w| !results2.include?(r_id) } # Delete any records from results 2 that are not in results 1. r2 = results2.delete_if{ |r_id,w| !results1.include?(r_id) } # Merge the results by adding their respective scores. r1.merge(r2) { |r_id,old_val,new_val| old_val + new_val} end def add_occurences(condensed_record, record_id, atoms=ActiveSupport::OrderedHash.new) condensed_record.each_with_index do |atom_name, i| atoms[atom_name] = SearchAtom.new unless atoms.include?(atom_name) atoms[atom_name].add_position(record_id, i) end atoms end def parse_query(s) # Find ^"foo bar". start_quoted = [] while st_quoted = s.slice!(/\^\"[^\"]*\"/) start_quoted << cleanup_atoms(st_quoted) end # Find -"foo bar". negative_quoted = [] while neg_quoted = s.slice!(/-\"[^\"]*\"/) negative_quoted << cleanup_atoms(neg_quoted) end # Find "foo bar". positive_quoted = [] while pos_quoted = s.slice!(/\"[^\"]*\"/) positive_quoted << cleanup_atoms(pos_quoted) end # Find ^foo. starts_with = [] while st_with = s.slice!(/\^[\S]*/) starts_with << cleanup_atoms(st_with).first end # Find -foo. # Ignores instances where a dash is used as a hyphen. negative = [] s.gsub!(/^(.*\s)?-(\S*)/) do |match| negative << cleanup_atoms($2).first $1 end # Find +foo positive = [] while pos = s.slice!(/\+[\S]*/) positive << cleanup_atoms(pos).first end # Find all other terms. positive += cleanup_atoms(s,true) { :start_quoted => start_quoted, :negative_quoted => negative_quoted, :positive_quoted => positive_quoted, :starts_with => starts_with, :negative => negative, :positive => positive } end def run_queries(atoms, starts_with=false) results = ActiveSupport::OrderedHash.new atoms.each do |atom| interim_results = ActiveSupport::OrderedHash.new # If these atoms are to be run as 'starts with', make them a Regexp # with a carat. atom = /^#{atom}/ if starts_with # Get the resulting matches, and break if none exist. matches = get_atom_results(@atoms.keys, atom) break if matches.nil? # Grab the record IDs and weightings. interim_results = matches.weightings(@records_size) # Merge them with the results obtained already, if any. results = results.empty? ? interim_results : merge_query_results(results, interim_results) break if results.empty? end results end def run_quoted_queries(quoted_atoms, starts_with=false) results = ActiveSupport::OrderedHash.new quoted_atoms.each do |quoted_atom| interim_results = ActiveSupport::OrderedHash.new break if quoted_atom.empty? # If these atoms are to be run as 'starts with', make the final atom a # Regexp with a line-start anchor. quoted_atom[-1] = /^#{quoted_atom.last}/ if starts_with # Little bit of memoization. atoms_keys = @atoms.keys # Get the matches for the first atom. matches = get_atom_results(atoms_keys, quoted_atom.first) break if matches.nil? # Check the index contains all the required atoms. # for each of the others # return atom containing records + positions where current atom is preceded by following atom. # end # Return records from final atom. quoted_atom[1..-1].each do |atom_name| interim_matches = get_atom_results(atoms_keys, atom_name) if interim_matches.nil? matches = nil break end matches = interim_matches.preceded_by(matches) end break if matches.nil? # Grab the record IDs and weightings. interim_results = matches.weightings(@records_size) # Merge them with the results obtained already, if any. results = results.empty? ? interim_results : merge_query_results(results, interim_results) break if results.empty? end results end def get_atom_results(atoms_keys, atom) if atom.is_a? Regexp matching_keys = atoms_keys.grep(atom) results = SearchAtom.new matching_keys.each do |key| results += @atoms[key] end results else @atoms[atom] end end def cleanup_atoms(s, limit_size=false) pre_tokenized = PreTokenizer.process(s) tokenized = Tokenizer.process(pre_tokenized) TokenNormalizer.process(tokenized, :normalize_case => !@case_sensitive, :min_token_length => !limit_size ? @min_token_length : false) end def condense_record(record) atoms = [] @fields.each do |f| if (value = record.send(f)).present? atoms += cleanup_atoms(value.to_s) #U+3000 separates fields so that quoted terms cannot match across #fields atoms << "\u3000" end end atoms end end end