# ActsAsIndexed # Copyright (c) 2007 - 2010 Douglas F Shearer. # http://douglasfshearer.com # Distributed under the MIT license as included with this plugin. module ActsAsIndexed #:nodoc: class SearchIndex # root:: Location of index on filesystem as a Pathname. # index_depth:: Degree of index partitioning. # fields:: Fields or instance methods of ActiveRecord model to be indexed. # min_word_size:: Smallest query term that will be run through search. # if_proc:: A Proc. If the proc is true, the index gets added, if false if doesn't def initialize(root, index_depth, fields, min_word_size, if_proc=Proc.new{true}) @root = Pathname.new(root.to_s) @fields = fields @index_depth = index_depth @atoms = {} @min_word_size = min_word_size @records_size = exists? ? load_record_size : 0 @if_proc = if_proc end # Adds +record+ to the index. def add_record(record, no_save=false) return unless @if_proc.call(record) condensed_record = condense_record(record) load_atoms(condensed_record) add_occurences(condensed_record,record.id) @records_size += 1 self.save unless no_save end # Adds multiple records to the index. Accepts an array of +records+. def add_records(records) records.each do |record| add_record(record, true) end self.save end # Removes +record+ from the index. def remove_record(record) atoms = condense_record(record) load_atoms(atoms) atoms.each do |a| @atoms[a].remove_record(record.id) if @atoms.has_key?(a) @records_size -= 1 end self.save end def update_record(record_new, record_old) remove_record(record_old) add_record(record_new) end # Saves the current index partitions to the filesystem. def save prepare atoms_sorted = {} @atoms.each do |atom_name, records| (atoms_sorted[encoded_prefix(atom_name)] ||= {})[atom_name] = records end atoms_sorted.each do |e_p, atoms| @root.join(e_p.to_s).open("w+") do |f| Marshal.dump(atoms,f) end end save_record_size end # Deletes the current model's index from the filesystem. #-- # TODO: Write a public method that will delete all indexes. def destroy @root.delete end # Returns an array of IDs for records matching +query+. def search(query) return [] if query.nil? load_options = { :start => true } if query[/\^/] load_atoms(cleanup_atoms(query), load_options || {}) queries = parse_query(query.dup) positive = run_queries(queries[:positive]) positive_quoted = run_quoted_queries(queries[:positive_quoted]) negative = run_queries(queries[:negative]) negative_quoted = run_quoted_queries(queries[:negative_quoted]) starts_with = run_queries(queries[:starts_with], true) start_quoted = run_quoted_queries(queries[:start_quoted], true) results = {} if queries[:start_quoted].any? results = merge_query_results(results, start_quoted) end if queries[:starts_with].any? results = merge_query_results(results, starts_with) end if queries[:positive_quoted].any? results = merge_query_results(results, positive_quoted) end if queries[:positive].any? results = merge_query_results(results, positive) end negative_results = (negative.keys + negative_quoted.keys) results.delete_if { |r_id, w| negative_results.include?(r_id) } results end def merge_query_results(results1, results2) # Return the other if one is empty. return results1 if results2.empty? return results2 if results1.empty? # Delete any records from results 1 that are not in results 2. r1 = results1.delete_if{ |r_id,w| results2.exclude?(r_id) } # Delete any records from results 2 that are not in results 1. r2 = results2.delete_if{ |r_id,w| results1.exclude?(r_id) } # Merge the results by adding their respective scores. r1.merge(r2) { |r_id,old_val,new_val| old_val + new_val} end # Returns true if the index root exists on the FS. #-- # TODO: Make a private method called 'root_exists?' which checks for the root directory. def exists? @root.join('size').exist? end private # Gets the size file from the index. def load_record_size @root.join('size').open do |f| Marshal.load(f) end end # Saves the size to the size file. def save_record_size @root.join('size').open('w+') do |f| Marshal.dump(@records_size,f) end end # Returns true if the given atom is present. def include_atom?(atom) if atom.is_a? Regexp @atoms.keys.grep(atom).any? else @atoms.has_key?(atom) end end # Returns true if all the given atoms are present. def include_atoms?(atoms_arr) atoms_arr.each do |a| return false unless include_atom?(a) end true end # Returns true if the given record is present. def include_record?(record_id) @atoms.each do |atomname, atom| return true if atom.include_record?(record_id) end end def add_atom(atom) @atoms[atom] = SearchAtom.new unless include_atom?(atom) end def add_occurences(condensed_record,record_id) condensed_record.each_with_index do |atom, i| add_atom(atom) @atoms[atom].add_position(record_id, i) end end def encoded_prefix(atom) prefix = atom[0,@index_depth] unless (@prefix_cache ||= {}).has_key?(prefix) if atom.length > 1 @prefix_cache[prefix] = prefix.split(//).map{|c| encode_character(c)}.join('_') else @prefix_cache[prefix] = encode_character(atom) end end @prefix_cache[prefix] end # Allows compatibility with 1.8.6 which has no ord method. def encode_character(char) if @@has_ord ||= char.respond_to?(:ord) char.ord.to_s else char[0] end end def parse_query(s) # Find ^"foo bar". start_quoted = [] while st_quoted = s.slice!(/\^\"[^\"]*\"/) start_quoted << cleanup_atoms(st_quoted) end # Find -"foo bar". negative_quoted = [] while neg_quoted = s.slice!(/-\"[^\"]*\"/) negative_quoted << cleanup_atoms(neg_quoted) end # Find "foo bar". positive_quoted = [] while pos_quoted = s.slice!(/\"[^\"]*\"/) positive_quoted << cleanup_atoms(pos_quoted) end # Find ^foo. starts_with = [] while st_with = s.slice!(/\^[\S]*/) starts_with << cleanup_atoms(st_with).first end # Find -foo. negative = [] while neg = s.slice!(/-[\S]*/) negative << cleanup_atoms(neg).first end # Find +foo positive = [] while pos = s.slice!(/\+[\S]*/) positive << cleanup_atoms(pos).first end # Find all other terms. positive += cleanup_atoms(s,true) { :start_quoted => start_quoted, :negative_quoted => negative_quoted, :positive_quoted => positive_quoted, :starts_with => starts_with, :negative => negative, :positive => positive } end def run_queries(atoms, starts_with=false) results = {} atoms.each do |atom| interim_results = {} # If these atoms are to be run as 'starts with', make them a Regexp # with a carat. atom = /^#{atom}/ if starts_with # Get the resulting matches, and break if none exist. matches = get_atom_results(@atoms.keys, atom) break if matches.nil? # Grab the record IDs and weightings. interim_results = matches.weightings(@records_size) # Merge them with the results obtained already, if any. results = results.empty? ? interim_results : merge_query_results(results, interim_results) break if results.empty? end results end def run_quoted_queries(quoted_atoms, starts_with=false) results = {} quoted_atoms.each do |quoted_atom| interim_results = {} break if quoted_atom.empty? # If these atoms are to be run as 'starts with', make the final atom a # Regexp with a line-start anchor. quoted_atom[-1] = /^#{quoted_atom.last}/ if starts_with # Little bit of memoization. atoms_keys = @atoms.keys # Get the matches for the first atom. matches = get_atom_results(atoms_keys, quoted_atom.first) break if matches.nil? # Check the index contains all the required atoms. # for each of the others # return atom containing records + positions where current atom is preceded by following atom. # end # Return records from final atom. quoted_atom[1..-1].each do |atom_name| interim_matches = get_atom_results(atoms_keys, atom_name) if interim_matches.nil? matches = nil break end matches = interim_matches.preceded_by(matches) end break if matches.nil? # Grab the record IDs and weightings. interim_results = matches.weightings(@records_size) # Merge them with the results obtained already, if any. results = results.empty? ? interim_results : merge_query_results(results, interim_results) break if results.empty? end results end def get_atom_results(atoms_keys, atom) if atom.is_a? Regexp matching_keys = atoms_keys.grep(atom) results = SearchAtom.new matching_keys.each do |key| results += @atoms[key] end results else @atoms[atom] end end def load_atoms(atoms, options={}) # Remove duplicate atoms. # Remove atoms already in index. # Calculate prefixes. # Remove duplicate prefixes. atoms.uniq.reject{|a| include_atom?(a)}.collect{|a| encoded_prefix(a)}.uniq.each do |name| pattern = @root.join(name.to_s).to_s pattern += '*' if options[:start] Pathname.glob(pattern).each do |atom_file| atom_file.open do |f| @atoms.merge!(Marshal.load(f)) end end end end def prepare # Makes the RAILS_ROOT/index/ENVIRONMENT/CLASS directories @root.mkpath end def cleanup_atoms(s, limit_size=false, min_size = @min_word_size || 3) atoms = s.downcase.gsub(/\W/,' ').squeeze(' ').split return atoms unless limit_size atoms.reject{|w| w.size < min_size} end def condense_record(record) condensed = [] @fields.each do |f| if (value = record.send(f)).present? condensed << value.to_s end end cleanup_atoms(condensed.join(' ')) end end end