# ActsAsIndexed # Copyright (c) 2007 - 2010 Douglas F Shearer. # http://douglasfshearer.com # Distributed under the MIT license as included with this plugin. module ActsAsIndexed #:nodoc: class SearchIndex # root:: Location of index on filesystem as a Pathname. # index_depth:: Degree of index partitioning. # fields:: Fields or instance methods of ActiveRecord model to be indexed. # min_word_size:: Smallest query term that will be run through search. # if_proc:: A Proc. If the proc is true, the index gets added, if false if doesn't def initialize(root, index_depth, fields, min_word_size, if_proc=Proc.new{true}) @root = Pathname.new(root.to_s) @fields = fields @index_depth = index_depth @atoms = {} @min_word_size = min_word_size @records_size = exists? ? load_record_size : 0 @if_proc = if_proc end # Adds +record+ to the index. def add_record(record) return @records_size unless @if_proc.call(record) condensed_record = condense_record(record) load_atoms(condensed_record) add_occurences(condensed_record,record.id) @records_size += 1 end # Adds multiple records to the index. Accepts an array of +records+. def add_records(records) records.each do |record| add_record(record) end end # Removes +record+ from the index. def remove_record(record) atoms = condense_record(record) load_atoms(atoms) atoms.each do |a| @atoms[a].remove_record(record.id) if @atoms.has_key?(a) @records_size -= 1 #p "removing #{record.id} from #{a}" end end def update_record(record_new, record_old) # Work out which atoms have modifications. # Minimises loading and saving of partitions. old_atoms = condense_record(record_old) new_atoms = condense_record(record_new) # Remove the old version from the appropriate atoms. load_atoms(old_atoms) old_atoms.each do |a| @atoms[a].remove_record(record_new.id) if @atoms.has_key?(a) end if @if_proc.call(record_new) # Add the new version to the appropriate atoms. load_atoms(new_atoms) # TODO: Make a version of this method that takes the # atomised version of the record. add_occurences(new_atoms, record_new.id) end end # Saves the current index partitions to the filesystem. def save prepare atoms_sorted = {} @atoms.each do |atom_name, records| (atoms_sorted[encoded_prefix(atom_name)] ||= {})[atom_name] = records end atoms_sorted.each do |e_p, atoms| #p "Saving #{e_p}." @root.join(e_p.to_s).open("w+") do |f| Marshal.dump(atoms,f) end end save_record_size end # Deletes the current model's index from the filesystem. #-- # TODO: Write a public method that will delete all indexes. def destroy @root.delete end # Returns an array of IDs for records matching +query+. def search(query) return [] if query.nil? load_atoms(cleanup_atoms(query)) queries = parse_query(query.dup) positive = run_queries(queries[:positive]) positive_quoted = run_quoted_queries(queries[:positive_quoted]) negative = run_queries(queries[:negative]) negative_quoted = run_quoted_queries(queries[:negative_quoted]) if queries[:positive_quoted].any? && queries[:positive].any? p = positive.delete_if{ |r_id,w| positive_quoted.exclude?(r_id) } pq = positive_quoted.delete_if{ |r_id,w| positive.exclude?(r_id) } results = p.merge(pq) { |r_id,old_val,new_val| old_val + new_val} elsif queries[:positive].any? results = positive else results = positive_quoted end negative_results = (negative.keys + negative_quoted.keys) results.delete_if { |r_id, w| negative_results.include?(r_id) } #p results results end # Returns true if the index root exists on the FS. #-- # TODO: Make a private method called 'root_exists?' which checks for the root directory. def exists? @root.join('size').exist? end private # Gets the size file from the index. def load_record_size #p "About to load #{@root.join('size')}" @root.join('size').open do |f| Marshal.load(f) end end # Saves the size to the size file. def save_record_size @root.join('size').open('w+') do |f| Marshal.dump(@records_size,f) end end # Returns true if the given atom is present. def include_atom?(atom) @atoms.has_key?(atom) end # Returns true if all the given atoms are present. def include_atoms?(atoms_arr) atoms_arr.each do |a| return false unless include_atom?(a) end true end # Returns true if the given record is present. def include_record?(record_id) @atoms.each do |atomname, atom| return true if atom.include_record?(record_id) end end def add_atom(atom) @atoms[atom] = SearchAtom.new unless include_atom?(atom) end def add_occurences(condensed_record,record_id) condensed_record.each_with_index do |atom, i| add_atom(atom) @atoms[atom].add_position(record_id, i) #p "adding #{record.id} to #{atom}" end end def encoded_prefix(atom) prefix = atom[0,@index_depth] unless (@prefix_cache ||= {}).has_key?(prefix) if atom.length > 1 @prefix_cache[prefix] = prefix.split(//).map{|c| encode_character(c)}.join('_') else @prefix_cache[prefix] = encode_character(atom) end end @prefix_cache[prefix] end # Allows compatibility with 1.8.6 which has no ord method. def encode_character(char) if @@has_ord ||= char.respond_to?(:ord) char.ord.to_s else char[0] end end def parse_query(s) # Find -"foo bar". negative_quoted = [] while neg_quoted = s.slice!(/-\"[^\"]*\"/) negative_quoted << cleanup_atoms(neg_quoted) end # Find "foo bar". positive_quoted = [] while pos_quoted = s.slice!(/\"[^\"]*\"/) positive_quoted << cleanup_atoms(pos_quoted) end # Find -foo. negative = [] while neg = s.slice!(/-[\S]*/) negative << cleanup_atoms(neg).first end # Find +foo positive = [] while pos = s.slice!(/\+[\S]*/) positive << cleanup_atoms(pos).first end # Find all other terms. positive += cleanup_atoms(s,true) {:negative_quoted => negative_quoted, :positive_quoted => positive_quoted, :negative => negative, :positive => positive} end def run_queries(atoms) results = {} atoms.uniq.each do |atom| interim_results = {} if include_atom?(atom) # Collect all the weightings for the current atom. interim_results = @atoms[atom].weightings(@records_size) end if results.empty? # If first time round, set results with initial weightings. results = interim_results else # If second time round, add weightings together for records # matching both atoms. Any matching only one are discarded. rr = {} interim_results.each do |r,w| rr[r] = w + results[r] if results[r] end results = rr end end #p results results end def run_quoted_queries(quoted_atoms) results = {} quoted_atoms.each do |quoted_atom| interim_results = {} # Check the index contains all the required atoms. # match_atom = first_word_atom # for each of the others # return atom containing records + positions where current atom is preceded by following atom. # end # return records from final atom. next unless include_atoms?(quoted_atom) matches = @atoms[quoted_atom.first] quoted_atom[1..-1].each do |atom_name| matches = @atoms[atom_name].preceded_by(matches) end #results += matches.record_ids interim_results = matches.weightings(@records_size) if results.empty? results = interim_results else rr = {} interim_results.each do |r,w| rr[r] = w + results[r] if results[r] end #p results.class results = rr end end results end def load_atoms(atoms) # Remove duplicate atoms. # Remove atoms already in index. # Calculate prefixes. # Remove duplicate prefixes. atoms.uniq.reject{|a| include_atom?(a)}.collect{|a| encoded_prefix(a)}.uniq.each do |name| if (atom_file = @root.join(name.to_s)).exist? atom_file.open do |f| @atoms.merge!(Marshal.load(f)) end end end end def prepare # Makes the RAILS_ROOT/index/ENVIRONMENT/CLASS directories @root.mkpath end def cleanup_atoms(s, limit_size=false, min_size = @min_word_size || 3) atoms = s.downcase.gsub(/\W/,' ').squeeze(' ').split return atoms unless limit_size atoms.reject{|w| w.size < min_size} end def condense_record(record) condensed = [] @fields.each do |f| if (value = record.send(f)).present? condensed << value.to_s end end cleanup_atoms(condensed.join(' ')) end end end