# encoding: utf-8

module XapianDb

  # The indexer creates a Xapian::Document from an object. The object must be an instance
  # of a class that has a blueprint configuration.
  # @author Gernot Kogler
  class Indexer

    # Constructor
    # @param [XapianDb::Database] database The database to use (needed to build a spelling index)
    # @param [XapianDb::DocumentBlueprint] document_blueprint The blueprint to use
    def initialize(database, document_blueprint)
      @database, @document_blueprint = database, document_blueprint
    end

    # Build the document for an object. The object must respond to 'xapian_id'.
    # The configured adapter should implement this method.
    # The blueprint used for indexing is looked up by the class name of the object.
    # @param [Object] obj The object to build the document for
    # @return [Xapian::Document] The xapian document (see http://xapian.org/docs/sourcedoc/html/classXapian_1_1Document.html)
    def build_document_for(obj)
      @obj        = obj
      @blueprint  = DocumentBlueprint.blueprint_for(@obj.class.name)
      @xapian_doc = Xapian::Document.new
      @xapian_doc.data = @obj.xapian_id
      store_fields
      index_text
      @xapian_doc
    end

    private

    # Store all configured fields as document values
    def store_fields
      # class name of the object goes to position 0
      @xapian_doc.add_value(0, @obj.class.name)

      # natural sort order goes to position 1; it is either a block that is
      # evaluated in the context of the object or the name of a method to call
      sort_order = @blueprint._natural_sort_order
      sort_value = if sort_order.is_a?(Proc)
        @obj.instance_eval(&sort_order)
      else
        @obj.send(sort_order)
      end
      @xapian_doc.add_value(1, sort_value.to_s)

      @blueprint.attribute_names.each do |attribute|
        block = @blueprint.block_for_attribute(attribute)
        value = block ? @obj.instance_eval(&block) : @obj.send(attribute)

        codec          = XapianDb::TypeCodec.codec_for(@blueprint.type_map[attribute])
        encoded_string = codec.encode(value)
        # a codec result of nil means there is nothing to store for this attribute
        next if encoded_string.nil?

        @xapian_doc.add_value(DocumentBlueprint.value_number_for(attribute), encoded_string)
      end
    end

    # Index all configured text methods
    def index_text
      term_generator          = Xapian::TermGenerator.new
      term_generator.database = @database.writer
      term_generator.document = @xapian_doc

      # stopper and spelling are only configured when a stemmer is configured
      if XapianDb::Config.stemmer
        term_generator.stemmer = XapianDb::Config.stemmer
        term_generator.stopper = XapianDb::Config.stopper if XapianDb::Config.stopper
        # Enable the creation of a spelling dictionary if the database is not in memory
        term_generator.set_flags(Xapian::TermGenerator::FLAG_SPELLING) if @database.is_a?(XapianDb::PersistentDatabase)
      end

      # Index the primary key as a unique term
      @xapian_doc.add_term("Q#{@obj.xapian_id}")

      # Index the class with the field name
      term_generator.index_text("#{@obj.class}".downcase, 1, "XINDEXED_CLASS")
      @xapian_doc.add_term("C#{@obj.class}")

      @blueprint.indexed_method_names.each do |method|
        options       = @blueprint.options_for_indexed_method(method)
        indexed_value = options.block ? @obj.instance_eval(&options.block) : @obj.send(method)
        next if indexed_value.nil?

        get_values_to_index_from(indexed_value).each do |value|
          terms = value.to_s.downcase
          terms = @blueprint.preprocess_terms.call(terms) if @blueprint.preprocess_terms
          terms = split(terms) if XapianDb::Config.term_splitter_count > 0 && !options.no_split

          # Add value with field name
          term_generator.index_text(terms, options.weight, "X#{method.upcase}") if options.prefixed
          # Add value without field name
          term_generator.index_text(terms, options.weight)
        end
      end

      # Drop all terms that are shorter than the configured minimum length; the
      # terms are collected first so the term list is not changed while it is read
      terms_to_ignore = @xapian_doc.terms.select { |term| term.term.length < XapianDb::Config.term_min_length }
      terms_to_ignore.each { |term| @xapian_doc.remove_term(term.term) }
    end

    # Get the values to index from an object
    # @param [Object] obj The object to extract the values from
    # @return [Array] The values to index
    def get_values_to_index_from(obj)
      # if it's an array, we collect the values for its elements recursively
      if obj.is_a?(Array)
        return obj.map { |element| get_values_to_index_from(element) }.flatten.compact
      end

      # if the object responds to attributes and attributes is a hash,
      # we use the attributes values (works well for active_record and datamapper objects)
      if obj.respond_to?(:attributes)
        attributes = obj.attributes
        return attributes.values.compact if attributes.is_a?(Hash)
      end

      # The object is unknown and will be indexed by its to_s method; if to_s returns nil, we
      # will not index it
      obj.to_s.nil? ? [] : [obj]
    end

    # Expand each whitespace separated term into its leading substrings of length 1 up to
    # the configured term splitter count, followed by the term itself.
    # A term that is shorter than the splitter count shows up more than once in the result,
    # since slicing beyond the end of a string returns the whole string.
    # @param [String] terms The whitespace separated terms
    # @return [String] The expanded terms, joined by a single space
    def split(terms)
      prefix_lengths = 1..XapianDb::Config.term_splitter_count
      splitted_terms = terms.split(" ").flat_map do |term|
        prefix_lengths.map { |length| term[0...length] } << term
      end
      splitted_terms.join(" ")
    end

  end

end