require 'rsi/porter'
require 'rsi/logmanager'
require 'yaml'

# mixin Stemmable.stem (from porter.rb) into String
class String
  include Stemmable
end

#
# Classes for building and querying indexes.
#
module RSI
  class IndexException < RuntimeError; end

  # Document index.  Interface for adding documents to the index, and
  # for querying an index.
  class Indexer
    include RSI::Loggable

    # Root directory of the index.
    attr_reader :root
    # Analyzer to use for document and query tokenization.
    attr_accessor :analyzer, :query_analyzer, :serializer, :dicts

    META_FILE = "meta.yaml"
    DOCS_FILE = "docs.list"

    def initialize( root )
      @root = root
      @docs = {}
      @meta = { :next_docid => 0 }
      @serializer = RSI::NativeSerializer.new()
      @analyzer = RSI::DefaultTextAnalyzer.new()
      @query_analyzer = RSI::DefaultTextAnalyzer.new()
      @dicts = {}
      @opened = false
    end

    def open()
      Dir.mkdir( @root ) unless FileTest.exists?( @root )
      log_fh = File.open( File.join( @root, "index.log" ),
                          File::WRONLY|File::APPEND|File::CREAT )
      log_fh.sync = true
      logger.info( "Trying to reload index..." )
      begin
        reload()
      rescue
        logger.info( "Reload failed (#{$!}), creating new index" )
        # nothing to do
      end
      # Query the analyzer, getting the fields it tokenizes.
      # Initialize and open a dictionary for each field.
      logger.info( "Assigning dictionaries..." )
      @analyzer.get_field_types().each do |field, type|
        field_root = File.join( @root, field )
        klass = map_field_type( type )
        logger.debug( "Field: #{field} at #{field_root} is #{klass}" )
        @dicts[field] = klass.new( field_root )
        @dicts[field].serializer = @serializer
      end
      logger.info( "Opening dictionaries" )
      @dicts.each do |name, dict|
        logger.debug( "Dictionary: #{name}" )
        dict.open()
      end
      @opened = true
    end

    # Gets a dictionary class for the given field type.
    def map_field_type( type )
      case type
      when RSI::FIELD_TYPE_TEXT
        return RSI::Dictionary
      when RSI::FIELD_TYPE_DATE
        raise "implement me! XXX"
      end
    end

    # Add a document to the index.
    def add_document( doc_uri, content )
      open() unless @opened
      logger.info("Adding document #{doc_uri}")
      if @docs.has_value?( doc_uri )
        raise IndexException, "Cannot do updates yet"
      else
        docid = next_docid()
        @docs[ docid ] = doc_uri
        pos = 0
        term_entries = {}
        logger.debug("Tokenizing")
        @analyzer.tokenize( content ).each do |field, termlist|
          # Record the position of each term occurrence in the document.
          termlist.each do |term|
            termid = @dicts[field].get_termid_for(term, true)
            raise "POO" if termid == nil
            unless term_entries.has_key?( termid )
              term_entries[termid] = []
            end
            term_entries[termid] << pos
            pos += 1
          end
          logger.debug("Adding term entries to #{field}")
          term_entries.each do |termid, pos_list|
            @dicts[field].add_term_entries(docid, termid, term_entries[termid])
          end
        end
      end
    end

    # Remove a document from the index (slow!).
    def delete_document( doc_uri )
      open() unless @opened
      raise "This is too hard for me, yet"
    end

    # Stop adding documents to the index, and serialize to storage.
    def flush()
      open() unless @opened
      logger.info("Finishing")
      store_metadata()
      store_doclist()
      @dicts.each do |field, dict|
        dict.store()
      end
    end

    # Return a list of document ids which contain any of the given
    # search terms (OR query).  The terms will be tokenized by the
    # current Analyzer.
    def find_any( terms_str )
      open() unless @opened
      raise "unimplemented"
    end

    def get_dict_for_field( field )
      return @dicts[field]
    end

    # Return a list of document URIs which contain all of the given
    # search terms (AND query).  The terms will be tokenized by the
    # current Analyzer.
    #
    def find_all( terms_str )
      q = @query_analyzer.tokenize_query( terms_str )
      logger.debug( "Query=#{q.to_s}" )
      docids = q.evaluate( self )
      docids.uniq!
      return docids.collect {|id| @docs[id]}
    end

    def OLD_find_all( terms_str )
      open() unless @opened
      # this querying logic is too fragile
      logger.info { "Query: #{terms_str}" }
      t_set = @query_analyzer.tokenize_query( terms_str )
      logger.debug { "Tokenized: #{t_set}" }
      # build map of docid => term-match-count
      finds = {}
      t_set.each do |field, term_list|
        term_list.each do |term|
          logger.debug { "field='#{field}', term='#{term}'" }
          # lookup termid in dict for field
          unless @dicts[field].has_term?( term )
            logger.info { "No term #{term} in dictionary #{field}" }
            next
          end
          termid = @dicts[field].get_termid_for( term )
          logger.debug { "termid=#{termid}" }
          # get list of entries for termid
          e_list = @dicts[field].get_entry_list( termid )
          # get list of docids
          e_list.each do |e|
            logger.debug { "  docid=#{e.docid}" }
            finds[ e.docid ] = finds[ e.docid ].to_i + 1
          end
        end
      end
      total_terms = 0
      t_set.each_value {|vl| total_terms += vl.size() }
      logger.debug { "Total terms: #{total_terms}" }
      # foreach docid in map: match if term-match-count == terms-count
      d_return = []
      finds.each do |docid, count|
        if count == total_terms
          # return docid
          uri = @docs[ docid ]
          d_return << uri
        end
      end
      return d_return
    end

    protected

    # needs synchro
    def next_docid()
      @meta[ :next_docid ] += 1
      return @meta[ :next_docid ]
    end

    def reload()
      logger.info("Reloading from #{@root}")
      load_metadata()
      load_doclist()
    end

    def store_metadata()
      @meta[ :serializer ] = @serializer
      @meta[ :analyzer ] = @analyzer.class.name
      @meta[ :query_analyzer ] = @query_analyzer.class.name
      fn = File.join( @root, META_FILE )
      logger.info( "Storing metadata to #{fn}" )
      File.open( fn, "w" ) {|f| YAML.dump( @meta, f ) }
    end

    def load_metadata()
      fn = File.join( @root, META_FILE )
      logger.info( "Loading metadata from #{fn}" )
      File.open( fn, "r" ) {|f| @meta = YAML.load( f ) }
      @serializer = @meta[ :serializer ]
      @analyzer = eval "#{@meta[ :analyzer ]}.new()"
      @query_analyzer = eval "#{@meta[ :query_analyzer ]}.new()"
    end

    def store_doclist()
      fn = File.join( @root, DOCS_FILE )
      logger.info( "Storing doc list to #{fn}" )
      File.open( fn, "w" ) {|f| @serializer.dump( @docs, f ) }
    end

    def load_doclist()
      fn = File.join( @root, DOCS_FILE )
      logger.info( "Loading doc list from #{fn}" )
      File.open( fn, "r" ) {|f| @docs = @serializer.load( f ) }
    end
  end
end
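
# A minimal usage sketch, run only when this file is executed directly.
# It relies on the defaults set in Indexer#initialize (DefaultTextAnalyzer,
# NativeSerializer); the index root "/tmp/rsi_example" and the document
# URIs below are arbitrary illustration values, not part of the library.
if __FILE__ == $0
  indexer = RSI::Indexer.new( "/tmp/rsi_example" )
  # Index two small documents: the first argument is any unique URI
  # string, the second is the text to be tokenized and stemmed.
  indexer.add_document( "doc:1", "the quick brown fox" )
  indexer.add_document( "doc:2", "the fox jumped over the lazy dog" )
  # Serialize dictionaries, the document list and metadata under @root.
  indexer.flush()
  # AND query: prints the URIs of documents containing all query terms.
  puts indexer.find_all( "quick fox" ).inspect
end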