# # Dictionary maintenence for text indexes # require 'yaml' require 'rsi/logmanager' module RSI # An occurrence of a term in a document. # [huh. freq and pos_list don't seem to add much to the dict size] # freq is redundant. ==pos_list.length class TermEntry attr_accessor :docid, :freq, :pos_list def initialize( docid ) @docid = docid @freq = 0 @pos_list = [] end def to_s YAML.dump(self) end end class Dictionary include Loggable attr_accessor :terms, :serializer attr_reader :root @@termgroup_loading = 50 META_FILE = "meta.yaml" TERMS_FILE = "terms.list" TERMGROUP_FILE = "termgroup.list" def initialize( root ) @root = root @serializer = NativeSerializer.new() @terms_root = File.join( @root, "terms" ) @terms = {} # term => id @entries = {} # termid => [TermEntry...] @pending_entries = {} # of termid=>1 @termgroups = {} # termid -> termgroupid @meta = { :next_termid => 0, :next_termgroup_id => 0, :next_termgroup_count => 0, } end def open() Dir.mkdir( @root ) unless FileTest.exists?( @root ) logger.info( "Opening dictionary" ) begin self.reload() rescue logger.debug( $! ) logger.info( "DB does not exist (#{$!}), creating..." ) self.create_store() end @opened = true end # Serialize the current state of the dictionary. # (Currently requires time proportional (at least) to the # full size of the dictionary. This is a bug.) 
def store()
      logger.info( "Storing at #{@root}" )
      # meta info, stored as yaml
      @meta[ :serializer ] = @serializer
      File.open( File.join( @root, META_FILE ), "w" ) do |meta_f|
        YAML.dump( @meta, meta_f )
      end
      # store terms
      term_fn = File.join( @root, TERMS_FILE )
      logger.debug( "Storing terms to #{term_fn}" )
      File.open( term_fn, "w" ) do |term_f|
        logger.debug( "terms=#{@terms}" )
        @serializer.dump( @terms, term_f )
      end
      # store the termid -> termgroup id map
      File.open( File.join( @root, TERMGROUP_FILE ), "w" ) do |termgroups_f|
        @serializer.dump( @termgroups, termgroups_f )
      end
      # flush any termgroup records touched since the last store
      store_term_entries()
    end

    # Whether +term+ is already known to this dictionary.
    def has_term?( term )
      return @terms.has_key?( term )
    end

    # Get the termid for the given (tokenized) term.  If create is
    # true and the term has not been previously added to the
    # dictionary, a new id will be allocated and returned.  Unknown
    # terms yield nil when create is false (the default).
    def get_termid_for( term, create=false )
      unless @terms.has_key?( term )
        return nil unless create
        t = self.next_termid()
        @terms[term] = t
      end
      return @terms[term]
    end

    # Record an occurrence of +termid+ in document +docid+ at the given
    # token positions.  freq is kept in sync with pos_list.
    def add_term_entries( docid, termid, pos_list=[0] )
      e = TermEntry.new( docid )
      e.pos_list = pos_list
      e.freq = pos_list.length()
      add_entry( termid, e )
    end

    # Get a list of entries for the given termid.
    # Creates the entry list, if it doesn't already exist.
# Returns a list of TermEntries
    def get_entry_list( termid )
      logger.debug( "[termid #{termid}]" )
      unless @entries.has_key?( termid )
        logger.debug( " No entry[#{termid}]" )
        # assign the term to a termgroup if it has none yet
        unless @termgroups.has_key?( termid )
          logger.debug( " No termgroups[#{termid}]" )
          @termgroups[ termid ] = next_termgroup_id()
        end
        id = @termgroups[ termid ]
        logger.debug( " Termgroup id=#{id}" )
        tg_fn = File.join( @terms_root, "#{id}.tg" )
        logger.debug( " fn=#{tg_fn}" )
        if FileTest.exist?( tg_fn )
          logger.debug( " Reloading termgroup record #{tg_fn}" )
          # block form guarantees the handle is closed even if load raises
          tg = File.open( tg_fn, "r" ) { |tg_f| @serializer.load( tg_f ) }
          tg.each do |tid, term_entries|
            @entries[tid] = term_entries
          end
        end
        unless @entries.has_key?( termid )
          logger.debug( " Creating termgroup record" )
          @entries[termid] = []
        end
      end
      logger.debug( "[returning #{@entries[termid]}]" )
      return @entries[termid]
    end

    protected

    # Create a new storage location.
    def create_store()
      logger.info( "Creating store at #{@root}" )
      Dir.mkdir( @root ) unless FileTest.exist?( @root )
      Dir.mkdir( @terms_root ) unless FileTest.exist?( @terms_root )
    end

    # Load the dictionary from storage.
    def reload()
      logger.info( "Reloading from #{@root}" )
      # meta file is dumped/loaded as yaml, always
      # NOTE(review): @meta embeds the serializer object; under Psych 4+
      # YAML.load is safe-load by default and may refuse to revive it --
      # confirm the target Ruby/Psych version.
      File.open( File.join( @root, META_FILE ), "r" ) do |meta_f|
        @meta = YAML.load( meta_f )
        @serializer = @meta[ :serializer ]
        logger.debug( "Loaded meta from #{META_FILE}" )
      end
      term_fn = File.join( @root, TERMS_FILE )
      File.open( term_fn, "r" ) do |term_f|
        @terms = @serializer.load( term_f )
        logger.debug( "Loaded terms from #{term_fn}" )
      end
      File.open( File.join( @root, TERMGROUP_FILE ), "r" ) do |termgroups_f|
        @termgroups = @serializer.load( termgroups_f )
        logger.debug( "Loaded termgroup map from #{TERMGROUP_FILE}" )
      end
      # entries are loaded lazily... use get_entry_list
    end

    # Return the next sequential document id.
    def next_docid
      # not threadsafe.  Default to 0 so a meta hash that predates the
      # :next_docid key does not crash with nil + 1.
      @meta[:next_docid] ||= 0
      @meta[:next_docid] += 1
      return @meta[:next_docid]
    end

    # Return the next sequential term id.
def next_termid @meta[:next_termid] += 1 return @meta[:next_termid] end def next_termgroup_id # totally not threadsafe @meta[:next_termgroup_count] += 1 if @meta[:next_termgroup_count] > @@termgroup_loading @meta[:next_termgroup_id] += 1 @meta[:next_termgroup_count] = 0 end return @meta[:next_termgroup_id] end def store_term_entries() logger.info( "Storing term entries" ) @pending_entries.each do |tg_id, termids| tg_fn = File.join( @terms_root, "#{tg_id}.tg" ) tg = nil if FileTest.exists?( tg_fn ) File.open( tg_fn, "r" ) {|tg_f| tg = @serializer.load( tg_f )} else tg = {} end termids.each do |termid| tg[ termid ] = @entries[ termid ] # update to internal state end logger.debug( "Writing #{tg_fn}" ) File.open( tg_fn, "w" ) {|f| @serializer.dump( tg, f )} end @pending_entries = {} # clear pending set end # Add an entry for the given termid. def add_entry( termid, entry ) # load entrylist and add entry to it, for internal state get_entry_list( termid ) << entry # track entry for later store() tg_id = @termgroups[ termid ] unless @pending_entries.has_key?( tg_id ) @pending_entries[ tg_id ] = [] end @pending_entries[ tg_id ] << termid end end end