lib/picolena/templates/app/models/indexer.rb in picolena-0.1.6 vs lib/picolena/templates/app/models/indexer.rb in picolena-0.1.7

- old
+ new

@@ -8,10 +8,11 @@ class << self def index_every_directory(remove_first=false) @@do_not_disturb_while_indexing=true clear! if remove_first + @from_scratch = remove_first # Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache. Finder.reload! log :debug => "Indexing every directory" start=Time.now Picolena::IndexedDirectories.each{|dir, alias_dir| @@ -33,17 +34,23 @@ indexing_list_chunks=indexing_list.in_transposed_slices(@@threads_number) prepare_multi_threads_environment indexing_list_chunks.each_with_thread{|chunk| - chunk.each{|filename| - add_file(filename) + chunk.each{|complete_path| + last_itime=index_time_dbm_file[complete_path] + if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then + add_or_update_file(complete_path) + else + log :debug => "Identical : #{complete_path}" + end + index_time_dbm_file[complete_path] = Time.now._dump } } end - def add_file(complete_path) + def add_or_update_file(complete_path) default_fields = Document.default_fields_for(complete_path) begin document = PlainTextExtractor.extract_content_and_language_from(complete_path) raise "empty document #{complete_path}" if document[:content].strip.empty? document.merge! default_fields @@ -67,10 +74,23 @@ def close @@index.close rescue nil # Ferret will SEGFAULT otherwise. @@index = nil end + + + # Checks for indexed files that are missing from the filesystem + # and removes them from index & dbm file. + def prune_index + missing_files=index_time_dbm_file.reject{|filename,itime| File.exists?(filename) && Picolena::IndexedDirectories.any?{|dir,alias_path| filename.starts_with?(dir)}} + missing_files.each{|filename, itime| + index.writer.delete(:complete_path, filename) + index_time_dbm_file.delete(filename) + log :debug => "Removed : #{filename}" + } + index.optimize + end # Only one IndexWriter should be instantiated. # If one index already exists, returns it. # Creates it otherwise. 
def index @@ -79,15 +99,21 @@ def ensure_index_existence index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production" end - def doc_count - index.writer.doc_count + # Returns how many files are indexed. + def size + index.size end private + + # Copied from Ferret book, by David Balmain + def index_time_dbm_file + @@dbm_file ||= DBM.open(File.join(Picolena::IndexSavePath, 'added_at')) + end def index_exists? index_filename and File.exists?(index_filename) end @@ -106,11 +132,11 @@ :path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer, :field_infos => default_field_infos, # Great way to ensure that no file is indexed twice! :key => :probably_unique_id - } + }.merge Picolena::IndexingConfiguration end def default_field_infos returning Ferret::Index::FieldInfos.new do |field_infos| field_infos.add_field(:complete_path, :store => :yes, :index => :untokenized) @@ -118,26 +144,27 @@ field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5) field_infos.add_field(:filename, :store => :no, :index => :yes, :boost => 1.5) field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5) field_infos.add_field(:modified, :store => :yes, :index => :untokenized) field_infos.add_field(:probably_unique_id, :store => :no, :index => :untokenized) - field_infos.add_field(:language, :store => :yes, :index => :yes) + field_infos.add_field(:language, :store => :yes, :index => :untokenized) end end def prepare_multi_threads_environment # It initializes the Index before launching multithreaded # indexing. Otherwise, two threads could try to instantiate # an IndexWriter at the same time, and get a # Ferret::Store::Lock::LockError index - # NOTE: is it really necessary? + # Opens dbm file to dump indexing time. + index_time_dbm_file # ActiveSupport sometimes raises # Expected Object is NOT missing constant # without. Document Finder Query PlainTextExtractor end end -end \ No newline at end of file +end