lib/picolena/templates/app/models/indexer.rb in picolena-0.1.6 vs lib/picolena/templates/app/models/indexer.rb in picolena-0.1.7
- old
+ new
@@ -8,10 +8,11 @@
class << self
def index_every_directory(remove_first=false)
@@do_not_disturb_while_indexing=true
clear! if remove_first
+ @from_scratch = remove_first
# Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache.
Finder.reload!
log :debug => "Indexing every directory"
start=Time.now
Picolena::IndexedDirectories.each{|dir, alias_dir|
@@ -33,17 +34,23 @@
indexing_list_chunks=indexing_list.in_transposed_slices(@@threads_number)
prepare_multi_threads_environment
indexing_list_chunks.each_with_thread{|chunk|
- chunk.each{|filename|
- add_file(filename)
+ chunk.each{|complete_path|
+ last_itime=index_time_dbm_file[complete_path]
+ if @from_scratch || !last_itime || File.mtime(complete_path)> Time._load(last_itime) then
+ add_or_update_file(complete_path)
+ else
+ log :debug => "Identical : #{complete_path}"
+ end
+ index_time_dbm_file[complete_path] = Time.now._dump
}
}
end
- def add_file(complete_path)
+ def add_or_update_file(complete_path)
default_fields = Document.default_fields_for(complete_path)
begin
document = PlainTextExtractor.extract_content_and_language_from(complete_path)
raise "empty document #{complete_path}" if document[:content].strip.empty?
document.merge! default_fields
@@ -67,10 +74,23 @@
# Closes @@index on a best-effort basis and then drops the reference to it.
def close
  begin
    @@index.close
  rescue StandardError
    # Any error raised while closing is deliberately ignored.
  end
  # Resetting the reference is required: Ferret will SEGFAULT otherwise.
  @@index = nil
end
+
+
+ # Checks for indexed files that are missing from filesytem
+ # and removes them from index & dbm file.
+ def prune_index
+ missing_files=index_time_dbm_file.reject{|filename,itime| File.exists?(filename) && Picolena::IndexedDirectories.any?{|dir,alias_path| filename.starts_with?(dir)}}
+ missing_files.each{|filename, itime|
+ index.writer.delete(:complete_path, filename)
+ index_time_dbm_file.delete(filename)
+ log :debug => "Removed : #{filename}"
+ }
+ index.optimize
+ end
# Only one IndexWriter should be instantiated.
# If one index already exists, returns it.
# Creates it otherwise.
def index
@@ -79,15 +99,21 @@
# Indexes every directory from scratch when no index exists yet.
# Does nothing when an index is already there, or when running in production.
def ensure_index_existence
  return if index_exists? || RAILS_ENV == "production"

  index_every_directory(:remove_first)
end
- def doc_count
- index.writer.doc_count
+ # Returns how many files are indexed, by delegating to index.size.
+ def size
+ index.size
end
private
+
+ # Returns the 'added_at' DBM file under Picolena::IndexSavePath, opened once and cached in @@dbm_file (copied from the Ferret book, by David Balmain).
+ def index_time_dbm_file
+ @@dbm_file ||= DBM.open(File.join(Picolena::IndexSavePath, 'added_at'))
+ end
# Tells whether the path returned by index_filename exists on disk.
# Returns a falsy value when index_filename is nil/false; otherwise returns
# the boolean result of File.exist? for that path.
def index_exists?
  # File.exist? replaces the File.exists? alias, which was deprecated and
  # then removed in Ruby 3.2.
  index_filename && File.exist?(index_filename)
end
@@ -106,11 +132,11 @@
:path => Picolena::IndexSavePath,
:analyzer => Picolena::Analyzer,
:field_infos => default_field_infos,
# Great way to ensure that no file is indexed twice!
:key => :probably_unique_id
- }
+ }.merge Picolena::IndexingConfiguration
end
def default_field_infos
returning Ferret::Index::FieldInfos.new do |field_infos|
field_infos.add_field(:complete_path, :store => :yes, :index => :untokenized)
@@ -118,26 +144,27 @@
field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
field_infos.add_field(:filename, :store => :no, :index => :yes, :boost => 1.5)
field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
field_infos.add_field(:modified, :store => :yes, :index => :untokenized)
field_infos.add_field(:probably_unique_id, :store => :no, :index => :untokenized)
- field_infos.add_field(:language, :store => :yes, :index => :yes)
+ field_infos.add_field(:language, :store => :yes, :index => :untokenized)
end
end
# Touches the index, the dbm file and a few constants up front, before the
# indexing threads are launched.
def prepare_multi_threads_environment
# It initializes the Index before launching multithreaded
# indexing. Otherwise, two threads could try to instantiate
# an IndexWriter at the same time, and get a
# Ferret::Store::Lock::LockError
index
- # NOTE: is it really necessary?
+ # Opens dbm file to dump indexing time.
+ index_time_dbm_file
# ActiveSupport sometimes raises
# "Expected Object is NOT missing constant"
# unless these constants are referenced here first.
Document
Finder
Query
PlainTextExtractor
end
end
-end
\ No newline at end of file
+end