lib/picolena/templates/app/models/indexer.rb in picolena-0.1.4 vs lib/picolena/templates/app/models/indexer.rb in picolena-0.1.5

- old
+ new

@@ -3,140 +3,120 @@ @@exclude = /(Thumbs\.db)/ # Number of threads that will be used during indexing process @@max_threads_number = 8 class << self - def fields_for(complete_path) - { - :complete_path => complete_path, - :probably_unique_id => complete_path.base26_hash, - :file => File.basename(complete_path), - :basename => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '), - :filetype => File.extname(complete_path), - :date => File.mtime(complete_path).strftime("%Y%m%d%H%M%S") - } - end - - def index_every_directory(update=true) + def index_every_directory(remove_first=false) + clear! if remove_first + # Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache. + Finder.reload! log :debug => "Indexing every directory" - - start=Time.now - @update = update - reset! unless update - Picolena::IndexedDirectories.each{|dir, alias_dir| index_directory_with_multithreads(dir) } - # FIXME: with those 2 lines, + log :debug => "Now optimizing index" writer.optimize - writer.close - # launching Indexer.index_every_directory twice in a row - # would raise a SEGFAULT: - # picolena/lib/picolena/templates/app/models/indexer.rb:27: [BUG] Segmentation fault - # ruby 1.8.6 (2007-06-07) [i486-linux] - # - # Aborted (core dumped) - # - # But without those 2 lines, specs don't pass anymore. - # log :debug => "Indexing done in #{Time.now-start} s." end def index_directory_with_multithreads(dir) - # FIXME: Don't know why, but if more than one thread is created while update the index, - # indexer raises: - # - # current thread not owner - # /usr/lib/ruby/1.8/monitor.rb:278:in `mon_check_owner' - # /home/www/picolena/lib/picolena/templates/lib/core_exts.rb:32:in `join' - # ... - # - # So Index creation is multithreaded, Index update is monothreaded. - threads_number = @update ? 1 : @@max_threads_number + threads_number = @@max_threads_number log :debug => "Indexing #{dir}, #{threads_number} thread(s)" indexing_list=Dir[File.join(dir,"**/*")].select{|filename| File.file?(filename) && filename !~ @@exclude } indexing_list_chunks=indexing_list.in_transposed_slices(threads_number) + # It initializes an IndexWriter before launching multithreaded + # indexing. Otherwise, two threads could try to instantiate + # an IndexWriter at the same time, and get a + # Ferret::Store::Lock::LockError + writer + indexing_list_chunks.each_with_thread{|chunk| chunk.each{|filename| - add_or_update_file(filename) + add_file(filename) } } end - def add_or_update_file(complete_path) - should_be_added = true - if @update then - log :debug => "What to do with #{complete_path} ?" - occurences = reader.occurences_number(complete_path) - log :debug => "\tappears #{occurences} times in the index" - case occurences - when 0 - #Nothing to do here, the file will be added. - when 1 - d=Document.find_by_complete_path(complete_path) - if File.mtime(complete_path).strftime("%Y%m%d%H%M%S").to_i > d.mtime then - log :debug => "\thas been modified" - delete_file(complete_path) - else - should_be_added = false - log :debug => "\thas not been modified. leaving it" - end - else - delete_file(complete_path) - end - end - add_file(complete_path) if should_be_added - end - def add_file(complete_path) - log :debug => "Adding #{complete_path}" - mime_type=File.mime(complete_path) - fields = fields_for(complete_path) - - begin - text, lang = PlainTextExtractor.extract_content_and_language_from(complete_path) - raise "\tempty document #{complete_path}" if text.strip.empty? - fields[:content] = text - log :debug => "language found: #{lang}" if lang - fields[:lang] = lang + default_fields = Document.default_fields_for(complete_path) + begin + document = PlainTextExtractor.extract_content_and_language_from(complete_path) + raise "empty document #{complete_path}" if document[:content].strip.empty? + document.merge! default_fields + log :debug => ["Added : #{complete_path}",document[:language] ? " (#{document[:language]})" : ""].join rescue => e log :debug => "\tindexing without content: #{e.message}" + document = default_fields end - - writer << fields + writer << document end - def writer - @@writer ||= IndexWriter.new + # Ensures writer is closed, and removes every index file for RAILS_ENV. + def clear!(all=false) + close + to_remove=all ? Picolena::IndexesSavePath : Picolena::IndexSavePath + Dir.glob(File.join(to_remove,'**/*')).each{|f| FileUtils.rm(f) if File.file?(f)} end - def reader - @@reader ||= IndexReader.new + # Closes the writer and + # ensures that a new IndexWriter is instantiated next time writer is called. + def close + @@writer.close rescue nil + # Ferret will SEGFAULT otherwise. + @@writer = nil end - def reset! - log :debug => "Resetting Index" - @@writer=nil - @@reader=nil - IndexWriter.remove + # Only one IndexWriter should be instantiated. + # If one already exists, returns it. + # Creates it otherwise. + def writer + @@writer ||= Ferret::Index::IndexWriter.new(default_index_params) end - def delete_file(complete_path) - log :debug => "\tRemoving from index" - reader.delete_by_complete_path(complete_path) + def index + Ferret::Index::Index.new(default_index_params) end + def ensure_index_existence + index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production" + end + private + def index_exists? + index_filename and File.exists?(index_filename) + end + + def index_filename + Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first + end + def log(hash) hash.each{|level,message| IndexerLogger.send(level,message) } - end + end + + def default_index_params + {:path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer, :field_infos => default_field_infos} + end + + def default_field_infos + returning Ferret::Index::FieldInfos.new do |field_infos| + field_infos.add_field(:complete_path, :store => :yes, :index => :untokenized) + field_infos.add_field(:content, :store => :yes, :index => :yes) + field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5) + field_infos.add_field(:filename, :store => :no, :index => :yes, :boost => 1.5) + field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5) + field_infos.add_field(:modified, :store => :yes, :index => :untokenized) + field_infos.add_field(:probably_unique_id, :store => :no, :index => :yes) + field_infos.add_field(:language, :store => :yes, :index => :yes) + end + end end end \ No newline at end of file