# Indexer is used to index (duh!) documents contained in IndexedDirectories.
# It can create, update, delete and prune the index, and it ensures that only
# one IndexWriter exists at any given time, even when used in a multi-threaded
# way.

require 'indexer_logger'

class Indexer
  # This regexp defines which files should *not* be indexed.
  @@exclude = /(Thumbs\.db)/
  # Number of threads that will be used during the indexing process.
  @@threads_number = 8

  class << self
    # Finds every document included in IndexedDirectories, parses it with
    # PlainTextExtractor and adds it to the index.
    #
    # Updates the index unless the remove_first parameter is set to true, in
    # which case it removes the index first before re-creating it.
    def index_every_directory(remove_first=false)
      clear! if remove_first
      lock!
      @from_scratch = remove_first
      logger.start_indexing
      Picolena::IndexedDirectories.each{|dir, alias_dir|
        index_directory_with_multithreads(dir)
      }
      logger.debug "Now optimizing index"
      index.optimize
      index_time_dbm_file['last'] = Time.now._dump
      unlock!
      logger.show_report
    end

    # Indexes a given directory, using @@threads_number threads.
    # To do so, it retrieves a list of every included document, cuts it into
    # @@threads_number chunks, and creates a new indexing thread for every chunk.
    def index_directory_with_multithreads(dir)
      logger.debug "Indexing #{dir}, #{@@threads_number} threads"

      indexing_list = Dir[File.join(dir,"**/*")].select{|filename|
        File.file?(filename) && filename !~ @@exclude
      }
      indexing_list_chunks = indexing_list.in_transposed_slices(@@threads_number)

      prepare_multi_threads_environment

      indexing_list_chunks.each_with_thread{|chunk|
        chunk.each{|complete_path|
          if should_index_this_document?(complete_path) then
            add_or_update_file(complete_path)
          else
            logger.debug "Identical : #{complete_path}"
          end
          index_time_dbm_file[complete_path] = Time.now._dump
        }
      }
    end

    # Retrieves content and language from a given document, and adds it to the index.
    # Since Document#probably_unique_id is used as index :key, no document will be added
    # twice to the index; the old document will just get updated.
    #
    # If for some reason (no content found or no defined PlainTextExtractor) content
    # cannot be extracted, some basic information about the document (mtime, filename,
    # complete_path) gets indexed anyway.
    def add_or_update_file(complete_path)
      document = Document.default_fields_for(complete_path)
      begin
        document.merge! PlainTextExtractor.extract_content_and_language_from(complete_path)
        raise "empty document #{complete_path}" if document[:content].strip.empty?
        logger.add_document document
      rescue => e
        logger.reject_document document, e
      end
      index << document
    end

    # Ensures index is closed, and removes every index file for RAILS_ENV.
    def clear!(all=false)
      close
      to_remove = all ? Picolena::IndexesSavePath : Picolena::IndexSavePath
      Dir.glob(File.join(to_remove,'**/*')).each{|f| FileUtils.rm(f) if File.file?(f)}
    end

    # Closes the index and ensures that a new Index is instantiated
    # next time index is called.
    def close
      @@index.close rescue nil
      @@index = nil
    end

    # Checks for indexed files that are missing from the filesystem
    # and removes them from the index & dbm file.
    def prune_index
      missing_files = index_time_dbm_file.reject{|filename, itime|
        File.exists?(filename) && Picolena::IndexedDirectories.any?{|dir, alias_path| filename.starts_with?(dir)}
      }
      missing_files.each{|filename, itime|
        index.writer.delete(:complete_path, filename)
        index_time_dbm_file.delete(filename)
        logger.debug "Removed : #{filename}"
      }
      index.optimize
    end

    # Only one IndexWriter should be instantiated.
    # If one index already exists, returns it.
    # Creates it otherwise.
    def index
      @@index ||= Ferret::Index::Index.new(default_index_params)
    end

    # Creates the index unless it already exists
    # or the application is running in production.
    def ensure_index_existence
      index_every_directory(:remove_first) unless index_exists? or RAILS_ENV == "production"
    end

    # Returns how many files are indexed.
    def size
      index.size
    end

    # Returns the time at which the index was last created/updated.
    # Returns "none" if it doesn't exist.
    def last_update
      Time._load(index_time_dbm_file['last']) rescue "none"
    end

    # Returns the time at which the reload file was last touched.
    # Useful to know if other processes have modified the shared index,
    # and if the Indexer should be reloaded.
    def reload_file_mtime
      touch_reload_file! unless File.exists?(reload_file)
      File.mtime(reload_file)
    end

    # For a given document, retrieves the time it was last indexed, compares it to
    # its modification time, and returns false unless the file has been
    # modified since the last indexing process.
    def should_index_this_document?(complete_path)
      last_itime = index_time_dbm_file[complete_path]
      @from_scratch || !last_itime || File.mtime(complete_path) > Time._load(last_itime)
    end

    def locked?
      File.exists?(lock_file)
    end

    private

    def touch_reload_file!
      FileUtils.touch(reload_file)
      # To ensure that every process can touch reload_file, even if Picolena
      # is launched as a special user.
      FileUtils.chmod(0666, reload_file)
    end

    def reload_file
      File.join(Picolena::MetaIndexPath,'reload')
    end

    def lock!
      FileUtils.touch(lock_file)
    end

    def unlock!
      FileUtils.rm(lock_file)
      # Forces Finder.index to be reloaded.
      touch_reload_file!
    end

    def lock_file
      File.join(Picolena::MetaIndexPath,'lock')
    end

    def logger
      @@logger ||= IndexerLogger.new
    end

    # Copied from the Ferret book, by David Balmain.
    def index_time_dbm_file
      @@dbm_file ||= DBM.open(File.join(Picolena::MetaIndexPath, 'added_at'))
    end

    def index_exists?
      index_filename and File.exists?(index_filename)
    end

    def index_filename
      Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
    end

    def default_index_params
      {
        :path        => Picolena::IndexSavePath,
        :analyzer    => Picolena::Analyzer,
        :field_infos => default_field_infos,
        # Great way to ensure that no file is indexed twice!
        :key         => :probably_unique_id
      }.merge Picolena::IndexingConfiguration
    end

    def default_field_infos
      returning Ferret::Index::FieldInfos.new do |field_infos|
        field_infos.add_field(:complete_path,      :store => :yes, :index => :untokenized)
        field_infos.add_field(:content,            :store => :yes, :index => :yes)
        field_infos.add_field(:basename,           :store => :no,  :index => :yes, :boost => 1.5)
        field_infos.add_field(:filename,           :store => :no,  :index => :yes, :boost => 1.5)
        field_infos.add_field(:filetype,           :store => :no,  :index => :yes, :boost => 1.5)
        field_infos.add_field(:modified,           :store => :yes, :index => :untokenized)
        field_infos.add_field(:probably_unique_id, :store => :no,  :index => :untokenized)
        field_infos.add_field(:language,           :store => :yes, :index => :untokenized)
      end
    end

    def prepare_multi_threads_environment
      # Initializes the Index before launching multithreaded
      # indexing. Otherwise, two threads could try to instantiate
      # an IndexWriter at the same time, and get a
      # Ferret::Store::Lock::LockError.
      index
      # Opens dbm file to dump indexing time.
      index_time_dbm_file
      # ActiveSupport sometimes raises
      #   Expected Object is NOT missing constant
      # without the references below.
      Document
      Finder
      Query
      PlainTextExtractor
    end
  end
end
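
# A minimal usage sketch, assuming the surrounding Picolena environment is loaded,
# i.e. that Picolena::IndexedDirectories, Picolena::IndexSavePath and
# Picolena::MetaIndexPath are defined and that Document, Finder, Query,
# PlainTextExtractor and IndexerLogger are available:
#
#   Indexer.ensure_index_existence       # builds the index only if none exists yet
#   Indexer.index_every_directory        # incremental update of the existing index
#   Indexer.index_every_directory(true)  # wipes the index and rebuilds it from scratch
#   Indexer.prune_index                  # drops entries whose files have disappeared
#   Indexer.size                         # => number of indexed documents
#   Indexer.last_update                  # => Time of the last indexing run, or "none"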