Sha256: 1ef8baea45ab3ca3a1bc4313ec4e24185919df230b161c7c009b4db042e26ac9

Contents?: true

Size: 1.92 KB

Versions: 5

Compression:

Stored size: 1.92 KB

Contents

# encoding: utf-8
#
module Indexers

  # The indexer defines the control flow.
  #
  # A Serial indexer asks its source to harvest (indexed id, text) pairs,
  # runs each text through the tokenizer and writes one line of the form
  # "<indexed id>,<token>\n" per token into the file handed out by the
  # configuration's prepared_index_file.
  #
  class Serial

    # Number of buffered array elements after which the buffer is written
    # out to the file. Each output line contributes four elements
    # (id, comma, token, newline).
    #
    FLUSH_THRESHOLD = 100_000

    attr_accessor :tokenizer, :source

    # configuration:: Object providing prepared_index_file, index and category.
    # source::        Object providing harvest (and optionally key_format). Must not be nil/false.
    # tokenizer::     Object providing tokenize(text), returning an enumerable of tokens.
    #
    # Raises NoSourceSpecifiedException if no source is given.
    #
    def initialize(configuration, source, tokenizer)
      # The configuration is assigned first since raise_no_source interpolates it.
      @configuration = configuration
      @source        = source || raise_no_source
      @tokenizer     = tokenizer
    end

    # Raise a no source exception.
    #
    def raise_no_source
      raise NoSourceSpecifiedException, "No source given for #{@configuration}."
    end

    # Delegates the key format to the source.
    #
    # Default is to_i. The default is used both if the source returns
    # nil/false and if the source does not respond to key_format at all.
    #
    def key_format
      (@source.respond_to?(:key_format) && @source.key_format) || :to_i
    end

    # Announces the indexing run, then processes the source.
    #
    def index
      indexing_message
      process
    end

    # Selects the original id (indexed id) and a column to process. The column data is called "token".
    #
    # Note: Puts together the parts first in an array, then releasing the array from time to time by joining.
    #
    # Returns whatever the configuration's prepared_index_file returns.
    #
    def process
      comma   = ','
      newline = "\n"

      # TODO Move open to config?
      #
      # @category.prepared_index do |file|
      #   source.harvest(@index, @category) do |indexed_id, text|
      #     tokenizer.tokenize(text).each do |token_text|
      #       next unless token_text
      #       file.buffer indexed_id << comma << token_text << newline
      #     end
      #     file.write_maybe
      #   end
      # end
      #
      @configuration.prepared_index_file do |file|
        buffer = []
        source.harvest(@configuration.index, @configuration.category) do |indexed_id, text|
          tokenizer.tokenize(text).each do |token_text|
            # Skip nil/false tokens.
            next unless token_text
            buffer << indexed_id << comma << token_text << newline
          end
          # Flush from time to time to bound memory use. The buffer is cleared
          # regardless of what write returns, so that already written lines
          # are never written a second time.
          if buffer.size > FLUSH_THRESHOLD
            file.write buffer.join
            buffer.clear
          end
        end
        # Write out whatever is left over.
        file.write buffer.join
      end
    end

    # Prints which configuration is being indexed.
    #
    # Note: timed_exclaim is not defined in this class; it is expected to be
    # provided elsewhere.
    #
    def indexing_message
      timed_exclaim "INDEX #{@configuration}" # TODO from ...
    end

  end
end

Version data entries

5 entries across 5 versions & 1 rubygems

Version Path
picky-1.5.2 lib/picky/internals/indexers/serial.rb
picky-1.5.1 lib/picky/internals/indexers/serial.rb
picky-1.5.0 lib/picky/internals/indexers/serial.rb
picky-1.4.3 lib/picky/internals/indexers/serial.rb
picky-1.4.2 lib/picky/internals/indexers/serial.rb