Sha256: 15564df794511a40ccf8b7f9f9f25513b124d6ee67163262fac5846f9a5c1900

Contents?: true

Size: 1.93 KB

Versions: 1

Compression:

Stored size: 1.93 KB

Contents

# encoding: utf-8
#
module Picky

  module Indexers

    # Uses a number of categories, a source, and a tokenizer to index data.
    #
    # The tokenizer is taken from each category if specified, or from the index if not.
    #
    class Parallel < Base

      # Process does the actual indexing.
      #
      # Parameters:
      #  * categories: An Enumerable of Category-s.
      #
      #  * scheduler:  Accepted as a parameter, but not used by this method.
      #
      # Yields each category's prepared index file once the remaining
      # objects have been flushed to it, just before the file is closed.
      #
      # Every prepared index file that was obtained is closed, even if
      # reading the source, flushing, or the given block raises. The error
      # itself still propagates to the caller.
      #
      # Returns the array of [category, file, cache, tokenizer] entries.
      #
      def process categories, scheduler = Scheduler.new
        combined = []
        unclosed = [] # Files obtained so far that have not been closed yet.

        begin
          # Prepare a combined object - array.
          #
          # The files are also tracked separately so that the ensure clause
          # can close whatever is still open if anything below raises.
          #
          categories.each do |category|
            file = category.prepared_index_file
            unclosed << file
            combined << [category, file, [], (category.tokenizer || tokenizer)]
          end

          # Go through each object in the source.
          #
          objects = []

          reset_source

          source.each do |object|

            # Accumulate objects until a full batch is available.
            #
            objects << object
            next if objects.size < 10_000

            # THINK Is it a good idea that not the tokenizer has
            # control over when he gets the next text?
            #
            combined.each do |category, file, cache, category_tokenizer|
              index_flush objects, file, category, cache, category_tokenizer
            end

            objects.clear

          end

          # Flush the last (partial) batch, hand each file to the caller,
          # then close it.
          #
          combined.each do |category, file, cache, category_tokenizer|
            index_flush objects, file, category, cache, category_tokenizer
            yield file
            # Untrack first, so a failing close is not retried in ensure.
            #
            unclosed.shift
            file.close
          end
        ensure
          # Only non-empty if something above raised.
          #
          unclosed.each { |file| file.close }
        end
      end

      # Tokenizes one attribute of each given object, buffers an
      # "id,token\n" entry per token in the cache, and then flushes the
      # cache to the file.
      #
      # Parameters:
      #  * objects:   The objects to index. Each is sent #id and the message
      #               named by category.from.
      #  * file:      The target handed on to #flush.
      #  * category:  Supplies the attribute name through #from.
      #  * cache:     The buffer the entry fragments are appended to.
      #  * tokenizer: Its #tokenize result is destructured; the first element
      #               is used as the list of tokens.
      #
      # Returns whatever #flush returns.
      #
      def index_flush objects, file, category, cache, tokenizer
        separator  = ','
        terminator = "\n"

        objects.each do |object|
          text = object.send category.from
          tokens, _originals = tokenizer.tokenize text # Only the tokens are used.

          tokens.each do |token|
            # Falsy tokens (nil/false) produce no entry.
            #
            cache << object.id << separator << token << terminator if token
          end
        end

        flush file, cache
      end

      # Writes everything buffered in the cache to the file as a single
      # joined string, then empties the cache.
      #
      # The cache is only cleared when the write call returns a truthy
      # value. In that case the cleared cache is returned, otherwise the
      # falsy result of the write call.
      #
      def flush file, cache
        written = file.write cache.join
        written && cache.clear
      end

    end

  end

end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
picky-4.0.0pre2 lib/picky/indexers/parallel.rb