# encoding: utf-8
#
module Indexers

  # Uses a number of categories, a source, and a tokenizer to index data.
  #
  # The tokenizer is taken from each category if specified, or from the index if not.
  #
  class Parallel < Base

    # Process does the actual indexing.
    #
    # Parameters:
    #  * categories: An Enumerable of Category-s.
    #
    def process categories
      comma   = ?,
      newline = ?\n

      # Prepare a combined array of
      # [category, cache, prepared index file, tokenizer] tuples.
      #
      combined = categories.map do |category|
        [category, [], category.prepared_index_file, (category.tokenizer || tokenizer)]
      end

      # Index.
      #
      # TODO Extract into flush_every(100_000) do ... end
      #
      i = 0

      # Explicitly reset the source to avoid caching trouble.
      #
      source.reset if source.respond_to?(:reset)

      # Go through each object in the source.
      #
      source.each do |object|
        id = object.id

        # This needs to be rewritten.
        #
        # Is it a good idea that the tokenizer has no control
        # over when it gets the next text?
        #
        combined.each do |category, cache, _, tokenizer|
          tokenizer.tokenize(object.send(category.from).to_s).each do |token_text|
            next unless token_text
            # Each entry becomes one "id,token" line in the prepared index file.
            #
            cache << id << comma << token_text << newline
          end
        end

        # Flush the caches to disk in batches of 100_000 objects.
        #
        if i >= 100_000
          flush combined
          i = 0
        end
        i += 1
      end
      flush combined

      combined.each do |_, _, file, _|
        timed_exclaim %Q{"#{@index_or_category.identifier}": => #{file.path}.}
        file.close
      end
    end

    # Flush each cache in the combined array into its file.
    #
    def flush combined # :nodoc:
      combined.each do |_, cache, file, _|
        file.write(cache.join) && cache.clear
      end
    end

    def start_indexing_message # :nodoc:
      timed_exclaim %Q{"#{@index_or_category.identifier}": Starting parallel data preparation.}
    end

    def finish_indexing_message # :nodoc:
      timed_exclaim %Q{"#{@index_or_category.identifier}": Finished parallel data preparation.}
    end

  end

end
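
# A minimal usage sketch. Assumptions not confirmed by this file: that
# Indexers::Base#new takes the index or category whose identifier is logged
# above and stores it as @index_or_category, and that `source` and `tokenizer`
# are read from it. The `books_index` name is hypothetical:
#
#   indexer = Indexers::Parallel.new books_index
#   indexer.process books_index.categories
#
# Afterwards each category's prepared index file holds one "id,token" line
# per token, e.g.:
#
#   7,hello
#   7,world
#   8,hello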
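
# A sketch of the flush_every(100_000) extraction hinted at in the TODO above.
# Not part of the library; just one possible shape, assuming it lives on this
# class and reuses `source` and `flush`. The block handles one source object:
#
#   def flush_every amount, combined
#     i = 0
#     source.each do |object|
#       yield object
#       # Flush full caches to disk and restart the counter.
#       if (i += 1) >= amount
#         flush combined
#         i = 0
#       end
#     end
#     flush combined
#   end
#
# `process` would then replace its hand-rolled counter with:
#
#   flush_every 100_000, combined do |object|
#     # tokenize the object and append "id,token" lines to each category's cache
#   end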