lib/picky/indexers/serial.rb in picky-3.6.16 vs lib/picky/indexers/serial.rb in picky-4.0.0pre1
- old
+ new
@@ -14,36 +14,59 @@
# and write to an intermediate "prepared index" file.
#
# Parameters:
# * categories: An enumerable of Category-s.
#
- def process categories
- comma = ?,
- newline = ?\n
-
+ def process categories, scheduler = Scheduler.new
categories.each do |category|
- tokenizer = category.tokenizer
-
category.prepared_index_file do |file|
+
+ datas = []
result = []
+ tokenizer = category.tokenizer
- source.harvest(category) do |indexed_id, text|
- tokens, _ = tokenizer.tokenize text # Note: Originals not needed.
- tokens.each do |token_text|
- next unless token_text
- result << indexed_id << comma << token_text << newline
- end
- file.write(result.join) && result.clear if result.size > 100_000
+ source.harvest(category) do |*data|
+
+ # Accumulate harvested data until a batch of 10_000 is collected.
+ #
+ datas << data
+ next if datas.size < 10_000
+
+ # Batch is full: tokenize it and write it to the prepared file.
+ # NOTE(review): `scheduler` is not used in the lines shown here.
+ #
+ index_flush datas, file, result, tokenizer
+
+ datas.clear
+
end
- yield file
+ index_flush datas, file, result, tokenizer
- file.write result.join
+ yield file
end
+ end
+ end
+
# Tokenizes the text of each [indexed_id, text] entry in datas and
# appends indexed_id, a comma, the token and a newline to cache for
# every token that is not nil/false. Afterwards hands file and cache
# to #flush.
#
# Parameters:
# * datas: Entries that destructure into indexed_id and text.
# * file: Passed on to #flush as the write target.
# * cache: Array the output pieces are appended to.
# * tokenizer: Must respond to #tokenize(text); only the first
#   element of its result (the tokens) is used.
#
+ def index_flush datas, file, cache, tokenizer
+ comma = ?,
+ newline = ?\n
+
+ datas.each do |indexed_id, text|
+ tokens, _ = tokenizer.tokenize text # Note: Originals not needed.
+ tokens.each do |token_text|
+ next unless token_text
+ cache << indexed_id << comma << token_text << newline
+ end
end
+ flush file, cache
+ end
+
# Writes the joined contents of cache to prepared_file and, if the
# write call returns a truthy value, empties cache in place.
#
+ def flush prepared_file, cache
+ prepared_file.write(cache.join) && cache.clear
end
end
end
\ No newline at end of file