lib/picky/indexers/serial.rb in picky-3.6.16 vs lib/picky/indexers/serial.rb in picky-4.0.0pre1

- old
+ new

@@ -14,36 +14,59 @@
       # and write to an intermediate "prepared index" file.
       #
       # Parameters:
       #  * categories: An enumerable of Category-s.
       #
-      def process categories
-        comma   = ?,
-        newline = ?\n
-
+      def process categories, scheduler = Scheduler.new
         categories.each do |category|
-          tokenizer = category.tokenizer
-          category.prepared_index_file do |file|
+
+          datas = []
           result = []
+          tokenizer = category.tokenizer
-          source.harvest(category) do |indexed_id, text|
-            tokens, _ = tokenizer.tokenize text # Note: Originals not needed.
-            tokens.each do |token_text|
-              next unless token_text
-              result << indexed_id << comma << token_text << newline
-            end
-            file.write(result.join) && result.clear if result.size > 100_000
+          source.harvest(category) do |*data|
+
+            # Accumulate data.
+            #
+            datas << data
+            next if datas.size < 10_000
+
+            # Opening the file inside the scheduler to
+            # have it automagically closed.
+            #
+            index_flush datas, file, result, tokenizer
+
+            datas.clear
+          end
-          yield file
+          index_flush datas, file, result, tokenizer
-          file.write result.join
+          yield file
           end
+        end
+      end
+
+      def index_flush datas, file, cache, tokenizer
+        comma   = ?,
+        newline = ?\n
+
+        datas.each do |indexed_id, text|
+          tokens, _ = tokenizer.tokenize text # Note: Originals not needed.
+          tokens.each do |token_text|
+            next unless token_text
+            cache << indexed_id << comma << token_text << newline
+          end
         end
+        flush file, cache
+      end
+
+      def flush prepared_file, cache
+        prepared_file.write(cache.join) && cache.clear
       end
     end
 end
\ No newline at end of file