require 'set' module Unipept class BatchIterator attr_reader :batch_size def initialize(batch_size) @batch_size = batch_size end # Splits the input lines into slices, based on the batch_size of the current # command. Executes the given block for each of the batches. # # Supports both normal input and input in the fasta format. # # @input [Iterator] lines An iterator containing the input lines # # @input [lambda] block The code to execute on the slices def iterate(lines, &block) first_line = lines.next rescue return if fasta? first_line fasta_iterator(first_line, lines, &block) elsif csv_taxa2tree? first_line csv_taxa_iterator(first_line, lines, &block) else normal_iterator(first_line, lines, &block) end end # Checks if the geven line is a fasta header. # # @param [String] line The input line # # @return [Boolean] Whether te input is a fasta header def fasta?(line) line.start_with? '>' end def csv_taxa2tree?(line) line.include? 'taxon_id' end private # Splits the input lines in fasta format into slices, based on the # batch_size of the current command. Executes the given block for each of # the batches. def fasta_iterator(first_line, next_lines) current_fasta_header = first_line.chomp next_lines.each_slice(batch_size).with_index do |slice, i| fasta_mapper = [] input_set = Set.new slice.each do |line| line = line.chomp if fasta? line current_fasta_header = line else fasta_mapper << [current_fasta_header, line] input_set << line end end yield(input_set.to_a, i, fasta_mapper) end end # Splits the input lines into slices, based on the batch_size of the current # command. Executes the given block for each of the batches. def normal_iterator(first_line, next_lines, &block) Enumerator.new do |y| y << first_line loop do y << next_lines.next end end.each_slice(batch_size).with_index(&block) end def csv_taxa_iterator(first_line, next_lines, &block) # Find index of taxon_id in the first_line and only parse this part from the next lines taxon_idx = first_line.rstrip.split(',').find_index('taxon_id') Enumerator.new do |y| loop do y << next_lines.next.rstrip.split(',')[taxon_idx] end end.each_slice(batch_size).with_index(&block) end end end