Sha256: b20254477ca657cb2f56dcb0ce72623a38e43c26d3f50b1e37af416fcbfd8105

Contents?: true

Size: 1.55 KB

Versions: 1

Compression:

Stored size: 1.55 KB

Contents

require 'set'

module Lederhosen
  class CLI
    desc 'get_reps', 'get representative reads for a uc file'

    method_option :input,    :type => :string, :required => true
    method_option :database, :type => :string, :required => true
    method_option :output,   :type => :string, :required => true

    # Copy the database records referenced by one or more uc files into a
    # new file.
    #
    # Options (all required):
    #   :input    - glob pattern, expanded with Dir[], selecting the uc file(s)
    #   :database - path of the sequence database, read through Dna
    #   :output   - path of the file the matching records are written to
    #
    # Every line of every uc file is handed to parse_usearch_line; the
    # :original value of each parsed header is collected, and every database
    # record whose name is in that collection is written to the output file.
    def get_reps
      inputs      = Dir[options[:input]]
      database    = File.expand_path(options[:database])
      output_path = File.expand_path(options[:output])

      taxa = Set.new

      ohai "getting representative database sequences from #{database} using #{inputs} clusters and saving to #{output_path}"

      # parse uc file, get list of taxa we need to get
      # full sequences for from the database
      #
      # The sum is seeded with 0 so that a glob matching no files yields a
      # total of 0 instead of nil.
      total_bytes = inputs.map { |x| File.size(x) }.inject(0, :+)
      pbar = ProgressBar.new 'reading uc(s)', total_bytes

      inputs.each do |input|
        File.open(input) do |handle|
          handle.each do |line|
            # Advance by the byte length of the line so progress is measured
            # in the same unit as total_bytes (File.size).
            pbar.inc line.bytesize

            header = parse_usearch_line(line.strip)
            # NOTE(review): presumably parse_usearch_line returns nil for
            # lines it does not recognise -- confirm against its definition.
            next if header.nil?

            name = header[:original]
            # a nil name can never match a record, so keep it out of the set
            # (and out of the count reported below)
            taxa << name unless name.nil?
          end
        end
      end

      pbar.finish

      ohai "found #{taxa.size} representative sequences"

      # print representative sequences from database
      kept = 0
      # Block form closes the output file even if reading the database raises.
      File.open(output_path, 'w') do |out|
        File.open(database) do |handle|
          Dna.new(handle).each do |record|
            next unless taxa.include? record.name

            out.puts record
            kept += 1
          end
        end
      end

      ohai "saved #{kept} representatives"

    end
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
lederhosen-1.2.2 lib/lederhosen/tasks/get_reps.rb