Sha256: fac40140a5145473255a3847750eee0613606038f022173c4ab152a6ab838a4f

Contents?: true

Size: 1.83 KB

Versions: 1

Compression:

Stored size: 1.83 KB

Contents

##
# uniquify - uniquify a fasta file, also output table with sequence_id -> number of reads
#

module Lederhosen
  class CLI
    desc 'uniquify',
      'uniquify a fasta file and generate a table with sequence_id -> abundance'

    method_option :input,     :type => :string, :required => true
    method_option :output,    :type => :string, :required => true
    method_option :table_out, :type => :string, :required => true

    def uniquify
      input     = options[:input]
      output    = options[:output]
      table_out = options[:table_out]

      ohai "uniquifying #{input} to #{output} w/ table #{table_out}"

      sequence_counts = Hash.new { |h, k| h[k] = 0 }
      sequence_to_id = Hash.new

      out = File.open(output, 'w')

      no_records = `grep -c '^>' #{input}`.split.first.to_i
      pbar = ProgressBar.new 'loading', no_records

      File.open(input) do |handle|
        Dna.new(handle).each do |record|
          pbar.inc
          unless sequence_counts.has_key? record.sequence
            # store the sequence and id so we can have ids in the
            # table. If the file is sorted by length then this
            # should also be a seed sequence.
            sequence_to_id[record.sequence] = record.name
            out.puts record
          end
          sequence_counts[record.sequence] += 1
        end
      end

      pbar.finish
      out.close

      # write table
      pbar = ProgressBar.new 'table', no_records
      File.open(table_out, 'w') do |out|
        sequence_counts.each_pair do |sequence, count|
          pbar.inc
          id = sequence_to_id[sequence]
          out.puts "#{id}\t#{count}"
        end
      end
      pbar.finish
      kept = sequence_counts.keys.size
      total = sequence_counts.values.inject(:+)
      ohai "kept #{kept} out of #{total} reads (#{100*kept/total.to_f})"
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
lederhosen-0.3.8 lib/lederhosen/tasks/uniquify.rb