Sha256: 8b50693e8d4f9b77f6ad2cba31ca2ec8e53689cdb2004ea0faf5930c46b22d78

Contents?: true

Size: 1.91 KB

Versions: 2

Compression:

Stored size: 1.91 KB

Contents

##
# uniquify - uniquify a fasta file generating a fasta file of only unique sequences
# also output table with sequence_id -> number of reads
#

module Lederhosen
  class CLI
    desc 'uniquify',
      'uniquify a fasta file generating a fasta file of only unique sequences.' +\
      'also generate a table with sequence_id -> abundance'

    method_option :input,     :type => :string, :required => true
    method_option :output,    :type => :string, :required => true
    method_option :table_out, :type => :string, :required => true

    def uniquify
      input     = options[:input]
      output    = options[:output]
      table_out = options[:table_out]

      ohai "uniquifying #{input} to #{output} w/ table #{table_out}"

      sequence_counts = Hash.new { |h, k| h[k] = 0 }
      sequence_to_id = Hash.new

      out = File.open(output, 'w')

      File.open(input) do |handle|
        pbar = ProgressBar.new 'loading', File.size(input)
        Dna.new(handle).each do |record|
          pbar.inc handle.pos
          unless sequence_counts.has_key? record.sequence
            # store the sequence and id so we can have ids in the
            # table. If the file is sorted by length then this
            # should also be a seed sequence.
            sequence_to_id[record.sequence] = record.name
            out.puts record
          end
          sequence_counts[record.sequence] += 1
        end
        pbar.finish
      end

      out.close

      # write table
      pbar = ProgressBar.new 'table', sequence_counts.size
      File.open(table_out, 'w') do |out|
        sequence_counts.each_pair do |sequence, count|
          pbar.inc
          id = sequence_to_id[sequence]
          out.puts "#{id}\t#{count}"
        end
      end
      pbar.finish
      kept = sequence_counts.keys.size
      total = sequence_counts.values.inject(:+)
      ohai "kept #{kept} out of #{total} reads (#{100*kept/total.to_f})"
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
lederhosen-0.4.0 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.3.9 lib/lederhosen/tasks/uniquify.rb