Sha256: 9a800360035e699d49cd5e6c9c667e337f63faaaa8b79134499bd03d2a23a348

Contents?: true

Size: 1.59 KB

Versions: 6

Compression:

Stored size: 1.59 KB

Contents

##
# uniquify - uniquify a fasta file, also output table with md5 -> number of reads
#

require 'digest/md5'

module Lederhosen
  class CLI
    desc 'uniquify',
      'uniquify a fasta file and generate a table with md5 -> abundance'

    method_option :input, :type     => :string, :required => true
    method_option :output, :type    => :string, :required => true
    method_option :table_out, :type => :string, :required => true

    ##
    # Collapse duplicate sequences in a fasta file.
    #
    # Reads every record from options[:input], writes the first record seen
    # for each distinct sequence to options[:output], and writes one
    # "md5,count" line per distinct sequence to options[:table_out], where
    # md5 is the hex MD5 digest of the sequence and count is the number of
    # input records that carried it.
    #
    # All sequence counts are held in memory until the table is written.
    #
    def uniquify
      input     = options[:input]
      output    = options[:output]
      table_out = options[:table_out]

      ohai "uniquifying #{input} to #{output} w/ table #{table_out}"

      sequence_counts = Hash.new(0)

      # Count header lines in Ruby rather than shelling out to grep: the
      # previous command passed the pattern and the file name in the wrong
      # order (so the count was always 0) and interpolated the unescaped path
      # into a shell string.
      no_records = File.foreach(input).count { |line| line.start_with?('>') }

      pbar = ProgressBar.new 'loading', no_records
      # Block form so the output handle is closed even if parsing raises.
      File.open(output, 'w') do |unique_out|
        File.open(input) do |handle|
          Dna.new(handle).each do |record|
            pbar.inc
            sequence = record.sequence
            # Only the first record carrying a given sequence is written out.
            unique_out.puts record unless sequence_counts.has_key?(sequence)
            sequence_counts[sequence] += 1
          end
        end
      end
      pbar.finish

      # write table
      # This loop visits one entry per distinct sequence, so the bar is sized
      # by the number of distinct sequences, not by the number of input records.
      pbar = ProgressBar.new 'table', sequence_counts.size
      File.open(table_out, 'w') do |table|
        sequence_counts.each_pair do |sequence, count|
          pbar.inc
          digest = Digest::MD5.hexdigest(sequence)
          table.puts "#{digest},#{count}"
        end
      end
      pbar.finish

      kept  = sequence_counts.size
      # Seed with 0 so an empty input yields 0 instead of nil.
      total = sequence_counts.values.inject(0, :+)
      # Avoid reporting NaN when the input contained no records.
      percent = total.zero? ? 0.0 : 100 * kept / total.to_f
      ohai "kept #{kept} out of #{total} reads (#{percent})"
    end
  end
end

Version data entries

6 entries across 6 versions & 1 rubygems

Version Path
lederhosen-0.3.1 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.3.0 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.2.13 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.2.12 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.2.11 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.2.10 lib/lederhosen/tasks/uniquify.rb