Sha256: c61fa59d77a6aa5e87a4366751b8b82e64d3fa6698a5bdf08ec25b0e65267599

Contents?: true

Size: 1.29 KB

Versions: 7

Compression:

Stored size: 1.29 KB

Contents

##
# uniquify - uniquify a fasta file, also output table with md5 -> number of reads
#

require 'digest/md5'

module Lederhosen
  class CLI
    desc 'uniquify',
      'uniquify a fasta file and generate a table with md5 -> abundance'

    method_option :input, :type     => :string, :required => true
    method_option :output, :type    => :string, :required => true
    method_option :table_out, :type => :string, :required => true

    ##
    # Collapse duplicate sequences in a file and tabulate their abundance.
    #
    # Reads records from options[:input] via Dna, writes the first record
    # seen for each distinct sequence to options[:output], then writes one
    # "md5,count" line per distinct sequence to options[:table_out], where
    # md5 is the hex MD5 digest of the sequence and count is the number of
    # input records that carried it. Finishes by reporting how many records
    # were kept through ohai (defined elsewhere in the project).
    #
    # Both output files are opened with mode 'w', so existing content is
    # truncated.
    def uniquify
      input     = options[:input]
      output    = options[:output]
      table_out = options[:table_out]

      # sequence => number of records seen with that sequence
      sequence_counts = Hash.new(0)

      # Block form guarantees the output handle is closed even when reading
      # or parsing the input raises part-way through.
      File.open(output, 'w') do |unique_out|
        File.open(input) do |handle|
          Dna.new(handle).each do |record|
            sequence = record.sequence
            # only the first record carrying a given sequence is emitted
            unique_out.puts record unless sequence_counts.key?(sequence)
            sequence_counts[sequence] += 1
          end
        end
      end

      # write table
      File.open(table_out, 'w') do |table|
        sequence_counts.each_pair do |sequence, count|
          digest = Digest::MD5.hexdigest(sequence)
          table.puts "#{digest},#{count}"
        end
      end

      kept  = sequence_counts.size
      # Seed with 0 so an input without records totals 0 instead of nil.
      total = sequence_counts.values.inject(0, :+)
      # Avoid 0 / 0.0 (NaN) when there were no records at all.
      percent = total.zero? ? 0.0 : 100 * kept / total.to_f
      ohai "kept #{kept} out of #{total} reads (#{percent})"
    end
  end
end

Version data entries

7 entries across 7 versions & 1 rubygem

Version Path
lederhosen-0.2.9 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.2.8 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.2.7 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.2.6 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.2.5 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.2.4 lib/lederhosen/tasks/uniquify.rb
lederhosen-0.2.3 lib/lederhosen/tasks/uniquify.rb