Sha256: 8b50693e8d4f9b77f6ad2cba31ca2ec8e53689cdb2004ea0faf5930c46b22d78
Contents?: true
Size: 1.91 KB
Versions: 2
Compression:
Stored size: 1.91 KB
Contents
## # uniquify - uniquify a fasta file generating a fasta file of only unique sequences # also output table with sequence_id -> number of reads # module Lederhosen class CLI desc 'uniquify', 'uniquify a fasta file generating a fasta file of only unique sequences.' +\ 'also generate a table with sequence_id -> abundance' method_option :input, :type => :string, :required => true method_option :output, :type => :string, :required => true method_option :table_out, :type => :string, :required => true def uniquify input = options[:input] output = options[:output] table_out = options[:table_out] ohai "uniquifying #{input} to #{output} w/ table #{table_out}" sequence_counts = Hash.new { |h, k| h[k] = 0 } sequence_to_id = Hash.new out = File.open(output, 'w') File.open(input) do |handle| pbar = ProgressBar.new 'loading', File.size(input) Dna.new(handle).each do |record| pbar.inc handle.pos unless sequence_counts.has_key? record.sequence # store the sequence and id so we can have ids in the # table. If the file is sorted by length then this # should also be a seed sequence. sequence_to_id[record.sequence] = record.name out.puts record end sequence_counts[record.sequence] += 1 end pbar.finish end out.close # write table pbar = ProgressBar.new 'table', sequence_counts.size File.open(table_out, 'w') do |out| sequence_counts.each_pair do |sequence, count| pbar.inc id = sequence_to_id[sequence] out.puts "#{id}\t#{count}" end end pbar.finish kept = sequence_counts.keys.size total = sequence_counts.values.inject(:+) ohai "kept #{kept} out of #{total} reads (#{100*kept/total.to_f})" end end end
Version data entries
2 entries across 2 versions & 1 rubygems
Version | Path |
---|---|
lederhosen-0.4.0 | lib/lederhosen/tasks/uniquify.rb |
lederhosen-0.3.9 | lib/lederhosen/tasks/uniquify.rb |