Sha256: fac40140a5145473255a3847750eee0613606038f022173c4ab152a6ab838a4f
Contents?: true
Size: 1.83 KB
Versions: 1
Compression:
Stored size: 1.83 KB
Contents
## # uniquify - uniquify a fasta file, also output table with sequence_id -> number of reads # module Lederhosen class CLI desc 'uniquify', 'uniquify a fasta file and generate a table with sequence_id -> abundance' method_option :input, :type => :string, :required => true method_option :output, :type => :string, :required => true method_option :table_out, :type => :string, :required => true def uniquify input = options[:input] output = options[:output] table_out = options[:table_out] ohai "uniquifying #{input} to #{output} w/ table #{table_out}" sequence_counts = Hash.new { |h, k| h[k] = 0 } sequence_to_id = Hash.new out = File.open(output, 'w') no_records = `grep -c '^>' #{input}`.split.first.to_i pbar = ProgressBar.new 'loading', no_records File.open(input) do |handle| Dna.new(handle).each do |record| pbar.inc unless sequence_counts.has_key? record.sequence # store the sequence and id so we can have ids in the # table. If the file is sorted by length then this # should also be a seed sequence. sequence_to_id[record.sequence] = record.name out.puts record end sequence_counts[record.sequence] += 1 end end pbar.finish out.close # write table pbar = ProgressBar.new 'table', no_records File.open(table_out, 'w') do |out| sequence_counts.each_pair do |sequence, count| pbar.inc id = sequence_to_id[sequence] out.puts "#{id}\t#{count}" end end pbar.finish kept = sequence_counts.keys.size total = sequence_counts.values.inject(:+) ohai "kept #{kept} out of #{total} reads (#{100*kept/total.to_f})" end end end
Version data entries
1 entries across 1 versions & 1 rubygems
Version | Path |
---|---|
lederhosen-0.3.8 | lib/lederhosen/tasks/uniquify.rb |