## # Create a fasta file with nucleotide sequences for each cluster larger than a cutoff # module Lederhosen class CLI desc "split", "create fasta files containing reads from each cluster" method_option :clusters, :type => :string, :required => true method_option :reads, :type => :string, :required => true method_option :out_dir, :type => :string, :required => true method_option :buffer_size, :type => :numeric, :default => 1000 method_option :min_clst_size, :type => :numeric, :default => 1 def split clusters = options[:clusters] reads = options[:reads] out_dir = options[:out_dir] buffer_size = options[:buffer_size] min_clst_size = options[:min_clst_size] finalize_every = 100_000 `mkdir -p #{out_dir}/` ohai "loading #{clusters}" # Load read id -> cluster read_to_clusterid = Hash.new # keep track of cluster sizes cluster_counts = Hash.new { |h, k| h[k] = 0} File.open(clusters)do |handle| handle.each do |line| line = line.strip.split cluster_nr = line[1] if line[0] == 'S' || line[0] == 'H' read = line[8] else next end read_to_clusterid[read] = cluster_nr cluster_counts[cluster_nr] += 1 end end read_to_clusterid.delete_if do |read, cluster_nr| cluster_counts[cluster_nr] < min_clst_size end total_reads = read_to_clusterid.length total_clusters = read_to_clusterid.values.uniq.length ohai "#{total_reads} reads in #{total_clusters} clusters" pbar = ProgressBar.new "saving", total_reads # Write reads to individual fasta files using Buffer buffer = Buffer.new :buffer_max => buffer_size File.open(reads) do |handle| records = Dna.new handle records.each_with_index do |record, i| cluster_id = read_to_clusterid[record.name] if cluster_id pbar.inc filename = File.join(out_dir, cluster_id + '.fasta') buffer[filename] << record buffer.finalize if (i%finalize_every == 0) end end end pbar.finish ohai "finalizing output" buffer.finalize # finish writing out puts "done" end end end