exe/key_cols in tree_clusters-0.8.0 vs exe/key_cols in tree_clusters-0.8.1

- old
+ new

@@ -7,11 +7,12 @@ require "parse_fasta" require "fileutils" TreeClusters.extend TreeClusters -GREETING = "The '#{__FILE__}' program" +PROGRAM = "key_cols" +GREETING = "The #{PROGRAM} program" UNDERLINE = "=" * GREETING.length opts = Trollop.options do version TreeClusters::VERSION @@ -44,29 +45,28 @@ # Options: # EOS banner <<-EOS -#{GREETING} -#{UNDERLINE} +The key_cols program +==================== - Hi. My name is #{__FILE__}. If you give me a Newick tree file and - an alignment file (fasta format), I will tell you key columns for - all clades/clusters that have them. + Hi. My name is key_cols. If you give me a Newick tree file and an + alignment file (fasta format), I will tell you key columns for all + clades/clusters that have them. Overview -------- A clade has key columns if you can use the residue/nucleotide at those columns to tell sequences in the clade from sequences outside of the clade. Here's an example.... - After you run me (#{__FILE__} is my name), you'll get an output file - with the extension, '*.tree_clusters.key_cols.txt'. It may look - something like this: + After you run me, you'll get an output file with the extension, + '*.tree_clusters.key_cols.txt'. It may look something like this: cluster_A 4 1-A 2-A 3-A 5-G cluster_B 4 1-C 2-C 3-C 5-A This file has the clade name, the number of key columns for that @@ -91,10 +91,14 @@ -------------- - I ignore columns with gap chars (currently just '-') regardless of column entropy. + - If you're Newick tree has parentheses "(" or ")" in leaf names + you'll get a parsing error even if the name is single quoted. + We'll fix this at some point. + Option info ----------- --entropy-cutoff: A cutoff of 0 means that you allow no variation at any column. @@ -118,20 +122,20 @@ "Consider only clades with at least this many leaves", default: 1) opt(:outdir, "Output directory", - default: ".") + default: "key_cols_output") opt(:base, "Basename for output", - default: "snazzy_clades") + default: "key_cols_output") end abort_if opts[:tree].nil?, - "--tree is a required arg. Try running: #{__FILE__} --help" + "--tree is a required arg. Try running: #{PROGRAM} --help" abort_if opts[:aln].nil?, - "--aln is a required arg. Try running: #{__FILE__} --help" + "--aln is a required arg. Try running: #{PROGRAM} --help" abort_unless_file_exists opts[:tree] abort_unless_file_exists opts[:aln] # TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln] @@ -141,62 +145,78 @@ abort_unless opts[:clade_size_cutoff] >= 1, "--clade-size-cutoff must be >= 1" FileUtils.mkdir_p opts[:outdir] +AbortIf.logger.info { "Parsing input files" } + tree = NewickTree.fromFile opts[:tree] leaf2attrs = TreeClusters.read_alignment opts[:aln] -members_fname = - File.join opts[:outdir], - "#{opts[:base]}.tree_clusters.clade_members.txt" -key_cols_fname = - File.join opts[:outdir], - "#{opts[:base]}.tree_clusters.key_cols.txt" +members_fname = + File.join opts[:outdir], + "#{opts[:base]}.tree_clusters.clade_members.txt" +key_cols_fname = + File.join opts[:outdir], + "#{opts[:base]}.tree_clusters.key_cols.txt" annotated_tree_fname = - File.join opts[:outdir], - "#{opts[:base]}.tree_clusters.annotated_tree.txt" + File.join opts[:outdir], + "#{opts[:base]}.tree_clusters.annotated_tree.txt" -clade_members_f = - File.open(members_fname, "w") -key_cols_f = - File.open(key_cols_fname, "w") -annotated_tree_f = - File.open(annotated_tree_fname, "w") +clade_members_f = + File.open(members_fname, "w") +key_cols_f = + File.open(key_cols_fname, "w") +annotated_tree_f = + File.open(annotated_tree_fname, "w") key_col_sets = {} +clade_sizes = {} +clade_count = TreeClusters.all_clades(tree).count +AbortIf.logger.info { "Processing clades" } begin TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx| - clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}" + if ((idx + 1) % 100).zero? + perc = ((idx + 1) / clade_count.to_f * 100).round 2 + STDERR.printf("Processing clades: #{perc}%\r") + end + + clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}" + clade_sizes[clade_id] = clade.all_leaves.count + clade_members_f.puts [clade_id, clade.all_leaves.count, clade.all_leaves].join "\t" - key_cols_all_leaves = - TreeClusters.low_ent_cols_with_bases clade.all_leaves, - leaf2attrs, - opts[:entropy_cutoff] + key_cols_all_leaves = + TreeClusters.low_ent_cols_with_bases clade.all_leaves, + leaf2attrs, + opts[:entropy_cutoff] unless key_col_sets.has_key? key_cols_all_leaves key_col_sets[key_cols_all_leaves] = Set.new [clade_id] end key_col_sets[key_cols_all_leaves] << clade_id # This will change the node in the original NewickTree clade.node.name = "'#{clade_id}'" - end + AbortIf.logger.info { "Writing results" } + # We only want key column sets that are unique to a single clade. - key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades| + key_col_sets.select {|_, clades| clades.count == 1}.each do |kc_set, clades| clade_id = clades.first - key_cols_f.puts [ - clade_id, - kc_set.count, - kc_set.map { |pos, bases| "#{pos}-#{bases.join}" } - ].join "\t" + + # TODO should we just skip processing clades that are too small rather than just not printing them out? + if clade_sizes[clade_id] > opts[:clade_size_cutoff] + key_cols_f.puts [clade_id, + kc_set.count, + kc_set.map {|pos, bases| "#{pos}-#{bases.join}"} + ].join "\t" + end end annotated_tree_f.puts tree.to_s.sub(/;+$/, ";") ensure clade_members_f.close