key_cols in tree_clusters-0.8.1

- old
+ new

@@ -7,11 +7,12 @@
 require "parse_fasta"
 require "fileutils"
 
 TreeClusters.extend TreeClusters
 
-GREETING = "The '#{__FILE__}' program"
+PROGRAM   = "key_cols"
+GREETING  = "The #{PROGRAM} program"
 UNDERLINE = "=" * GREETING.length
 
 opts = Trollop.options do
   version TreeClusters::VERSION
 
@@ -44,29 +45,28 @@
   # Options:
   # EOS
 
   banner <<-EOS
 
-#{GREETING}
-#{UNDERLINE}
+The key_cols program
+====================
 
-  Hi.  My name is #{__FILE__}.  If you give me a Newick tree file and
-  an alignment file (fasta format), I will tell you key columns for
-  all clades/clusters that have them.
+  Hi.  My name is key_cols.  If you give me a Newick tree file and an
+  alignment file (fasta format), I will tell you key columns for all
+  clades/clusters that have them.
 
 Overview
 --------
 
   A clade has key columns if you can use the residue/nucleotide at
   those columns to tell sequences in the clade from sequences outside
   of the clade.
 
   Here's an example....
 
-  After you run me (#{__FILE__} is my name), you'll get an output file
-  with the extension, '*.tree_clusters.key_cols.txt'.  It may look
-  something like this:
+  After you run me, you'll get an output file with the extension,
+  '*.tree_clusters.key_cols.txt'.  It may look something like this:
 
 cluster_A	4	1-A	2-A	3-A	5-G
 cluster_B	4	1-C	2-C	3-C	5-A
 
   This file has the clade name, the number of key columns for that
@@ -91,10 +91,14 @@
 --------------
 
   - I ignore columns with gap chars (currently just '-') regardless of
     column entropy.
 
+  - If you're Newick tree has parentheses "(" or ")" in leaf names
+    you'll get a parsing error even if the name is single quoted.
+    We'll fix this at some point.
+
 Option info
 -----------
 
   --entropy-cutoff: A cutoff of 0 means that you allow no variation at
     any column.
@@ -118,20 +122,20 @@
       "Consider only clades with at least this many leaves",
       default: 1)
 
   opt(:outdir,
       "Output directory",
-      default: ".")
+      default: "key_cols_output")
   opt(:base,
       "Basename for output",
-      default: "snazzy_clades")
+      default: "key_cols_output")
 end
 
 abort_if opts[:tree].nil?,
-         "--tree is a required arg.  Try running: #{__FILE__} --help"
+         "--tree is a required arg.  Try running: #{PROGRAM} --help"
 abort_if opts[:aln].nil?,
-         "--aln is a required arg.  Try running: #{__FILE__} --help"
+         "--aln is a required arg.  Try running: #{PROGRAM} --help"
 
 abort_unless_file_exists opts[:tree]
 abort_unless_file_exists opts[:aln]
 
 # TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]
@@ -141,62 +145,78 @@
 abort_unless opts[:clade_size_cutoff] >= 1,
              "--clade-size-cutoff must be >= 1"
 
 FileUtils.mkdir_p opts[:outdir]
 
+AbortIf.logger.info { "Parsing input files" }
+
 tree       = NewickTree.fromFile opts[:tree]
 leaf2attrs = TreeClusters.read_alignment opts[:aln]
 
-members_fname                     =
-  File.join opts[:outdir],
-            "#{opts[:base]}.tree_clusters.clade_members.txt"
-key_cols_fname                    =
-  File.join opts[:outdir],
-            "#{opts[:base]}.tree_clusters.key_cols.txt"
+members_fname        =
+    File.join opts[:outdir],
+              "#{opts[:base]}.tree_clusters.clade_members.txt"
+key_cols_fname       =
+    File.join opts[:outdir],
+              "#{opts[:base]}.tree_clusters.key_cols.txt"
 annotated_tree_fname =
-  File.join opts[:outdir],
-            "#{opts[:base]}.tree_clusters.annotated_tree.txt"
+    File.join opts[:outdir],
+              "#{opts[:base]}.tree_clusters.annotated_tree.txt"
 
-clade_members_f               =
-  File.open(members_fname, "w")
-key_cols_f                    =
-  File.open(key_cols_fname, "w")
-annotated_tree_f                    =
-  File.open(annotated_tree_fname, "w")
+clade_members_f  =
+    File.open(members_fname, "w")
+key_cols_f       =
+    File.open(key_cols_fname, "w")
+annotated_tree_f =
+    File.open(annotated_tree_fname, "w")
 
 key_col_sets = {}
+clade_sizes  = {}
+clade_count  = TreeClusters.all_clades(tree).count
 
+AbortIf.logger.info { "Processing clades" }
 begin
   TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
-    clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
+    if ((idx + 1) % 100).zero?
+      perc = ((idx + 1) / clade_count.to_f * 100).round 2
 
+      STDERR.printf("Processing clades: #{perc}%\r")
+    end
+
+    clade_id              = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
+    clade_sizes[clade_id] = clade.all_leaves.count
+
     clade_members_f.puts [clade_id,
                           clade.all_leaves.count,
                           clade.all_leaves].join "\t"
 
-    key_cols_all_leaves         =
-      TreeClusters.low_ent_cols_with_bases clade.all_leaves,
-                                           leaf2attrs,
-                                           opts[:entropy_cutoff]
+    key_cols_all_leaves =
+        TreeClusters.low_ent_cols_with_bases clade.all_leaves,
+                                             leaf2attrs,
+                                             opts[:entropy_cutoff]
 
     unless key_col_sets.has_key? key_cols_all_leaves
       key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
     end
     key_col_sets[key_cols_all_leaves] << clade_id
 
     # This will change the node in the original NewickTree
     clade.node.name = "'#{clade_id}'"
-
   end
 
+  AbortIf.logger.info { "Writing results" }
+
   # We only want key column sets that are unique to a single clade.
-  key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades|
+  key_col_sets.select {|_, clades| clades.count == 1}.each do |kc_set, clades|
     clade_id = clades.first
-    key_cols_f.puts [
-      clade_id,
-      kc_set.count,
-      kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
-    ].join "\t"
+
+    # TODO should we just skip processing clades that are too small rather than just not printing them out?
+    if clade_sizes[clade_id] > opts[:clade_size_cutoff]
+      key_cols_f.puts [clade_id,
+                       kc_set.count,
+                       kc_set.map {|pos, bases| "#{pos}-#{bases.join}"}
+                      ].join "\t"
+    end
   end
 
   annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
 ensure
   clade_members_f.close