key_cols in tree_clusters-0.8.2

- old
+ new

@@ -167,17 +167,22 @@
 key_cols_f       =
     File.open(key_cols_fname, "w")
 annotated_tree_f =
     File.open(annotated_tree_fname, "w")
 
-key_col_sets = {}
-clade_sizes  = {}
-clade_count  = TreeClusters.all_clades(tree).count
+key_col_sets       = {}
+clade_sizes        = {}
+clade_count        = TreeClusters.all_clades(tree).count
+change_these_names = Set.new
 
+def all_clades_helper(tree) # all clades of tree, highest all_leaves count first
+  TreeClusters.all_clades(tree).sort_by { |clade| clade.all_leaves.count }.reverse
+end
+
 AbortIf.logger.info { "Processing clades" }
 begin
-  TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
+  all_clades_helper(tree).each_with_index do |clade, idx|
     if ((idx + 1) % 100).zero?
       perc = ((idx + 1) / clade_count.to_f * 100).round 2
 
       STDERR.printf("Processing clades: #{perc}%\r")
     end
@@ -197,25 +202,35 @@
     unless key_col_sets.has_key? key_cols_all_leaves
       key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
     end
     key_col_sets[key_cols_all_leaves] << clade_id
 
-    # This will change the node in the original NewickTree
-    clade.node.name = "'#{clade_id}'"
   end
 
   AbortIf.logger.info { "Writing results" }
 
   # We only want key column sets that are unique to a single clade.
-  key_col_sets.select {|_, clades| clades.count == 1}.each do |kc_set, clades|
+  key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades|
     clade_id = clades.first
 
     # TODO should we just skip processing clades that are too small rather than just not printing them out?
-    if clade_sizes[clade_id] > opts[:clade_size_cutoff]
+    if clade_sizes[clade_id] >= opts[:clade_size_cutoff]
+      change_these_names << clade_id
       key_cols_f.puts [clade_id,
                        kc_set.count,
-                       kc_set.map {|pos, bases| "#{pos}-#{bases.join}"}
+                       kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
                       ].join "\t"
+    end
+  end
+
+  AbortIf.logger.info { "Annotating tree" }
+
+  all_clades_helper(tree).each_with_index do |clade, idx|
+    clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
+
+    if change_these_names.include? clade_id
+      # This will change the node in the original NewickTree
+      clade.node.name = "'#{clade_id}'"
     end
   end
 
   annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
 ensure