exe/key_cols in tree_clusters-0.8.2 vs exe/key_cols in tree_clusters-0.8.3
- old
+ new
@@ -1,7 +1,9 @@
#!/usr/bin/env ruby
+# TODO: the block variables called "clade" aren't actually instances of the TreeClusters::Clade class (they are the nodes returned by tree.clade_nodes). It's kind of confusing.
+
Signal.trap("PIPE", "EXIT")
require "tree_clusters"
require "trollop"
require "parse_fasta"
@@ -52,10 +54,12 @@
Hi. My name is key_cols. If you give me a Newick tree file and an
alignment file (fasta format), I will tell you key columns for all
clades/clusters that have them.
+ Version: v#{TreeClusters::VERSION}
+
Overview
--------
A clade has key columns if you can use the residue/nucleotide at
those columns to tell sequences in the clade from sequences outside
@@ -145,13 +149,14 @@
abort_unless opts[:clade_size_cutoff] >= 1,
"--clade-size-cutoff must be >= 1"
FileUtils.mkdir_p opts[:outdir]
-AbortIf.logger.info { "Parsing input files" }
+AbortIf.logger.info { "Reading tree" }
+tree = NewickTree.fromFile opts[:tree]
-tree = NewickTree.fromFile opts[:tree]
+AbortIf.logger.info { "Reading alignment" }
leaf2attrs = TreeClusters.read_alignment opts[:aln]
members_fname =
File.join opts[:outdir],
"#{opts[:base]}.tree_clusters.clade_members.txt"
@@ -167,26 +172,41 @@
key_cols_f =
File.open(key_cols_fname, "w")
annotated_tree_f =
File.open(annotated_tree_fname, "w")
-key_col_sets = {}
-clade_sizes = {}
-clade_count = TreeClusters.all_clades(tree).count
+key_col_sets = {}
+clade_sizes = {}
+# AbortIf.logger.info { "Counting clades" }
+# clade_count = TreeClusters.all_clades(tree).count
change_these_names = Set.new
# Returns the clades of `tree` ordered by leaf count, largest first
# (ascending sort_by on all_leaves.count, then reversed).
#
# The removed (-) line built the list with TreeClusters.all_clades(tree); the
# added (+) lines read tree.clade_nodes directly, so the elements are the
# tree's own nodes rather than TreeClusters::Clade instances.
#
# NOTE(review): Ruby's sort_by is not guaranteed to be stable, so the relative
# order of clades with equal leaf counts is presumably unspecified -- confirm
# whether the "clade_N" ids derived from this order need to be reproducible.
def all_clades_helper tree
- TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
+ # TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
+ tree.clade_nodes.
+ reverse.
+ sort_by { |node| node.all_leaves.count }.
+ reverse
end
-AbortIf.logger.info { "Processing clades" }
+AbortIf.logger.info { "Getting all clades" }
+clades = all_clades_helper(tree)
+clade_count = clades.count
+
+AbortIf.logger.info { "Processing clades (The first few clades go reaaally slowly, but then I speed up!)" }
begin
- all_clades_helper(tree).each_with_index do |clade, idx|
- if ((idx + 1) % 100).zero?
+ clades.each_with_index do |clade, idx|
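+ # Processing starts off really slowly, then speeds up a lot, so progress is reported for every clade at first and progressively less often as idx grows.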
+ if (idx+1) < 100 ||
+ ((idx+1) < 1000 && ((idx + 1) % 10).zero?) ||
+ ((idx+1) < 10000 && ((idx + 1) % 100).zero?) ||
+ ((idx+1) < 100000 && ((idx + 1) % 1000).zero?) ||
+ ((idx + 1) % 10000).zero?
+
perc = ((idx + 1) / clade_count.to_f * 100).round 2
- STDERR.printf("Processing clades: #{perc}%\r")
+ STDERR.printf("Processing clades: #{idx + 1} of #{clade_count} (#{perc}%%)\r")
end
clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
clade_sizes[clade_id] = clade.all_leaves.count
@@ -222,15 +242,15 @@
end
end
AbortIf.logger.info { "Annotating tree" }
- all_clades_helper(tree).each_with_index do |clade, idx|
+ clades.each_with_index do |clade, idx|
clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
if change_these_names.include? clade_id
# This will change the node in the original NewickTree
- clade.node.name = "'#{clade_id}'"
+ clade.name = "'#{clade_id}'"
end
end
annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
ensure