exe/key_cols in tree_clusters-0.8.2 vs exe/key_cols in tree_clusters-0.8.3

- old
+ new

@@ -1,7 +1,9 @@ #!/usr/bin/env ruby +# TODO the block variables called "clade" aren't actually instances of the TreeClusters::Clade class. It's kind of confusing. + Signal.trap("PIPE", "EXIT") require "tree_clusters" require "trollop" require "parse_fasta" @@ -52,10 +54,12 @@ Hi. My name is key_cols. If you give me a Newick tree file and an alignment file (fasta format), I will tell you key columns for all clades/clusters that have them. + Version: v#{TreeClusters::VERSION} + Overview -------- A clade has key columns if you can use the residue/nucleotide at those columns to tell sequences in the clade from sequences outside @@ -145,13 +149,14 @@ abort_unless opts[:clade_size_cutoff] >= 1, "--clade-size-cutoff must be >= 1" FileUtils.mkdir_p opts[:outdir] -AbortIf.logger.info { "Parsing input files" } +AbortIf.logger.info { "Reading tree" } +tree = NewickTree.fromFile opts[:tree] -tree = NewickTree.fromFile opts[:tree] +AbortIf.logger.info { "Reading alignment" } leaf2attrs = TreeClusters.read_alignment opts[:aln] members_fname = File.join opts[:outdir], "#{opts[:base]}.tree_clusters.clade_members.txt" @@ -167,26 +172,41 @@ key_cols_f = File.open(key_cols_fname, "w") annotated_tree_f = File.open(annotated_tree_fname, "w") -key_col_sets = {} -clade_sizes = {} -clade_count = TreeClusters.all_clades(tree).count +key_col_sets = {} +clade_sizes = {} +# AbortIf.logger.info { "Counting clades" } +# clade_count = TreeClusters.all_clades(tree).count change_these_names = Set.new def all_clades_helper tree - TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse + # TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse + tree.clade_nodes. + reverse. + sort_by { |node| node.all_leaves.count }. + reverse end -AbortIf.logger.info { "Processing clades" } +AbortIf.logger.info { "Getting all clades" } +clades = all_clades_helper(tree) +clade_count = clades.count + +AbortIf.logger.info { "Processing clades (The first few clades go reaaally slowly, but then I speed up!)" } begin - all_clades_helper(tree).each_with_index do |clade, idx| - if ((idx + 1) % 100).zero? + clades.each_with_index do |clade, idx| + # It starts off really slowly, then speeds up a lot. + if (idx+1) < 100 || + ((idx+1) < 1000 && ((idx + 1) % 10).zero?) || + ((idx+1) < 10000 && ((idx + 1) % 100).zero?) || + ((idx+1) < 100000 && ((idx + 1) % 1000).zero?) || + ((idx + 1) % 10000).zero? + perc = ((idx + 1) / clade_count.to_f * 100).round 2 - STDERR.printf("Processing clades: #{perc}%\r") + STDERR.printf("Processing clades: #{idx + 1} of #{clade_count} (#{perc}%%)\r") end clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}" clade_sizes[clade_id] = clade.all_leaves.count @@ -222,15 +242,15 @@ end end AbortIf.logger.info { "Annotating tree" } - all_clades_helper(tree).each_with_index do |clade, idx| + clades.each_with_index do |clade, idx| clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}" if change_these_names.include? clade_id # This will change the node in the original NewickTree - clade.node.name = "'#{clade_id}'" + clade.name = "'#{clade_id}'" end end annotated_tree_f.puts tree.to_s.sub(/;+$/, ";") ensure