exe/key_cols in tree_clusters-0.8.1 vs exe/key_cols in tree_clusters-0.8.2

- old
+ new

@@ -167,17 +167,22 @@
 key_cols_f = File.open(key_cols_fname, "w")
 annotated_tree_f = File.open(annotated_tree_fname, "w")
-key_col_sets = {}
-clade_sizes = {}
-clade_count = TreeClusters.all_clades(tree).count
+key_col_sets = {}
+clade_sizes = {}
+clade_count = TreeClusters.all_clades(tree).count
+change_these_names = Set.new
+def all_clades_helper tree
+  TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse
+end
+
 AbortIf.logger.info { "Processing clades" }
 begin
-  TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
+  all_clades_helper(tree).each_with_index do |clade, idx|
     if ((idx + 1) % 100).zero?
       perc = ((idx + 1) / clade_count.to_f * 100).round 2
       STDERR.printf("Processing clades: #{perc}%\r")
     end
@@ -197,25 +202,35 @@
     unless key_col_sets.has_key? key_cols_all_leaves
       key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
     end
     key_col_sets[key_cols_all_leaves] << clade_id
-    # This will change the node in the original NewickTree
-    clade.node.name = "'#{clade_id}'"
   end
   AbortIf.logger.info { "Writing results" }
   # We only want key column sets that are unique to a single clade.
-  key_col_sets.select {|_, clades| clades.count == 1}.each do |kc_set, clades|
+  key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades|
     clade_id = clades.first
     # TODO should we just skip processing clades that are too small rather than just not printing them out?
-    if clade_sizes[clade_id] > opts[:clade_size_cutoff]
+    if clade_sizes[clade_id] >= opts[:clade_size_cutoff]
+      change_these_names << clade_id
       key_cols_f.puts [clade_id, kc_set.count,
-                       kc_set.map {|pos, bases| "#{pos}-#{bases.join}"}
+                       kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
       ].join "\t"
+    end
+  end
+
+  AbortIf.logger.info { "Annotating tree" }
+
+  all_clades_helper(tree).each_with_index do |clade, idx|
+    clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
+
+    if change_these_names.include? clade_id
+      # This will change the node in the original NewickTree
+      clade.node.name = "'#{clade_id}'"
     end
   end
   annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
 ensure