exe/key_cols in tree_clusters-0.8.0 vs exe/key_cols in tree_clusters-0.8.1
- old
+ new
@@ -7,11 +7,12 @@
require "parse_fasta"
require "fileutils"
TreeClusters.extend TreeClusters
-GREETING = "The '#{__FILE__}' program"
+PROGRAM = "key_cols"
+GREETING = "The #{PROGRAM} program"
UNDERLINE = "=" * GREETING.length
opts = Trollop.options do
version TreeClusters::VERSION
@@ -44,29 +45,28 @@
# Options:
# EOS
banner <<-EOS
-#{GREETING}
-#{UNDERLINE}
+The key_cols program
+====================
- Hi. My name is #{__FILE__}. If you give me a Newick tree file and
- an alignment file (fasta format), I will tell you key columns for
- all clades/clusters that have them.
+ Hi. My name is key_cols. If you give me a Newick tree file and an
+ alignment file (fasta format), I will tell you key columns for all
+ clades/clusters that have them.
Overview
--------
A clade has key columns if you can use the residue/nucleotide at
those columns to tell sequences in the clade from sequences outside
of the clade.
Here's an example....
- After you run me (#{__FILE__} is my name), you'll get an output file
- with the extension, '*.tree_clusters.key_cols.txt'. It may look
- something like this:
+ After you run me, you'll get an output file with the extension,
+ '*.tree_clusters.key_cols.txt'. It may look something like this:
cluster_A 4 1-A 2-A 3-A 5-G
cluster_B 4 1-C 2-C 3-C 5-A
This file has the clade name, the number of key columns for that
@@ -91,10 +91,14 @@
--------------
- I ignore columns with gap chars (currently just '-') regardless of
column entropy.
+ - If you're Newick tree has parentheses "(" or ")" in leaf names
+ you'll get a parsing error even if the name is single quoted.
+ We'll fix this at some point.
+
Option info
-----------
--entropy-cutoff: A cutoff of 0 means that you allow no variation at
any column.
@@ -118,20 +122,20 @@
"Consider only clades with at least this many leaves",
default: 1)
opt(:outdir,
"Output directory",
- default: ".")
+ default: "key_cols_output")
opt(:base,
"Basename for output",
- default: "snazzy_clades")
+ default: "key_cols_output")
end
abort_if opts[:tree].nil?,
- "--tree is a required arg. Try running: #{__FILE__} --help"
+ "--tree is a required arg. Try running: #{PROGRAM} --help"
abort_if opts[:aln].nil?,
- "--aln is a required arg. Try running: #{__FILE__} --help"
+ "--aln is a required arg. Try running: #{PROGRAM} --help"
abort_unless_file_exists opts[:tree]
abort_unless_file_exists opts[:aln]
# TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]
@@ -141,62 +145,78 @@
abort_unless opts[:clade_size_cutoff] >= 1,
"--clade-size-cutoff must be >= 1"
FileUtils.mkdir_p opts[:outdir]
+AbortIf.logger.info { "Parsing input files" }
+
tree = NewickTree.fromFile opts[:tree]
leaf2attrs = TreeClusters.read_alignment opts[:aln]
-members_fname =
- File.join opts[:outdir],
- "#{opts[:base]}.tree_clusters.clade_members.txt"
-key_cols_fname =
- File.join opts[:outdir],
- "#{opts[:base]}.tree_clusters.key_cols.txt"
+members_fname =
+ File.join opts[:outdir],
+ "#{opts[:base]}.tree_clusters.clade_members.txt"
+key_cols_fname =
+ File.join opts[:outdir],
+ "#{opts[:base]}.tree_clusters.key_cols.txt"
annotated_tree_fname =
- File.join opts[:outdir],
- "#{opts[:base]}.tree_clusters.annotated_tree.txt"
+ File.join opts[:outdir],
+ "#{opts[:base]}.tree_clusters.annotated_tree.txt"
-clade_members_f =
- File.open(members_fname, "w")
-key_cols_f =
- File.open(key_cols_fname, "w")
-annotated_tree_f =
- File.open(annotated_tree_fname, "w")
+clade_members_f =
+ File.open(members_fname, "w")
+key_cols_f =
+ File.open(key_cols_fname, "w")
+annotated_tree_f =
+ File.open(annotated_tree_fname, "w")
key_col_sets = {}
+clade_sizes = {}
+clade_count = TreeClusters.all_clades(tree).count
+AbortIf.logger.info { "Processing clades" }
begin
TreeClusters.all_clades(tree).sort_by {|cl| cl.all_leaves.count}.reverse.each_with_index do |clade, idx|
- clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
+ if ((idx + 1) % 100).zero?
+ perc = ((idx + 1) / clade_count.to_f * 100).round 2
+ STDERR.printf("Processing clades: #{perc}%\r")
+ end
+
+ clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}"
+ clade_sizes[clade_id] = clade.all_leaves.count
+
clade_members_f.puts [clade_id,
clade.all_leaves.count,
clade.all_leaves].join "\t"
- key_cols_all_leaves =
- TreeClusters.low_ent_cols_with_bases clade.all_leaves,
- leaf2attrs,
- opts[:entropy_cutoff]
+ key_cols_all_leaves =
+ TreeClusters.low_ent_cols_with_bases clade.all_leaves,
+ leaf2attrs,
+ opts[:entropy_cutoff]
unless key_col_sets.has_key? key_cols_all_leaves
key_col_sets[key_cols_all_leaves] = Set.new [clade_id]
end
key_col_sets[key_cols_all_leaves] << clade_id
# This will change the node in the original NewickTree
clade.node.name = "'#{clade_id}'"
-
end
+ AbortIf.logger.info { "Writing results" }
+
# We only want key column sets that are unique to a single clade.
- key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades|
+ key_col_sets.select {|_, clades| clades.count == 1}.each do |kc_set, clades|
clade_id = clades.first
- key_cols_f.puts [
- clade_id,
- kc_set.count,
- kc_set.map { |pos, bases| "#{pos}-#{bases.join}" }
- ].join "\t"
+
+ # TODO should we just skip processing clades that are too small rather than just not printing them out?
+ if clade_sizes[clade_id] > opts[:clade_size_cutoff]
+ key_cols_f.puts [clade_id,
+ kc_set.count,
+ kc_set.map {|pos, bases| "#{pos}-#{bases.join}"}
+ ].join "\t"
+ end
end
annotated_tree_f.puts tree.to_s.sub(/;+$/, ";")
ensure
clade_members_f.close