lib/tree_clusters.rb in tree_clusters-0.8.2 vs lib/tree_clusters.rb in tree_clusters-0.8.3
- old
+ new
@@ -25,16 +25,24 @@
end
return clades
end
def unquoted_taxa
+ # @note self.taxa calls taxa method on the root of the tree.
self.taxa.map { |str| str.tr %q{"'}, "" }
end
end
+class NewickNode
+ def all_leaves
+ self.leaves.map { |n| n.name.tr %q{"'}, "" }
+ end
+end
+
# Top level namespace of the Gem.
module TreeClusters
+ PROJ_ROOT = File.join __dir__, ".."
# Given an ary of strings, find the most common string in the ary.
#
# @param bases [Array<String>] an array of strings
#
@@ -46,22 +54,28 @@
# TreeClusters::consensus %w[a c T t C t g] #=> "T"
#
# @note Each string is upcase'd before frequencies are calculated.
def consensus bases
bases.
- map(&:upcase).
- group_by(&:itself).
- sort_by { |_, bases| bases.count }.
- reverse.
- first.
- first
+ map(&:upcase).
+ group_by(&:itself).
+ sort_by { |_, bases| bases.count }.
+ reverse.
+ first.
+ first
end
def read_alignment aln_fname
leaf2attrs = TreeClusters::Attrs.new
- aln_len = nil
+ aln_len = nil
+ seq_num = 0
ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
+ seq_num += 1
+ if ((seq_num + 1) % 1000).zero?
+ STDERR.printf("Reading alignment: #{seq_num + 1}\r")
+ end
+
leaf2attrs[rec.id] = { aln: rec.seq.chars }
aln_len ||= rec.seq.length
abort_unless aln_len == rec.seq.length,
@@ -71,17 +85,17 @@
leaf2attrs
end
def low_ent_cols leaves, leaf2attrs, entropy_cutoff
low_ent_cols = []
- alns = leaf2attrs.attrs leaves, :aln
- aln_cols = alns.transpose
+ alns = leaf2attrs.attrs leaves, :aln
+ aln_cols = alns.transpose
aln_cols.each_with_index do |aln_col, aln_col_idx|
- has_gaps = aln_col.any? { |aa| aa == "-" }
+ has_gaps = aln_col.any? { |aa| aa == "-" }
low_entropy =
- Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
+ Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
if !has_gaps && low_entropy
low_ent_cols << (aln_col_idx + 1)
end
end
@@ -90,15 +104,15 @@
end
# Like low_ent_cols method but also returns the bases at the positions.
def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff
low_ent_cols = []
- alns = leaf2attrs.attrs leaves, :aln
- aln_cols = alns.transpose
+ alns = leaf2attrs.attrs leaves, :aln
+ aln_cols = alns.transpose
aln_cols.each_with_index do |aln_col, aln_col_idx|
- has_gaps = aln_col.any? { |aa| aa == "-" }
+ has_gaps = aln_col.any? { |aa| aa == "-" }
low_entropy =
Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
if !has_gaps && low_entropy
low_ent_cols << [(aln_col_idx + 1), aln_col.map(&:upcase).uniq.sort]
@@ -128,13 +142,13 @@
end
if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
AbortIf::logger.error { "Seq IDs did not match in all input files" }
- tree_ids = tree_ids.to_a.sort
+ tree_ids = tree_ids.to_a.sort
mapping_ids = mapping_ids.to_a.sort
- aln_ids = aln_ids.to_a.sort
+ aln_ids = aln_ids.to_a.sort
AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
AbortIf::logger.debug { ["aln_ids", aln_ids].join "\t" }
@@ -150,11 +164,11 @@
# @param tree [NewickTree] a NewickTree object
#
# @yieldparam clade [Clade] a clade of the tree
#
# @return [Enumerator<Clade>] enumerator of Clade objects
- def all_clades tree, metadata=nil
+ def all_clades tree, metadata = nil
return enum_for(:all_clades, tree, metadata) unless block_given?
tree.clade_nodes.reverse.each do |node|
yield Clade.new node, tree, metadata
end
@@ -162,25 +176,25 @@
def snazzy_clades tree, metadata
snazzy_clades = {}
clades = self.
- all_clades(tree, metadata).
- sort_by { |clade| clade.all_leaves.count }.
- reverse
+ all_clades(tree, metadata).
+ sort_by { |clade| clade.all_leaves.count }.
+ reverse
metadata.each do |md_cat, leaf2mdtag|
- already_checked = Set.new
+ already_checked = Set.new
single_tag_clades = {}
clades.each do |clade|
assert clade.all_leaves.count > 1,
"A clade cannot also be a leaf"
unless clade.all_leaves.all? do |leaf|
- already_checked.include? leaf
- end
+ already_checked.include? leaf
+ end
md_tags = clade.all_leaves.map do |leaf|
assert leaf2mdtag.has_key?(leaf),
"leaf #{leaf} is missing from leaf2mdtag ht"
leaf2mdtag[leaf]
@@ -222,31 +236,31 @@
def snazzy_info tree, metadata
snazzy_info = {}
clades = self.
- all_clades(tree, metadata).
- sort_by { |clade| clade.all_leaves.count }.
- reverse
+ all_clades(tree, metadata).
+ sort_by { |clade| clade.all_leaves.count }.
+ reverse
# Non snazzy clades have a value of nil, so set all to nil and the
# snazzy ones will be overwritten.
clades.each do |clade|
snazzy_info[clade] = nil
end
metadata.each do |md_cat, leaf2mdtag|
- already_checked = Set.new
+ already_checked = Set.new
single_tag_clades = {}
clades.each do |clade|
assert clade.all_leaves.count > 1,
"A clade cannot also be a leaf"
unless clade.all_leaves.all? do |leaf|
- already_checked.include? leaf
- end
+ already_checked.include? leaf
+ end
md_tags = clade.all_leaves.map do |leaf|
assert leaf2mdtag.has_key?(leaf),
"leaf #{leaf} is missing from leaf2mdtag ht"
leaf2mdtag[leaf]
@@ -287,11 +301,11 @@
snazzy_info
end
def read_mapping_file fname
md_cat_names = nil
- metadata = TreeClusters::Attrs.new
+ metadata = TreeClusters::Attrs.new
File.open(fname, "rt").each_line.with_index do |line, idx|
leaf_name, *metadata_vals = line.chomp.split "\t"
if idx.zero?
@@ -340,8 +354,8 @@
attrs[leaf][attr_name] << attr_val
end
end
end
- [attr_names, attrs]
+ [attr_names, attrs]
end
end