lib/tree_clusters.rb in tree_clusters-0.6.0 vs lib/tree_clusters.rb in tree_clusters-0.7.0
- old
+ new
@@ -1,10 +1,13 @@
require "abort_if"
require "Newick"
require "set"
require "parse_fasta"
require "shannon"
+require "tree_clusters/attrs"
+require "tree_clusters/attr_array"
+require "tree_clusters/clade"
require "tree_clusters/version"
include AbortIf
include AbortIf::Assert
@@ -196,10 +199,77 @@
end
snazzy_clades
end
+ def snazzy_info tree, metadata
+ snazzy_info = {}
+
+ clades = self.
+ all_clades(tree, metadata).
+ sort_by { |clade| clade.all_leaves.count }.
+ reverse
+
+ # Non snazzy clades have a value of nil, so set all to nil and the
+ # snazzy ones will be overwritten.
+ clades.each do |clade|
+ snazzy_info[clade] = nil
+ end
+
+ metadata.each do |md_cat, leaf2mdtag|
+ already_checked = Set.new
+ single_tag_clades = {}
+
+ clades.each do |clade|
+ assert clade.all_leaves.count > 1,
+ "A clade cannot also be a leaf"
+
+ unless clade.all_leaves.all? do |leaf|
+ already_checked.include? leaf
+ end
+ md_tags = clade.all_leaves.map do |leaf|
+ assert leaf2mdtag.has_key?(leaf),
+ "leaf #{leaf} is missing from leaf2mdtag ht"
+
+ leaf2mdtag[leaf]
+ end
+
+ # this clade is mono-phyletic w.r.t. this metadata category.
+ if md_tags.uniq.count == 1
+ clade.all_leaves.each do |leaf|
+ already_checked << leaf
+ end
+
+ assert !single_tag_clades.has_key?(clade),
+ "clade #{clade.name} is repeated in single_tag_clades for #{md_cat}"
+
+ single_tag_clades[clade] = md_tags.first
+ end
+ end
+ end
+
+ single_tag_clades.each do |clade, md_tag|
+ non_clade_leaves = tree.unquoted_taxa - clade.all_leaves
+
+ non_clade_leaves_with_this_md_tag = non_clade_leaves.map do |leaf|
+ [leaf, leaf2mdtag[leaf]]
+ end.select { |ary| ary.last == md_tag }
+
+ is_snazzy_clade = non_clade_leaves_with_this_md_tag.count.zero?
+ if is_snazzy_clade
+ if !snazzy_info[clade].nil?
+ snazzy_info[clade][md_cat] = md_tag
+ else
+ snazzy_info[clade] = { md_cat => md_tag }
+ end
+ end
+ end
+ end
+
+ snazzy_info
+ end
+
def read_mapping_file fname
md_cat_names = nil
metadata = TreeClusters::Attrs.new
File.open(fname, "rt").each_line.with_index do |line, idx|
@@ -252,197 +322,7 @@
end
end
end
[attr_names, attrs]
- end
-
- # A Hash table for genome/leaf/taxa attributes
- class Attrs < Hash
-
- # Returns the an AttrArray of Sets for the given genomes and
- # attribute.
- #
- # @note If a genome is in the leaves array, but is not in the hash
- # table, NO error will be raised. Rather that genome will be
- # skipped. This is for cases in which not all genomes have
- # attributes.
- #
- # @param leaves [Array<String>] names of the leaves for which you
- # need attributes
- # @param attr [Symbol] the attribute you are interested in eg,
- # :genes
- #
- # @return [AttrArray<Set>] an AttrArray of Sets of
- # attributes
- #
- # @raise [AbortIf::Exit] if they leaf is present but doesn't have
- # the requested attr
- def attrs leaves, attr
- ary = leaves.map do |leaf|
-
- if self.has_key? leaf
- abort_unless self[leaf].has_key?(attr),
- "Missing attr #{attr.inspect} for leaf '#{leaf}'"
-
- self[leaf][attr]
- else
- nil
- end
- end.compact
-
- TreeClusters::AttrArray.new ary
- end
-
- def add leaf, attr, val
- if self.has_key? leaf
- self[leaf][attr] = val
- else
- self[leaf] = { attr => val }
- end
- end
- end
-
- # Provides convenience methods for working with Arrays of Sets
- class AttrArray < Object::Array
- # Takes the union of all sets in the AttrArray
- #
- # @return [Set]
- def union
- self.reduce(&:union)
- end
-
- # Takes the intersection of all sets in the AttrArray
- #
- # @return [Set]
- def intersection
- self.reduce(&:intersection)
- end
- end
-
- # Represents a clade in a NewickTree
- class Clade
- attr_accessor :name,
- :all_leaves,
- :left_leaves,
- :right_leaves,
- :all_sibling_leaves,
- :each_sibling_leaf_set,
- :parent_leaves,
- :non_parent_leaves,
- :other_leaves,
- :single_tag_info,
- :all_tags
-
- # @note If a node name is quoted, then those quotes are removed
- # first.
- #
- # @param node [NewickNode] a NewickNode from a NewickTree
- # @param tree [NewickTree] a NewickTree
- def initialize node, tree, metadata=nil
- tree_taxa = tree.unquoted_taxa
-
- @name = unquote node.name
- @all_leaves = descendant_leaves node
-
- if (children = node.children).count == 2
- lchild, rchild = node.children
-
- @left_leaves = descendant_leaves lchild
-
- @right_leaves = descendant_leaves rchild
- end
-
- siblings = node.siblings
- # assert siblings.count == 1,
- # "Node #{node.name} has more than one sibling."
-
- @each_sibling_leaf_set = siblings.
- map { |node| descendant_leaves node }
-
- @all_sibling_leaves = @each_sibling_leaf_set.flatten.uniq
-
- parent = node.parent
- assert parent,
- "Noge #{node.name} has no parent. Is it the root?"
- @parent_leaves = descendant_leaves parent
-
- @other_leaves =
- Object::Set.new(tree_taxa) - Object::Set.new(all_leaves)
-
- @non_parent_leaves =
- Object::Set.new(tree_taxa) - Object::Set.new(parent_leaves)
-
- if metadata
- @metadata = metadata
- @all_tags ||= get_all_tags
- @single_tag_info ||= get_single_tag_info
- else
- @single_tag_info = nil
- end
- end
-
- # Compares two Clades field by field.
- #
- # If all instance variables are == than the two clades are == as
- # well.
- def == clade
- (
- self.name == clade.name &&
- self.all_leaves == clade.all_leaves &&
- self.left_leaves == clade.left_leaves &&
- self.right_leaves == clade.right_leaves &&
- self.all_sibling_leaves == clade.all_sibling_leaves &&
- self.each_sibling_leaf_set == clade.each_sibling_leaf_set &&
- self.parent_leaves == clade.parent_leaves &&
- self.other_leaves == clade.other_leaves &&
- self.single_tag_info == clade.single_tag_info &&
- self.all_tags == clade.all_tags
- )
- end
-
- # Alias for ==
- def eql? clade
- self == clade
- end
-
- private
-
- def get_single_tag_info
- @all_tags.map do |md_cat, set|
- [md_cat, set.count == 1 ? set.to_a.first : nil]
- end.to_h
- end
-
- def get_all_tags
- # name2tag has leaf names => metadata tag and is an Attrs
- @metadata.map do |md_cat, name2tag|
- tag_info = self.all_leaves.map do |leaf|
- assert name2tag.has_key?(leaf),
- "leaf #{leaf} is not present in name2tag ht for " +
- "md_cat #{md_cat}"
-
- name2tag[leaf]
- end
-
- [md_cat, Set.new(tag_info)]
- end.to_h
- end
-
- def descendant_leaves node
- if node.leaf?
- [unquote(node.name)]
- else
- node.
- descendants.
- flatten.
- uniq.
- select { |node| node.leaf? }.
- map { |node| unquote(node.name) }
- end
- end
-
- def unquote str
- str.tr %q{"'}, ""
- end
end
end