lib/tree_clusters.rb in tree_clusters-0.4.0 vs lib/tree_clusters.rb in tree_clusters-0.5.0
- old
+ new
@@ -1,8 +1,10 @@
require "abort_if"
require "Newick"
require "set"
+require "parse_fasta"
+require "shannon"
require "tree_clusters/version"
include AbortIf
include AbortIf::Assert
@@ -22,9 +24,99 @@
end
end
# Top level namespace of the Gem.
module TreeClusters
+
+ # Given an ary of strings, find the most common string in the ary.
+ #
+ # @param bases [Array<String>] an array of strings
+ #
+ # @return most_common_str [String] the most common string in the ary.
+ #
+ # @example Upper case and lower case count as the same.
+ # TreeClusters::consensus %w[a A C T] #=> "A"
+ # @example Ties take the one closest to the end
+ # TreeClusters::consensus %w[a c T t C t g] #=> "T"
+ #
+ # @note Each string is upcase'd before frequencies are calculated.
+ def consensus bases
+ bases.
+ map(&:upcase).
+ group_by(&:itself).
+ sort_by { |_, bases| bases.count }.
+ reverse.
+ first.
+ first
+ end
+
+ def read_alignment aln_fname
+ leaf2attrs = TreeClusters::Attrs.new
+ aln_len = nil
+ ParseFasta::SeqFile.open(aln_fname).each_record do |rec|
+ leaf2attrs[rec.id] = { aln: rec.seq.chars }
+
+ aln_len ||= rec.seq.length
+
+ abort_unless aln_len == rec.seq.length,
+ "Aln len mismatch for #{rec.id}"
+ end
+
+ leaf2attrs
+ end
+
+ def low_ent_cols leaves, leaf2attrs, entropy_cutoff
+ low_ent_cols = []
+ alns = leaf2attrs.attrs leaves, :aln
+ aln_cols = alns.transpose
+
+ aln_cols.each_with_index do |aln_col, aln_col_idx|
+ has_gaps = aln_col.any? { |aa| aa == "-" }
+ low_entropy =
+ Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff
+
+ if !has_gaps && low_entropy
+ low_ent_cols << (aln_col_idx + 1)
+ end
+ end
+
+ Set.new low_ent_cols
+ end
+
+ def check_ids tree, mapping, aln
+ tree_ids = Set.new(NewickTree.fromFile(tree).taxa)
+
+ mapping_ids = Set.new
+ File.open(mapping, "rt").each_line.with_index do |line, idx|
+ unless idx.zero?
+ id, *rest = line.chomp.split
+
+ mapping_ids << id
+ end
+ end
+
+ aln_ids = Set.new
+ ParseFasta::SeqFile.open(aln).each_record do |rec|
+ aln_ids << rec.id
+ end
+
+ if !(tree_ids == mapping_ids && mapping_ids == aln_ids)
+ AbortIf::logger.error { "Seq IDs did not match in all input files" }
+
+ tree_ids = tree_ids.to_a.sort
+ mapping_ids = mapping_ids.to_a.sort
+ aln_ids = aln_ids.to_a.sort
+
+ AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" }
+ AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" }
+ AbortIf::logger.debug { ["aln_ids", aln_ids].join "\t" }
+
+ raise AbortIf::Exit
+ else
+ true
+ end
+ end
+
# Given a NewickTree, return an array of all Clades in that tree.
#
# @param tree [NewickTree] a NewickTree object
#