lib/tree_clusters.rb in tree_clusters-0.8.2 vs lib/tree_clusters.rb in tree_clusters-0.8.3

- old
+ new

@@ -25,16 +25,24 @@ end return clades end def unquoted_taxa + # @note self.taxa calls taxa method on the root of the tree. self.taxa.map { |str| str.tr %q{"'}, "" } end end +class NewickNode + def all_leaves + self.leaves.map { |n| n.name.tr %q{"'}, "" } + end +end + # Top level namespace of the Gem. module TreeClusters + PROJ_ROOT = File.join __dir__, ".." # Given an ary of strings, find the most common string in the ary. # # @param bases [Array<String>] an array of strings # @@ -46,22 +54,28 @@ # TreeClusters::consensus %w[a c T t C t g] #=> "T" # # @note Each string is upcase'd before frequencies are calculated. def consensus bases bases. - map(&:upcase). - group_by(&:itself). - sort_by { |_, bases| bases.count }. - reverse. - first. - first + map(&:upcase). + group_by(&:itself). + sort_by { |_, bases| bases.count }. + reverse. + first. + first end def read_alignment aln_fname leaf2attrs = TreeClusters::Attrs.new - aln_len = nil + aln_len = nil + seq_num = 0 ParseFasta::SeqFile.open(aln_fname).each_record do |rec| + seq_num += 1 + if ((seq_num + 1) % 1000).zero? + STDERR.printf("Reading alignment: #{seq_num + 1}\r") + end + leaf2attrs[rec.id] = { aln: rec.seq.chars } aln_len ||= rec.seq.length abort_unless aln_len == rec.seq.length, @@ -71,17 +85,17 @@ leaf2attrs end def low_ent_cols leaves, leaf2attrs, entropy_cutoff low_ent_cols = [] - alns = leaf2attrs.attrs leaves, :aln - aln_cols = alns.transpose + alns = leaf2attrs.attrs leaves, :aln + aln_cols = alns.transpose aln_cols.each_with_index do |aln_col, aln_col_idx| - has_gaps = aln_col.any? { |aa| aa == "-" } + has_gaps = aln_col.any? { |aa| aa == "-" } low_entropy = - Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff + Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff if !has_gaps && low_entropy low_ent_cols << (aln_col_idx + 1) end end @@ -90,15 +104,15 @@ end # Like low_ent_cols method but also returns the bases at the positions. def low_ent_cols_with_bases leaves, leaf2attrs, entropy_cutoff low_ent_cols = [] - alns = leaf2attrs.attrs leaves, :aln - aln_cols = alns.transpose + alns = leaf2attrs.attrs leaves, :aln + aln_cols = alns.transpose aln_cols.each_with_index do |aln_col, aln_col_idx| - has_gaps = aln_col.any? { |aa| aa == "-" } + has_gaps = aln_col.any? { |aa| aa == "-" } low_entropy = Shannon::entropy(aln_col.join.upcase) <= entropy_cutoff if !has_gaps && low_entropy low_ent_cols << [(aln_col_idx + 1), aln_col.map(&:upcase).uniq.sort] @@ -128,13 +142,13 @@ end if !(tree_ids == mapping_ids && mapping_ids == aln_ids) AbortIf::logger.error { "Seq IDs did not match in all input files" } - tree_ids = tree_ids.to_a.sort + tree_ids = tree_ids.to_a.sort mapping_ids = mapping_ids.to_a.sort - aln_ids = aln_ids.to_a.sort + aln_ids = aln_ids.to_a.sort AbortIf::logger.debug { ["tree_ids", tree_ids].join "\t" } AbortIf::logger.debug { ["mapping_ids", mapping_ids].join "\t" } AbortIf::logger.debug { ["aln_ids", aln_ids].join "\t" } @@ -150,11 +164,11 @@ # @param tree [NewickTree] a NewickTree object # # @yieldparam clade [Clade] a clade of the tree # # @return [Enumerator<Clade>] enumerator of Clade objects - def all_clades tree, metadata=nil + def all_clades tree, metadata = nil return enum_for(:all_clades, tree, metadata) unless block_given? tree.clade_nodes.reverse.each do |node| yield Clade.new node, tree, metadata end @@ -162,25 +176,25 @@ def snazzy_clades tree, metadata snazzy_clades = {} clades = self. - all_clades(tree, metadata). - sort_by { |clade| clade.all_leaves.count }. - reverse + all_clades(tree, metadata). + sort_by { |clade| clade.all_leaves.count }. + reverse metadata.each do |md_cat, leaf2mdtag| - already_checked = Set.new + already_checked = Set.new single_tag_clades = {} clades.each do |clade| assert clade.all_leaves.count > 1, "A clade cannot also be a leaf" unless clade.all_leaves.all? do |leaf| - already_checked.include? leaf - end + already_checked.include? leaf + end md_tags = clade.all_leaves.map do |leaf| assert leaf2mdtag.has_key?(leaf), "leaf #{leaf} is missing from leaf2mdtag ht" leaf2mdtag[leaf] @@ -222,31 +236,31 @@ def snazzy_info tree, metadata snazzy_info = {} clades = self. - all_clades(tree, metadata). - sort_by { |clade| clade.all_leaves.count }. - reverse + all_clades(tree, metadata). + sort_by { |clade| clade.all_leaves.count }. + reverse # Non snazzy clades have a value of nil, so set all to nil and the # snazzy ones will be overwritten. clades.each do |clade| snazzy_info[clade] = nil end metadata.each do |md_cat, leaf2mdtag| - already_checked = Set.new + already_checked = Set.new single_tag_clades = {} clades.each do |clade| assert clade.all_leaves.count > 1, "A clade cannot also be a leaf" unless clade.all_leaves.all? do |leaf| - already_checked.include? leaf - end + already_checked.include? leaf + end md_tags = clade.all_leaves.map do |leaf| assert leaf2mdtag.has_key?(leaf), "leaf #{leaf} is missing from leaf2mdtag ht" leaf2mdtag[leaf] @@ -287,11 +301,11 @@ snazzy_info end def read_mapping_file fname md_cat_names = nil - metadata = TreeClusters::Attrs.new + metadata = TreeClusters::Attrs.new File.open(fname, "rt").each_line.with_index do |line, idx| leaf_name, *metadata_vals = line.chomp.split "\t" if idx.zero? @@ -340,8 +354,8 @@ attrs[leaf][attr_name] << attr_val end end end - [attr_names, attrs] + [attr_names, attrs] end end