#!/usr/bin/env ruby

# Command-line driver around TreeClusters.snazzy_clades.
#
# Reads a Newick tree, a mapping (metadata) file and an alignment, then
# for every clade returned by TreeClusters.snazzy_clades writes:
#
#   <base>.snazzy_clades.txt
#     clade id, number of metadata pairs, the pairs as "category|tag"
#   <base>.snazzy_clades_clade_members.txt
#     clade id, number of leaves, the leaves
#   <base>.snazzy_clades_key_cols.txt
#     clade id, number of low entropy columns, the columns
#   <base>.snazzy_clades_key_cols_minus_parent_cols.txt
#     as above, minus the low entropy columns of the parent leaves
#   <base>.snazzy_clades_key_cols_minus_sibling_cols.txt
#     as above, minus the low entropy columns of the sibling leaves
#
# All records are tab separated, one clade per line.

# Terminate the script when SIGPIPE is received.
Signal.trap("PIPE", "EXIT")

require "tree_clusters"
require "trollop"
require "parse_fasta"
require "shannon"
require "fileutils"

TreeClusters.extend TreeClusters

# Write one tab separated record describing the key columns of a clade.
#
# @param outf [#puts] destination stream
# @param clade_id [String] identifier written as the first field
# @param key_cols [#count, #to_a] collection of key columns
#
# The record is: clade id, number of columns, then each column.
# (Array#join flattens the nested array, so every column becomes its
# own tab separated field.)
def puts_info(outf, clade_id, key_cols)
  outf.puts [clade_id, key_cols.count, key_cols.to_a].join("\t")
end

opts = Trollop.options do
  version TreeClusters::VERSION

  banner <<-EOS

  Checking IDs
  ------------

  IDs for the sequences must match between the three input files.

  The tree file is allowed to have quoted taxa names, but the mapping
  file and alignment file are not.

  If your alignment file has spaces in the name, the ID part of the
  header (i.e., the part up until the space) must match with the
  sequence IDs in the tree and the mapping file.

  Example: This would be okay.

  tree file:
  ('genome_A', 'genome_B');

  aln file:
  >genome_A apple pie
  AAAAA
  >genome_B brown sugar
  AATTA

  mapping file:
  name coolness
  genome_A cool
  genome_B notcool

  Subtracting parent nodes
  ------------------------

  If a clade's parent would be the root of the tree, no columns will
  be subtracted when removing the parent columns as it would be the
  entire alignment.

  Options:
  EOS

  opt(:tree, "Newick tree file", type: :string)
  opt(:mapping, "Mapping file", type: :string)
  opt(:aln, "Alignment file", type: :string)
  opt(:entropy_cutoff,
      "Cutoff to consider a column low entropy",
      default: 0.0)
  opt(:clade_size_cutoff,
      "Consider only clades with at least this many leaves",
      default: 1)
  opt(:outdir, "Output directory", default: ".")
  opt(:base, "Basename for output", default: "snazzy_clades")
end

# Required arguments must be given...
abort_if opts[:tree].nil?, "--tree is a required arg"
abort_if opts[:mapping].nil?, "--mapping is a required arg"
abort_if opts[:aln].nil?, "--aln is a required arg"

# ...and must point at existing files...
abort_unless_file_exists opts[:tree]
abort_unless_file_exists opts[:mapping]
abort_unless_file_exists opts[:aln]

# ...whose sequence IDs agree with each other (see the banner above).
TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln]

abort_unless opts[:entropy_cutoff] >= 0,
             "--entropy-cutoff must be >= 0"
abort_unless opts[:clade_size_cutoff] >= 1,
             "--clade-size-cutoff must be >= 1"

FileUtils.mkdir_p opts[:outdir]

tree          = NewickTree.fromFile opts[:tree]
metadata      = TreeClusters.read_mapping_file opts[:mapping]
snazzy_clades = TreeClusters.snazzy_clades tree, metadata
leaf2attrs    = TreeClusters.read_alignment opts[:aln]

# Every output file lives in --outdir and is named "<base>.<suffix>".
out_path = ->(suffix) { File.join opts[:outdir], "#{opts[:base]}.#{suffix}" }

clades_fname   = out_path.call("snazzy_clades.txt")
members_fname  = out_path.call("snazzy_clades_clade_members.txt")
key_cols_fname = out_path.call("snazzy_clades_key_cols.txt")
key_cols_minus_parent_cols_fname =
  out_path.call("snazzy_clades_key_cols_minus_parent_cols.txt")
key_cols_minus_sibling_cols_fname =
  out_path.call("snazzy_clades_key_cols_minus_sibling_cols.txt")

# Handles are collected as they are opened so that the ensure clause
# closes exactly the ones that exist, even when a later open fails.
out_files = []

begin
  info_f,
  clade_members_f,
  key_cols_f,
  key_cols_minus_parent_cols_f,
  key_cols_minus_sibling_cols_f = [
    clades_fname,
    members_fname,
    key_cols_fname,
    key_cols_minus_parent_cols_fname,
    key_cols_minus_sibling_cols_fname
  ].map { |fname| File.open(fname, "w").tap { |f| out_files << f } }

  # info is { metadata_category => metadata_tag , ... }
  snazzy_clades.each_with_index do |(clade, info), idx|
    clade_id = "clade_#{idx+1}___#{clade.name}"

    # Looked up once per clade and reused below.
    # NOTE(review): assumes all_leaves has no side effects -- confirm.
    all_leaves = clade.all_leaves

    info_f.puts [clade_id,
                 info.count,
                 info.map { |pair| pair.join("|") }].join("\t")

    clade_members_f.puts [clade_id,
                          all_leaves.count,
                          all_leaves].join("\t")

    key_cols_all_leaves =
      TreeClusters.low_ent_cols all_leaves,
                                leaf2attrs,
                                opts[:entropy_cutoff]
    key_cols_all_sibling_leaves =
      TreeClusters.low_ent_cols clade.all_sibling_leaves,
                                leaf2attrs,
                                opts[:entropy_cutoff]
    key_cols_parent_leaves =
      TreeClusters.low_ent_cols clade.parent_leaves,
                                leaf2attrs,
                                opts[:entropy_cutoff]

    key_cols_all_minus_sibling =
      key_cols_all_leaves - key_cols_all_sibling_leaves
    key_cols_all_minus_parent =
      key_cols_all_leaves - key_cols_parent_leaves

    puts_info key_cols_f, clade_id, key_cols_all_leaves
    puts_info key_cols_minus_parent_cols_f,
              clade_id,
              key_cols_all_minus_parent
    puts_info key_cols_minus_sibling_cols_f,
              clade_id,
              key_cols_all_minus_sibling
  end
ensure
  out_files.each(&:close)
end