#!/usr/bin/env ruby Signal.trap("PIPE", "EXIT") require "tree_clusters" require "trollop" require "parse_fasta" require "fileutils" TreeClusters.extend TreeClusters PROGRAM = "key_cols" GREETING = "The #{PROGRAM} program" UNDERLINE = "=" * GREETING.length opts = Trollop.options do version TreeClusters::VERSION # banner <<-EOS # Checking IDs # ------------ # IDs for the sequences must match between the three input files. # The tree file is allowed to have quoted taxa names, but the # mapping file and alignment file are not. # If your alignment file has spaces in the name, the ID part of the # header (i.e., the part up until the space) must match with the # sequence IDs in the tree and the mapping file. # Example: This would be okay. # tree file: # ('genome_A', 'genome_B'); # aln file: # >genome_A apple pie # AAAAA # >genome_B brown sugar # AATTA # Options: # EOS banner <<-EOS The key_cols program ==================== Hi. My name is key_cols. If you give me a Newick tree file and an alignment file (fasta format), I will tell you key columns for all clades/clusters that have them. Overview -------- A clade has key columns if you can use the residue/nucleotide at those columns to tell sequences in the clade from sequences outside of the clade. Here's an example.... After you run me, you'll get an output file with the extension, '*.tree_clusters.key_cols.txt'. It may look something like this: cluster_A 4 1-A 2-A 3-A 5-G cluster_B 4 1-C 2-C 3-C 5-A This file has the clade name, the number of key columns for that clade, and then the rest of the columns tell you the position (1-based) and the nucleotide or residue in that column in all sequences of that clade. In this case we have only two clades. The key columns for both are 1, 2, 3, and 5. So you can use columns 1, 2, 3, and 5 to classify a sequence as belonging to one of these clades. If it has A, A, A, and G in those positions, it'll be in cluster_A, and if it has C, C, C, and A in those positions, it'll be in cluster_B. If it has any other combination in those 4 columns of the alignment, it won't be in either clade. This is just a silly example and most of the time you'll get different key columns for different clades. Note that every clade may not have key columns listed depending on your data and the options you select. Notes & Gotchas -------------- - I ignore columns with gap chars (currently just '-') regardless of column entropy. - If you're Newick tree has parentheses "(" or ")" in leaf names you'll get a parsing error even if the name is single quoted. We'll fix this at some point. Option info ----------- --entropy-cutoff: A cutoff of 0 means that you allow no variation at any column. --clade-size-cutoff: Use this option to ignore tiny clades. Options: EOS opt(:tree, "Newick tree file", type: :string) opt(:aln, "Alignment file", type: :string) opt(:entropy_cutoff, "Cutoff to consider a column low entropy", default: 0.0) opt(:clade_size_cutoff, "Consider only clades with at least this many leaves", default: 1) opt(:outdir, "Output directory", default: "key_cols_output") opt(:base, "Basename for output", default: "key_cols_output") end abort_if opts[:tree].nil?, "--tree is a required arg. Try running: #{PROGRAM} --help" abort_if opts[:aln].nil?, "--aln is a required arg. Try running: #{PROGRAM} --help" abort_unless_file_exists opts[:tree] abort_unless_file_exists opts[:aln] # TreeClusters.check_ids opts[:tree], opts[:mapping], opts[:aln] abort_unless opts[:entropy_cutoff] >= 0, "--entropy-cutoff must be >= 0" abort_unless opts[:clade_size_cutoff] >= 1, "--clade-size-cutoff must be >= 1" FileUtils.mkdir_p opts[:outdir] AbortIf.logger.info { "Parsing input files" } tree = NewickTree.fromFile opts[:tree] leaf2attrs = TreeClusters.read_alignment opts[:aln] members_fname = File.join opts[:outdir], "#{opts[:base]}.tree_clusters.clade_members.txt" key_cols_fname = File.join opts[:outdir], "#{opts[:base]}.tree_clusters.key_cols.txt" annotated_tree_fname = File.join opts[:outdir], "#{opts[:base]}.tree_clusters.annotated_tree.txt" clade_members_f = File.open(members_fname, "w") key_cols_f = File.open(key_cols_fname, "w") annotated_tree_f = File.open(annotated_tree_fname, "w") key_col_sets = {} clade_sizes = {} clade_count = TreeClusters.all_clades(tree).count change_these_names = Set.new def all_clades_helper tree TreeClusters.all_clades(tree).sort_by { |cl| cl.all_leaves.count }.reverse end AbortIf.logger.info { "Processing clades" } begin all_clades_helper(tree).each_with_index do |clade, idx| if ((idx + 1) % 100).zero? perc = ((idx + 1) / clade_count.to_f * 100).round 2 STDERR.printf("Processing clades: #{perc}%\r") end clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}" clade_sizes[clade_id] = clade.all_leaves.count clade_members_f.puts [clade_id, clade.all_leaves.count, clade.all_leaves].join "\t" key_cols_all_leaves = TreeClusters.low_ent_cols_with_bases clade.all_leaves, leaf2attrs, opts[:entropy_cutoff] unless key_col_sets.has_key? key_cols_all_leaves key_col_sets[key_cols_all_leaves] = Set.new [clade_id] end key_col_sets[key_cols_all_leaves] << clade_id end AbortIf.logger.info { "Writing results" } # We only want key column sets that are unique to a single clade. key_col_sets.select { |_, clades| clades.count == 1 }.each do |kc_set, clades| clade_id = clades.first # TODO should we just skip processing clades that are too small rather than just not printing them out? if clade_sizes[clade_id] >= opts[:clade_size_cutoff] change_these_names << clade_id key_cols_f.puts [clade_id, kc_set.count, kc_set.map { |pos, bases| "#{pos}-#{bases.join}" } ].join "\t" end end AbortIf.logger.info { "Annotating tree" } all_clades_helper(tree).each_with_index do |clade, idx| clade_id = "clade_#{idx + 1}___#{clade.name.tr("'", "_")}" if change_these_names.include? clade_id # This will change the node in the original NewickTree clade.node.name = "'#{clade_id}'" end end annotated_tree_f.puts tree.to_s.sub(/;+$/, ";") ensure clade_members_f.close key_cols_f.close annotated_tree_f.close end