#!/usr/bin/env ruby # require 'rraxml' require 'rnewick' require 'rphylip' require 'logger' class CycleController def initialize(opts) @opts = opts @opts[:num_ptrees] ||= @opts[:num_parsi_trees] @numtaxa, @seqlen = File.open(@opts[:phy]).readlines.first.split.map{|w| w.to_i} end def parsimonator_requirements bytes_inner = @numtaxa.to_f * @seqlen.to_f security_factor = 3.0 required_MB = bytes_inner * security_factor * 1E-6 required_MB = 16 unless required_MB > 16 puts required_MB required_MB.to_i.to_s end def raxmllight_requirements #(n-2) * m * ( 8 * 4 ) bytes_inner = @numtaxa.to_f * @seqlen.to_f * 8 * 4 security_factor = 1.3 required_MB = bytes_inner * security_factor * 1E-6 required_MB = 16 unless required_MB > 16 puts required_MB required_MB.to_i.to_s end def run_as_batch(script, logpath) raise "#{script} missing" unless File.exist?(script) #raise "#{logpath} missing" unless File.exist?(logpath) raise "User Number of parsimony trees not set" unless @opts[:num_parsi_trees] > 0 raise "Total Number of parsimony trees not set" unless @opts[:num_ptrees] > 0 opts = @opts[:iter].to_s opts += " " + File.expand_path(@opts[:phy]) opts += " " + parsimonator_requirements opts += " " + raxmllight_requirements opts += " " + @opts[:num_parsi_trees].to_s # -N parameter -p for suer opts += " " + @opts[:num_ptrees].to_s # total number of parsimony trees opts += " " + @opts[:num_bestML_trees].to_s opts += " " + @opts[:exp_name].to_s opts += " " + @opts[:base_dir].to_s puts "./#{script} #{opts}" system "./#{script} #{opts} >> #{logpath}" end end class TreeBunchStarter # Given an initial alignment, it creates a initial bunch of ML trees in bunch_0 dir # should log results attr_reader :cluster_batch def initialize(opts) @phylip = opts[:phylip] @base_dir = opts[:base_dir] @prev_dir = opts[:prev_dir] @update_id = opts[:update_id] || 0 @num_threads = opts[:num_threads] || 0 # create dirs if required @alignment_dir = File.join @base_dir, "alignments" @parsimony_trees_dir = File.join @base_dir, "parsimony_trees" @parsimony_trees_out_dir = File.join @parsimony_trees_dir, "output" @ml_trees_dir = File.join @base_dir, "ml_trees" @bestML_trees_dir = File.join @base_dir, "best_ml_trees" # the new phylip @phylip_updated = File.join @alignment_dir, "phy_#{@update_id.to_s}" # defaults @num_parsi_trees = 4 @num_bestML_trees = @num_parsi_trees / 2 @CAT_topology_bunch = File.join @ml_trees_dir, "CAT_topology_bunch.nw" @CAT_topology_bunch_order = File.join @ml_trees_dir, "CAT_topology_bunch_order.txt" @bestML_bunch = File.join @bestML_trees_dir, "best_bunch.nw" @prev_bestML_bunch = File.join @prev_dir, "best_ml_trees", "best_bunch.nw" unless @prev_dir.nil? @cluster_batch = opts[:cluster_batch] @logpath = File.join @base_dir, "starter.log" end def logput(msg, error = false) @logger ||= Logger.new(@logpath) if error @logger.error msg else @logger.info msg end puts msg end def ready? ready = true dirs = [@alignment_dir, @parsimony_trees_dir, @parsimony_trees_out_dir,@ml_trees_dir, @bestML_trees_dir] dirs.each do |d| if not File.exist?(d) FileUtils.mkdir_p d logput "Created #{d}" else logput "Exists #{d}" ready = false end end if @update_id == 0 FileUtils.cp @phylip, @alignment_dir else logput "Copying new update alignment (not expanding) from #{@phylip} to #{@phylip_updated}" FileUtils.cp @phylip, @phylip_updated end ready end def add_update(opts) check_options(opts) begin num_parsi_trees = opts[:num_parsi_trees] || @num_parsi_trees num_bestML_trees = opts[:num_bestML_trees] || @num_bestML_trees # prepare the parsimony starting trees raise "prev bunch not ready #{@prev_bestML_bunch}" unless File.exist?(@prev_bestML_bunch) last_best_bunch = NewickFile.new(@prev_bestML_bunch) # TODO this save_each_newick_as should be wrapped around sth. that considers a list of outliers and # prunes out the ouliers before saving as "prev_parsi_tree" to make it compatible with the next round last_best_bunch.save_each_newick_as(File.join(@parsimony_trees_dir, 'prev_parsi_tree'), "nw") prev_trees = Dir.entries(@parsimony_trees_dir).select{|f| f =~ /^prev_parsi_tree/} if num_bestML_trees > num_parsi_trees * prev_trees.size raise "#bestML trees (#{num_bestML_trees}) is too higher than trees from previous round" end unless @cluster_batch.empty? logput "Exp #{opts[:exp_name]}, your cluster will take care of this update no #{@update_id}. stay tuned" c = CycleController.new(:iter => @update_id, :phy => @phylip_updated, :num_parsi_trees => num_parsi_trees, :num_ptrees => num_parsi_trees * prev_trees.size, :num_bestML_trees => num_bestML_trees, :base_dir => @base_dir, :exp_name => opts[:exp_name] ) c.run_as_batch(@cluster_batch, @logpath) "cluster" else logput "****** Start update no #{@update_id} ********" logput "step 1 of 3 : Parsimony starting trees #{num_parsi_trees} each\n----" update_parsimony_trees(num_parsi_trees, prev_trees) # raxml light phase (2/3): use threads here? / each tree could be computed in paralell logput "step 2 of 3 : ML trees\n----" # generate_ML_trees generate_ML_trees(@parsimony_trees_out_dir, @phylip_updated) # raxml scoring of initial bunch (needs to be done after step 2, or not?) logput "step 3 of 3 : Score bunch of initial ML trees and select best #{num_bestML_trees}\n----" best_lh = score_ML_trees(num_bestML_trees, @phylip_updated) logput "Bunch of initial ML trees #{num_bestML_trees}, ready at #{@bestML_bunch}\n----" best_lh end rescue Exception => e logput(e, error = true) raise e end end def search_std(num_gamma_trees = nil) search_opts = { :phylip => @phylip, :outdir => @ml_trees_dir, :num_gamma_trees => num_gamma_trees || 1, :stderr => File.join(@ml_trees_dir, "err"), :stdout => File.join(@ml_trees_dir, "info"), :name => "std_GAMMA_search" } search_opts.merge!({:num_threads => @num_threads}) if @num_threads.to_i > 0 r = RaxmlGammaSearch.new(search_opts) logput "Start ML search from scratch with #{num_gamma_trees} trees" r.run bestLH = File.open(r.stdout).readlines.find{|l| l =~ /^Final GAMMA-based Score of best/}.chomp.split("tree").last logput "Done ML search from scratch with #{num_gamma_trees} trees" bestLH end def generate_initial_bunch(opts) check_options(opts) begin num_parsi_trees = opts[:num_parsi_trees] || @num_parsi_trees num_bestML_trees = opts[:num_bestML_trees] || @num_bestML_trees if num_bestML_trees > num_parsi_trees raise "#bestML trees (#{num_bestML_trees}) cant be higher than #parsi trees(#{num_parsi_trees})" end # phases 2 and 3 can be done in paralell unless @cluster_batch.empty? logput "your cluster will take care of this start #{@update_id}. Stay tuned" c = CycleController.new(:iter => 0, :phy => @phylip, :num_parsi_trees => num_parsi_trees, :num_bestML_trees => num_bestML_trees, :base_dir => @base_dir, :exp_name => opts[:exp_name] ) c.run_as_batch(@cluster_batch, @logpath) "cluster" else logput "Start generating initial bunch" logput "step 1 of 3 : Parsimony starting trees #{num_parsi_trees}\n----" generate_parsimony_trees(num_parsi_trees) # raxml light phase (2/3): use threads here? / each tree could be computed in paralell logput "step 2 of 3 : ML trees\n----" generate_ML_trees(@parsimony_trees_dir, @phylip) # raxml scoring of initial bunch (needs to be done after step 2, or not?) logput "step 3 of 3 : Score bunch of initial ML trees and select best #{num_bestML_trees}\n----" best_lh = score_ML_trees(num_bestML_trees, @phylip) logput "Bunch of initial ML trees #{num_bestML_trees}, ready at #{@bestML_bunch}\n----" best_lh end rescue Exception => e logput(e, error = true) raise e end end private def check_options(opts) supported_opts = [:num_parsi_trees, :num_bestML_trees, :exp_name, :cycle_batch_script] opts.keys.each do |key| unless supported_opts.include?(key) logput "Option #{key} is unknwon" end end end def generate_parsimony_trees(num_parsi_trees) logput "Starting parsimony with #{num_parsi_trees} trees" parsimonator_opts = { :phylip => @phylip, :num_trees => num_parsi_trees, :outdir => @parsimony_trees_dir, :stderr => File.join(@parsimony_trees_dir, "err"), :stdout => File.join(@parsimony_trees_dir, "info"), :name => "parsimony_initial" } parsi = Parsimonator.new(parsimonator_opts) logput "Start computing parsimony trees of initial bunch" parsi.run logput "Done with parsimony trees of initial bunch" end def update_parsimony_trees(num_parsi_trees, trees) trees.each_with_index do |parsi_start_tree, i| logput "Starting new parsimony tree with #{parsi_start_tree} trees" parsimonator_opts = { :phylip => @phylip_updated, :num_trees => num_parsi_trees, :newick => File.join(@parsimony_trees_dir, parsi_start_tree), :outdir => @parsimony_trees_out_dir, :stderr => File.join(@parsimony_trees_out_dir, "err_#{parsi_start_tree}"), :stdout => File.join(@parsimony_trees_out_dir, "info_#{parsi_start_tree}"), :name => "u#{@update_id}_#{parsi_start_tree}" } parsi = Parsimonator.new(parsimonator_opts) logput "Start computing parsimony trees of #{parsi_start_tree}, #{i+1} of #{trees.size}" parsi.run logput "run with options #{parsi.ops.to_s}" logput "Done with parsimony trees of #{parsi_start_tree}, #{i+1} of #{trees.size}" end end def generate_ML_trees(starting_trees_dir, phylip) starting_trees = Dir.entries(starting_trees_dir).select{|f| f =~ /^RAxML_parsimonyTree/} raise "no starting trees available" if starting_trees.nil? or starting_trees.size < 1 starting_trees.each_with_index do |parsimony_tree, i| # ideally we just submit here to the cluster...and start phase 3 when all are done tree_id = parsimony_tree.split("parsimonyTree.").last light_opts = { :phylip => phylip, :outdir => @ml_trees_dir, :flags => " -D ", # default to a RF convergence criterion :starting_newick => File.join(starting_trees_dir, parsimony_tree), :stderr => File.join(@ml_trees_dir, "err#{tree_id}"), :stdout => File.join(@ml_trees_dir, "info#{tree_id}"), :name => "starting_parsimony_tree_" + tree_id } light_opts.merge!({:num_threads => @num_threads}) if @num_threads.to_i > 0 r = RaxmlLight.new(light_opts) logput "Start ML search for #{parsimony_tree} (#{i+1} of #{starting_trees.size})" r.run logput "Done ML search for #{parsimony_tree} (#{i+1} of #{starting_trees.size})" # add the result to the bunch newick_str = NewickFile.new(File.join(r.outdir, "RAxML_result.#{r.name}")).newickStrings[0].str append_to_file(@CAT_topology_bunch, newick_str) append_to_file(@CAT_topology_bunch_order, r.name) end end def score_ML_trees(num_bestML_trees, phylip) logput "Starting scoring of ML trees" scorer_opts = { :phylip => phylip, :outdir => @ml_trees_dir, :starting_newick => @CAT_topology_bunch, :stderr => File.join(@ml_trees_dir, "err_scores"), :stdout => File.join(@ml_trees_dir, "info_scores"), :name => "SCORES" } scorer_opts.merge!({:num_threads => @num_threads}) if @num_threads.to_i > 0 scorer = GammaScorer.new(scorer_opts) scorer.run logput "Done scoring of ML trees, selecting the best #{num_bestML_trees}..." rank_file = File.join @ml_trees_dir, "RAxML_info.#{scorer.name}" lh_lines = File.open(rank_file).readlines.select{|l| l =~ /^[0-9]+ -[0-9]+.[0-9]+$/} rank_id = lh_lines.map{|l| l.split.first} best_lh = lh_lines.first.split.last newick_bunch = File.open(@CAT_topology_bunch).readlines File.open(@bestML_bunch, "w") do |f| rank_id[0...num_bestML_trees].each_with_index do |newick_id, i| f.puts newick_bunch[newick_id.to_i] logput "#{i+1}: Selected tree with id #{newick_id}" end end best_lh end def append_to_file(file, str) File.open(file, "a+") do |f| f.puts str end end end