#!/usr/bin/env ruby Signal.trap("PIPE", "EXIT") require "pp" require "tempfile" require "parse_fasta" require "trollop" require "big_simon" # TODO make scaled scores with high score being better. Process.extend Rya::CoreExtensions::Process opts = Trollop.options do version BigSimon::VERSION_BANNER banner <<-EOS #{BigSimon::VERSION_BANNER} Hi, I'm BigSimon! I'm here to help you figure out the hosts for your viruses. I run a bunch of different programs. In addition to doing some merging of results, I'll give you heatmaps for all the programs and you can check for yourself. The scaled scores run from 0 to 1 with lower scores being better. Options: EOS opt :viruses, "Path to fasta file(s) with viruses", type: :strings opt :hosts, "Path to fasta file(s) with hosts", type: :strings opt :outdir, "Output directory", default: "big_simon" opt :threads, "Number of threads to use", default: 1 end Rya::AbortIf.logger.debug { "Command line opts: #{opts.inspect}" } BigSimon::Utils.check_opt! opts, :viruses BigSimon::Utils.check_opt! opts, :hosts # Check infiles [opts[:viruses], opts[:hosts]].flatten.each do |fname| BigSimon::Utils.check_file! fname end Rya::AbortIf.abort_unless opts[:threads] > 0, "--threads must be > 0" programs = [ "WIsH", "VirHostMatcher", "mummer" ] outdir = opts[:outdir] threads = opts[:threads] virus_fnames = opts[:viruses] host_fnames = opts[:hosts] FileUtils.mkdir_p outdir tmpdir = File.join opts[:outdir], "big_simon_tmp" tmpdir_virus = File.join tmpdir, "virus" tmpdir_host = File.join tmpdir, "host" # all_predictions_fname = File.join outdir, "scores_all.txt" mean_scaled_scores_fname = File.join outdir, "scores_scaled.mean.txt" virus_recs, host_recs = [], [] # Tempfile.open do |vir_f| # Tempfile.open do |host_f| # virus_fnames.each do |fname| # ParseFasta::SeqFile.open(fname).each_record do |rec| # vir_f.puts rec # # vir_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}" # end # end # # host_fnames.each do |fname| # ParseFasta::SeqFile.open(fname).each_record do |rec| # host_f.puts rec # host_f.puts ">#{rec.id}___reverse\n#{rec.seq.reverse}" # end # end # # vir_f.fsync # host_f.fsync # # cmd = "mummer -maxmatch -l 15 #{host_f.path} #{vir_f.path} > /Users/moorer/Desktop/mummer.OUT" # Process.run_and_time_it! "MUMMER", cmd # end # end # # header = nil # hits = [] # hit_info = {} # virus = nil # # File.open("/Users/moorer/Desktop/mummer.OUT", "rt").each_line.with_index do |line, idx| # if line.start_with? '>' # virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip # # unless hit_info.has_key? virus # hit_info[virus] = {} # end # else # host, _, _, len = line.chomp.strip.split(" ") # host = host.sub(/___reverse$/, "").strip # # unless hit_info[virus].has_key? host # hit_info[virus][host] = -1 # end # # hit_info[virus][host] = len.to_i if len.to_i > hit_info[virus][host] # end # end # # puts # # hh = hit_info.map do |virus, info| # [virus, info.to_a.sort_by {|gen, len| len}.reverse] # end # # pp hh # hh = hit_info.map do |virus, info| # [virus, info.to_a.sort_by { |host, hit_len| hit_len }.reverse # # end # p hit_info scores_files = {} programs.each do |program| raw_fname = File.join outdir, "scores_raw.#{program}.txt" scaled_fname = File.join outdir, "scores_scaled.#{program}.txt" scores_files[program] = { raw: File.open(raw_fname, "w"), scaled: File.open(scaled_fname, "w") } end scores_files.each do |program, files| files.each do |name, file| file.puts %w[virus host score].join "\t" end end name_map_virus, all_ids_virus = BigSimon::Utils.set_up_tmp_dirs virus_fnames, tmpdir_virus, "virus" name_map_host, all_ids_host = BigSimon::Utils.set_up_tmp_dirs host_fnames, tmpdir_host, "host" wish_outf = BigSimon::Runners.wish BigSimon::WISH, tmpdir_virus, tmpdir_host, tmpdir, threads vhm_outf = BigSimon::Runners.vir_host_matcher BigSimon::VHM, tmpdir_virus, tmpdir_host, tmpdir host_info_mummer = BigSimon::Runners.mummer BigSimon::MUMMER, tmpdir_virus, tmpdir_host, tmpdir, threads # Map them back to simple names. TODO just have it spit these out from the beginning. host_info_mummer_simple_names = {} inverted_name_map_virus = name_map_virus.invert inverted_name_map_host = name_map_host.invert host_info_mummer.each do |virus, host_tables| virname = virus if inverted_name_map_virus.has_key? virus virname = inverted_name_map_virus[virus] end host_info_mummer_simple_names[virname] = [] host_tables.map do |table| hostname = inverted_name_map_host.has_key?(table[:host]) ? inverted_name_map_host[table[:host]] : table[:host] new_table = { host: hostname, score: table[:score], scaled_score: table[:scaled_score] } host_info_mummer_simple_names[virname] << new_table end end host_info_wish = BigSimon::Parsers.wish wish_outf host_info_vhm = BigSimon::Parsers.vir_host_matcher vhm_outf host_info_simple_names = BigSimon::Pipeline.collate_host_results [host_info_wish, host_info_vhm, host_info_mummer_simple_names], programs host_info = BigSimon::Pipeline.map_taxa host_info_simple_names, name_map_virus, name_map_host # puts # pp host_info # puts # Just a basic all info file # File.open all_predictions_fname, "w" do |f| # f.puts %w[virus host program score scaled.score].join "\t" host_info.each do |virus, h1| h1.each do |host, h2| lines = {} h2[:scores].each do |program, score| lines[[virus, host, program]] = [score] scores_files[program][:raw].puts [virus, host, score].join "\t" end # Add in the scaled score too. h2[:scaled_scores].each do |program, score| lines[[virus, host, program]] << score scores_files[program][:scaled].puts [virus, host, score].join "\t" end # lines.each do |(virus, host, program), (score, scaled_score)| # f.puts [virus, host, program, score, scaled_score].join "\t" # end end end # end # A file with mean scaled scores. File.open mean_scaled_scores_fname, "w" do |f| f.puts %w[virus host score].join "\t" host_info.each do |virus, h1| h1.each do |host, h2| scaled_scores = h2[:scaled_scores].values mean_scaled_score = scaled_scores.reduce(:+) / scaled_scores.length.to_f f.puts [virus, host, mean_scaled_score].join "\t" end end end scores_files.each do |program, file| file.values.map(&:close) end FileUtils.rm_r tmpdir # Make the heatmaps BigSimon::Runners.heatmaps BigSimon::RSCRIPT, outdir, File.join(outdir, "heatmaps")