lib/big_simon/runners.rb in big_simon-0.1.1 vs lib/big_simon/runners.rb in big_simon-0.2.0

- old
+ new

@@ -1,7 +1,8 @@ require "tempfile" require "parse_fasta" +require "parallel" module BigSimon class Runners # @note To match the other things, you'd like them to be key'd on the file name. @@ -50,16 +51,19 @@ end vir_f.fsync host_f.fsync - cmd = "mummer -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}" + # -k 3 index every third position in reference (broken now, bug in mummer) + # -n -k 3 -threads 3 + # -n match only A C T G + cmd = "mummer -n -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}" Process.run_and_time_it! "MUMMER", cmd end end - virus = nil + virus = nil overall_max_score = 0 File.open(mummer_outfname, "rt").each_line.with_index do |line, idx| line.chomp! unless line.empty? @@ -73,11 +77,11 @@ # hit_table[virus] = {} # end else ary = line.strip.split " " - host = ary[0].sub(/___reverse$/, "").strip + host = ary[0].sub(/___reverse$/, "").strip score = ary[3].to_i Rya::AbortIf.assert hit_table[virus].has_key?(host) # unless hit_table[virus].has_key? host @@ -86,11 +90,11 @@ # We only want the longest hit. hit_table[virus][host] = score if score > hit_table[virus][host] # Track the overall max for scaling. - overall_max_score = score if score > overall_max_score + overall_max_score = score if score > overall_max_score end end end results_table = {} @@ -201,10 +205,125 @@ end results end + # For scoring homology-ness, I just sum the bitscore for all significant hits for all genomes. + # + # @note I will make the specified outdir if it doesn't exist. + # @note Assumes that the files end with *.fa + # @note Assumes that the file names match the IDs. This SHOULD be taken care of by the big_simon program. + # @todo assert that fname thing matches sequence ID name. + def self.homology vir_dir, host_dir, outdir, threads + FileUtils.mkdir_p outdir + + host_orfs = File.join outdir, "host_orfs.homology" + host_orfs_blast_db = host_orfs + ".blast_db.homology" + + # Call ORFs on Hosts + cmd = "cat #{host_dir}/*.fa | #{BigSimon::PRODIGAL} " \ + "-d #{host_orfs} " \ + "> /dev/null" + + Process.run_and_time_it! "Predicting host ORFs", cmd + + # Make blast db's for the host genes. + cmd = "#{BigSimon::MAKEBLASTDB} " \ + "-in #{host_orfs} " \ + "-out #{host_orfs_blast_db} " \ + "-dbtype nucl" + + Process.run_and_time_it! "Making host blast db", cmd + + vir_genome_fnames = Dir.glob(vir_dir + "/*.fa") + + blast_info = Parallel.map(vir_genome_fnames, in_processes: threads) do |vir_genome_fname| + vir_orfs = File.join outdir, File.basename(vir_genome_fname) + ".vir_orfs.homology" + blast_results = File.join outdir, File.basename(vir_genome_fname) + ".blast_results.homology" + + # this will be used as a viral ID. + vir_simple_fname = File.basename vir_genome_fname, ".fa" + blast_table = {} + blast_table[vir_simple_fname] = Hash.new 0 + + # Call ORFs on the virus. + cmd = "#{BigSimon::PRODIGAL} " \ + "-d #{vir_orfs} -p meta -i #{vir_genome_fname} " \ + "> /dev/null" + + Process.run_and_time_it! "Predicting ORFs for #{File.basename vir_genome_fname}", cmd + + # Blast the ORFs against genomes. + cmd = "#{BigSimon::BLASTN} -query #{vir_orfs} -db #{host_orfs_blast_db} -outfmt 6 -evalue 0.01 -word_size 11 -out #{blast_results}" + Process.run_and_time_it! "Blasting ORFs for #{File.basename vir_genome_fname}", cmd + + # Remove ORFs file + FileUtils.rm vir_orfs if File.exist? vir_orfs + + Rya::AbortIf.logger.info { "Parsing #{blast_results}" } + # Parse the blast. + + File.open(blast_results, "rt").each_line do |line| + ary = line.chomp.split "\t" + + # The .sub() is to remove the annotation that prodigal gives. + vir_id = ary[0].sub(/_[0-9]+$/, "") + host_id = ary[1].sub(/_[0-9]+$/, "") + score = ary[11].to_f + + Rya::AbortIf.assert blast_table.has_key?(vir_id), "blast_table: got #{vir_id} should have been #{vir_simple_fname}" + + blast_table[vir_id][host_id] += score + end + + # Remove blast file + # FileUtils.rm_r blast_results if File.exist? blast_results + + # Again, we're assuming the input is .fa, which the big_simon program SHOULD ensure. TODO check these things with assertions. + simple_vir_name = File.basename vir_genome_fname.sub(/.fa$/, "") + + [simple_vir_name, blast_table] + end + + collated_blast_table = {} + host_simple_names = Dir.glob(host_dir + "/*.fa").map { |fname| File.basename(fname, ".fa") } + + Rya::AbortIf.assert host_simple_names.length == host_simple_names.uniq.length, "host simple names are not unique" + + Rya::AbortIf.logger.info { "Collating blast results" } + + # Get max score + max_score = -1 + blast_info.each do |_, blast_table| + blast_table.each do |vir_id, host_scores| + this_max = host_scores.values.max || -1 # sometimes there are no hits at all + + max_score = this_max if this_max > max_score + end + end + Rya::AbortIf.assert max_score > -1, "didn't get any scores" + + + klass = Class.new.extend Rya::CoreExtensions::Math + blast_info.each do |simple_vir_name, blast_table| + blast_table.each do |vir_id, host_scores| + collated_blast_table[vir_id] = [] + + host_simple_names.each do |host_id| + scaled_score = klass.scale host_scores[host_id], 0, max_score, 1, 0 + + host_table = { host: host_id, score: host_scores[host_id], scaled_score: scaled_score } + collated_blast_table[vir_id] << host_table + end + end + end + + pp collated_blast_table + + collated_blast_table + end + def self.vir_host_matcher exe, vir_dir, host_dir, outdir FileUtils.mkdir_p outdir cmd = "python #{exe} " \ "-v #{vir_dir} " \ @@ -271,16 +390,16 @@ extname = File.extname in_fname basename = File.basename in_fname, extname out_fname = File.join outdir, "#{basename}.heatmap.pdf" - [in_fname, out_fname] + [File.absolute_path(in_fname), File.absolute_path(out_fname)] end rcode_str = BigSimon::Utils.rcode fnames - Object::Tempfile.open do |f| + Object::File.open(File.join(outdir, "RCODE.r"), "w") do |f| f.puts rcode_str f.fsync # ensure no data is buffered cmd = "#{exe} #{f.path}"