lib/big_simon/runners.rb in big_simon-0.1.1 vs lib/big_simon/runners.rb in big_simon-0.2.0
- old
+ new
@@ -1,7 +1,8 @@
require "tempfile"
require "parse_fasta"
+require "parallel"
module BigSimon
class Runners
# @note To match the other things, you'd like them to be key'd on the file name.
@@ -50,16 +51,19 @@
end
vir_f.fsync
host_f.fsync
- cmd = "mummer -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
+ # -k 3 index every third position in reference (broken now, bug in mummer)
+ # -n -k 3 -threads 3
+ # -n match only A C T G
+ cmd = "mummer -n -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
Process.run_and_time_it! "MUMMER", cmd
end
end
- virus = nil
+ virus = nil
overall_max_score = 0
File.open(mummer_outfname, "rt").each_line.with_index do |line, idx|
line.chomp!
unless line.empty?
@@ -73,11 +77,11 @@
# hit_table[virus] = {}
# end
else
ary = line.strip.split " "
- host = ary[0].sub(/___reverse$/, "").strip
+ host = ary[0].sub(/___reverse$/, "").strip
score = ary[3].to_i
Rya::AbortIf.assert hit_table[virus].has_key?(host)
# unless hit_table[virus].has_key? host
@@ -86,11 +90,11 @@
# We only want the longest hit.
hit_table[virus][host] = score if score > hit_table[virus][host]
# Track the overall max for scaling.
- overall_max_score = score if score > overall_max_score
+ overall_max_score = score if score > overall_max_score
end
end
end
results_table = {}
@@ -201,10 +205,125 @@
end
results
end
+ # For scoring homology-ness, I just sum the bitscore for all significant hits for all genomes.
+ #
+ # @note I will make the specified outdir if it doesn't exist.
+ # @note Assumes that the files end with *.fa
+ # @note Assumes that the file names match the IDs. This SHOULD be taken care of by the big_simon program.
+ # @todo assert that fname thing matches sequence ID name.
+ def self.homology vir_dir, host_dir, outdir, threads
+ FileUtils.mkdir_p outdir
+
+ host_orfs = File.join outdir, "host_orfs.homology"
+ host_orfs_blast_db = host_orfs + ".blast_db.homology"
+
+ # Call ORFs on Hosts
+ cmd = "cat #{host_dir}/*.fa | #{BigSimon::PRODIGAL} " \
+ "-d #{host_orfs} " \
+ "> /dev/null"
+
+ Process.run_and_time_it! "Predicting host ORFs", cmd
+
+ # Make blast db's for the host genes.
+ cmd = "#{BigSimon::MAKEBLASTDB} " \
+ "-in #{host_orfs} " \
+ "-out #{host_orfs_blast_db} " \
+ "-dbtype nucl"
+
+ Process.run_and_time_it! "Making host blast db", cmd
+
+ vir_genome_fnames = Dir.glob(vir_dir + "/*.fa")
+
+ blast_info = Parallel.map(vir_genome_fnames, in_processes: threads) do |vir_genome_fname|
+ vir_orfs = File.join outdir, File.basename(vir_genome_fname) + ".vir_orfs.homology"
+ blast_results = File.join outdir, File.basename(vir_genome_fname) + ".blast_results.homology"
+
+ # this will be used as a viral ID.
+ vir_simple_fname = File.basename vir_genome_fname, ".fa"
+ blast_table = {}
+ blast_table[vir_simple_fname] = Hash.new 0
+
+ # Call ORFs on the virus.
+ cmd = "#{BigSimon::PRODIGAL} " \
+ "-d #{vir_orfs} -p meta -i #{vir_genome_fname} " \
+ "> /dev/null"
+
+ Process.run_and_time_it! "Predicting ORFs for #{File.basename vir_genome_fname}", cmd
+
+ # Blast the ORFs against genomes.
+ cmd = "#{BigSimon::BLASTN} -query #{vir_orfs} -db #{host_orfs_blast_db} -outfmt 6 -evalue 0.01 -word_size 11 -out #{blast_results}"
+ Process.run_and_time_it! "Blasting ORFs for #{File.basename vir_genome_fname}", cmd
+
+ # Remove ORFs file
+ FileUtils.rm vir_orfs if File.exist? vir_orfs
+
+ Rya::AbortIf.logger.info { "Parsing #{blast_results}" }
+ # Parse the blast.
+
+ File.open(blast_results, "rt").each_line do |line|
+ ary = line.chomp.split "\t"
+
+ # The .sub() is to remove the annotation that prodigal gives.
+ vir_id = ary[0].sub(/_[0-9]+$/, "")
+ host_id = ary[1].sub(/_[0-9]+$/, "")
+ score = ary[11].to_f
+
+ Rya::AbortIf.assert blast_table.has_key?(vir_id), "blast_table: got #{vir_id} should have been #{vir_simple_fname}"
+
+ blast_table[vir_id][host_id] += score
+ end
+
+ # Remove blast file
+ # FileUtils.rm_r blast_results if File.exist? blast_results
+
+ # Again, we're assuming the input is .fa, which the big_simon program SHOULD ensure. TODO check these things with assertions.
+ simple_vir_name = File.basename vir_genome_fname.sub(/.fa$/, "")
+
+ [simple_vir_name, blast_table]
+ end
+
+ collated_blast_table = {}
+ host_simple_names = Dir.glob(host_dir + "/*.fa").map { |fname| File.basename(fname, ".fa") }
+
+ Rya::AbortIf.assert host_simple_names.length == host_simple_names.uniq.length, "host simple names are not unique"
+
+ Rya::AbortIf.logger.info { "Collating blast results" }
+
+ # Get max score
+ max_score = -1
+ blast_info.each do |_, blast_table|
+ blast_table.each do |vir_id, host_scores|
+ this_max = host_scores.values.max || -1 # sometimes there are no hits at all
+
+ max_score = this_max if this_max > max_score
+ end
+ end
+ Rya::AbortIf.assert max_score > -1, "didn't get any scores"
+
+
+ klass = Class.new.extend Rya::CoreExtensions::Math
+ blast_info.each do |simple_vir_name, blast_table|
+ blast_table.each do |vir_id, host_scores|
+ collated_blast_table[vir_id] = []
+
+ host_simple_names.each do |host_id|
+ scaled_score = klass.scale host_scores[host_id], 0, max_score, 1, 0
+
+ host_table = { host: host_id, score: host_scores[host_id], scaled_score: scaled_score }
+ collated_blast_table[vir_id] << host_table
+ end
+ end
+ end
+
+ pp collated_blast_table
+
+ collated_blast_table
+ end
+
def self.vir_host_matcher exe, vir_dir, host_dir, outdir
FileUtils.mkdir_p outdir
cmd = "python #{exe} " \
"-v #{vir_dir} " \
@@ -271,16 +390,16 @@
extname = File.extname in_fname
basename = File.basename in_fname, extname
out_fname = File.join outdir, "#{basename}.heatmap.pdf"
- [in_fname, out_fname]
+ [File.absolute_path(in_fname), File.absolute_path(out_fname)]
end
rcode_str = BigSimon::Utils.rcode fnames
- Object::Tempfile.open do |f|
+ Object::File.open(File.join(outdir, "RCODE.r"), "w") do |f|
f.puts rcode_str
f.fsync # ensure no data is buffered
cmd = "#{exe} #{f.path}"