lib/big_simon/runners.rb in big_simon-0.1.0 vs lib/big_simon/runners.rb in big_simon-0.1.1
- old
+ new
@@ -1,12 +1,119 @@
require "tempfile"
+require "parse_fasta"
module BigSimon
class Runners
+ # @note To match the other things, you'd like them to be key'd on the file name.
+ def self.mummer exe, vir_dir, host_dir, outdir, threads
+ klass = Class.new.extend Rya::CoreExtensions::Math
+ FileUtils.mkdir_p outdir
+
+ mummer_outfname = File.join outdir, "mummer_out.txt"
+
+ virus_fnames = Dir.glob(vir_dir + "/*")
+ host_fnames = Dir.glob(host_dir + "/*")
+
+ hit_table = {}
+
+ Tempfile.open do |vir_f|
+ Tempfile.open do |host_f|
+ virus_fnames.each do |fname|
+ Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"
+
+ Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
+ # id needs to be the file name
+ new_id = File.basename fname.sub(/.fa$/, "")
+
+ hit_table[new_id] = {}
+
+ vir_f.puts ">#{new_id}\n#{rec.seq}"
+
+ vir_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
+ end
+ end
+
+ host_fnames.each do |fname|
+ Rya::AbortIf.assert fname.match(/.fa$/), "bad fname: #{fname}"
+
+ Object::ParseFasta::SeqFile.open(fname).each_record do |rec|
+ new_id = File.basename fname.sub(/.fa$/, "")
+
+ # Add this host to each virus in the hit_table
+ hit_table.each do |virus, host_table|
+ host_table[new_id] = 0 # set it to defualt score of 0
+ end
+
+ host_f.puts ">#{new_id}\n#{rec.seq}"
+ host_f.puts ">#{new_id}___reverse\n#{rec.seq.reverse}"
+ end
+ end
+
+ vir_f.fsync
+ host_f.fsync
+
+ cmd = "mummer -threads #{threads} -qthreads #{threads} -maxmatch -l 15 #{host_f.path} #{vir_f.path} > #{mummer_outfname}"
+ Process.run_and_time_it! "MUMMER", cmd
+ end
+ end
+
+ virus = nil
+ overall_max_score = 0
+ File.open(mummer_outfname, "rt").each_line.with_index do |line, idx|
+ line.chomp!
+
+ unless line.empty?
+ if line.start_with? ">"
+ virus = line.chomp.sub(/^>/, "").sub(/___reverse$/, "").strip
+
+ # It can be duplicated as there are forward and reverse for each sequence (in case they're contigs.)
+
+ Rya::AbortIf.assert hit_table.has_key?(virus)
+ # unless hit_table.has_key? virus
+ # hit_table[virus] = {}
+ # end
+ else
+ ary = line.strip.split " "
+
+ host = ary[0].sub(/___reverse$/, "").strip
+ score = ary[3].to_i
+
+ Rya::AbortIf.assert hit_table[virus].has_key?(host)
+
+ # unless hit_table[virus].has_key? host
+ # hit_table[virus][host] = -1
+ # end
+
+ # We only want the longest hit.
+ hit_table[virus][host] = score if score > hit_table[virus][host]
+
+ # Track the overall max for scaling.
+ overall_max_score = score if score > overall_max_score
+ end
+ end
+ end
+
+ results_table = {}
+
+ min, max, from, to = 0, overall_max_score, 1, 0
+
+ hit_table.each do |virus, host_table|
+ results_table[virus] = []
+
+ host_table.each do |host, score|
+ scaled_score = klass.scale score, min, max, from, to
+
+ results_table[virus] << { host: host, score: score, scaled_score: scaled_score }
+ end
+ end
+
+ results_table
+ end
+
# This one's a bit different as it parses as well and returns original names.
# @todo Also do the reverse of each genome in case it's a contig.
- def self.mummer exe, vir_dir, host_dir, outdir, threads
+ def self.mummer2 exe, vir_dir, host_dir, outdir, threads
klass = Class.new.extend Rya::CoreExtensions::Math
FileUtils.mkdir_p outdir
# TODO put these all in one file then do it?