#!/usr/bin/ruby -w require 'spec_id' require 'roc' require 'generator' require 'optparse' ################################################ $AREAS_ONLY = false ################################################ opts = OptionParser.new do |op| op.banner = "usage: #{File.basename(__FILE__)} prefix bioworks.xml" op.separator "" op.separator "takes Bioworks 3.2 xml output files (with probabilities)" op.separator "rank orders the probabilities and outputs num hits and precision" op.separator "Also takes gzipped (xml.gz) files labeled as such" op.separator "" op.separator "Outputs a comma separated value to STDOUT (.csv)" op.separator "" op.separator "To capture:" op.separator " #{File.basename(__FILE__)} bioworks.xml > out.csv" op.on("-a", "--area", "outputs the area under the curve instead") do |v| $AREAS_ONLY = true end end opts.parse! if ARGV.size < 2 puts opts exit end fp_prefix = ARGV[0] file = ARGV[1] obj = SpecID.new(file) re_prefix = /^#{Regexp.escape(fp_prefix)}/o prc = proc {|it| it.prots.first.reference =~ re_prefix } #(match, nomatch) = obj.classify(:peps, prc) obj.peps = obj.pep_prots (fp, tp) = obj.classify(:peps, prc) #puts fp.size.to_s #puts tp.size.to_s fp_obj = SpecID.new fp_obj.peps = fp tp_obj = SpecID.new tp_obj.peps = tp two_lists = [tp_obj, fp_obj].map do |obj| list = [] list.push( obj.pep_probs_by_pep_prots ) list.push( obj.pep_probs_by_bn_seq_charge ) # These each have a by_min and a by_top10 list.push(*( obj.pep_probs_by_bn_scan ) ) list.push(*( obj.pep_probs_by_bn_scan_charge ) ) list end headings = ["PepProts", "SeqCharge", "Scan(TopHit)", "Scan(Top10)", "ScanCharge(TopHit)", "ScanCharge(Top10)"] csv_headings = [] headings.each do |head| csv_headings << head + ": NH" csv_headings << head + ": PR" end pairs = two_lists[0].zip two_lists[1] roc = DecoyROC.new x_y= [] area_under_curve = [] #start_x = [] #end_x = [] pairs.each do |pair| #x,y = roc.pred_and_tps_and_ppv(pair[0], pair[1]) (num_hits, tps, ppv) = roc.pred_and_tps_and_ppv(pair[0], pair[1]) x = num_hits y = ppv if $AREAS_ONLY x.unshift 0 y.unshift 1.0 area_under_curve << roc.area_under_curve(x,y) #start_x << x.first #end_x << x.last else x_y.push(x, y) # <- normal output end end if $AREAS_ONLY headings.unshift "Filename" puts headings.join(" ") area_under_curve.unshift file puts area_under_curve.join(" ") #puts start_x.join(" ") #puts end_x.join(" ") exit ### <-------------- ABORT HERE end # X axis is the number of peptides id# (i.e., # of peps in TP db) # Y axis is the precision = TP/(TP+FP) puts "# NH = number of hits" puts "# TP = true positives" puts "# FP = false positives" puts "# PR = precision = TP/(TP+FP)" puts csv_headings.join(",") SyncEnumerator.new(*x_y).each do |row| #items_as_string = row.collect do |item| # sprintf("%.18f", item) #end ## THIS IS THE NORMAL OUTPUT: puts row.join(", ") #puts items_as_string.join(", ") end =begin files = ARGV.to_a two_lists = files.collect do |file| obj = Bioworks.new(file) list = [] list.push( obj.pep_probs_by_pep_prots ) list.push( obj.pep_probs_by_seq_charge ) # These each have a by_min and a by_top10 list.push(*( obj.pep_probs_by_scan ) ) list.push(*( obj.pep_probs_by_scan_charge ) ) list end headings = ["PepProts: TP", "PepProts: PR", "SeqCharge: TP", "SeqCharge: PR", "Scan(TopHit): TP", "Scan(TopHit): PR", "Scan(Top10): TP", "Scan(Top10): PR", "ScanCharge(TopHit): TP", "ScanCharge(TopHit): PR", "ScanCharge(Top10): TP", "ScanCharge(Top10): PR"] pairs = two_lists[0].zip two_lists[1] roc = ROC.new x_y= [] pairs.each do |pair| x,y = roc.tps_and_precision(pair[0], pair[1]) x_y.push(x, y) end # X axis is the number of peptides id# (i.e., # of peps in TP db) # Y axis is the precision = TP/(TP+FP) puts "# TP = true positives" puts "# FP = false positives" puts "# PR = precision = TP/(TP+FP)" puts headings.join(",") SyncEnumerator.new(*x_y).each do |row| #items_as_string = row.collect do |item| # sprintf("%.18f", item) #end puts row.join(", ") #puts items_as_string.join(", ") end =end