# for false_positive_rate:
require 'optparse'
require 'ostruct'
require 'generator'
require 'gnuplot'
require 'roc'
class String
def margin
self.gsub(/^\s*\|/,'')
end
end
class SpecID
class FalsePositiveRate
module HTML
# html and body tags
def html
"|
|#{yield}
|\n".margin
end
def body
"|
| #{yield}
|\n".margin
end
def header
"|
| #{style}
|\n".margin
end
def td
"#{yield} | "
end
def style
'
'
end
def table
"|\n".margin
end
def tr
"|
| #{yield}
|
\n".margin
end
end # module HTML
end # class FalsePositiveRate
end #class SpecID
class SpecID
class FalsePositiveRate
###########################################################
# GLOBAL SETTINGS:
DEF_PREFIX = "INV_"
DATA_PREC = 4 # decimal places of precision or fpr data
STDOUT_JTPLOT_BASE = "fpr" # if there is no outfile
###########################################################
include SpecID::FalsePositiveRate::HTML
## returns either an ascii string (if file_as_decoy) or
## returns an html string
def false_positive_rate(argv)
opt = parse_args(argv)
files = argv.to_a
file_as_decoy = false
if File.exist? opt.f
file_as_decoy = true
out_string = file_as_decoy(files, opt)
else
out_string = prefix_as_decoy(files, opt)
end
return [out_string, opt, file_as_decoy]
end
def run_cmd_line(argv)
output_string, opt, file_as_decoy = false_positive_rate(argv)
if file_as_decoy
puts output_string
else
## open file and write to it..
if opt.o == 'STDOUT'
print output_string
else
File.open(opt.o,'w') do |fh| fh.print output_string end
end
end
end
# returns the outfile with no extension
def outfile_noext(opt)
if opt == 'STDOUT'
"#{STDOUT_JTPLOT_BASE}"
else
opt.sub(/#{Regexp.escape(File.extname(opt))}$/, '')
end
end
def file_noext(file)
file.sub(/#{Regexp.escape(File.extname(file))}$/, '')
end
def parse_args(argv)
opt = OpenStruct.new
opt.f = DEF_PREFIX
opt.o = 'STDOUT'
opts = OptionParser.new do |op|
op.banner = "Usage: #{File.basename(__FILE__)} [options] bioworks.xml|proph-prot.xml ..."
op.separator ""
op.separator "Abbreviations and Definitions:"
op.separator " TP = True Positives"
op.separator " FP = False Positive"
op.separator " FPR = False Positive Rate = [FP/(TP+FP)] (between 0 and 1)"
op.separator " FPR2 = Gygi's estimation of FPR = [2*FPR]"
op.separator " Precision = [TP/(TP+FP)]"
op.separator ""
op.separator "Output: "
op.separator " 1. Decoy as separate search: FPR to STDOUT"
op.separator " 2. Decoy proteins from concatenated database: 'fpr.html'"
op.separator ""
op.separator "Options:"
op.on("-f", "--fp_data ", "PREFIX (def: #{DEF_PREFIX}) -or- decoy FILE") {|v| opt.f = v }
op.separator ""
op.separator " If searched with a concatenated DB, give a PREFIX to decoy proteins."
op.separator " If files have different prefixes, separate with commas."
op.separator " If searched with a separate decoy DB, give the FILE name of decoy data"
op.separator ""
op.on("-g", "--gygi", "also show Gygi's estimate of FPR (2*FPR)") {|v| opt.g = v}
## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins")
op.on("-p", "--prec", "also show precision (TP/(TP+FP))") {|v| opt.p = v}
op.on("-n", "--nofpr", "don't show FPR") {|v| opt.n = v}
op.separator ""
op.on("-o", "--outfile ", "write output to file (def: #{opt.o})") {|v| opt.o = v}
op.on("-a", "--area", "output area under the curve instead of the plot") {|v| opt.a = v}
op.on_tail("
Examples:
1. For a search on a concatenated database where the decoy proteins have
been flagged with the prefix 'INV_' for both Bioworks and ProteinProphet
output:
#{File.basename(__FILE__)} -f INV_ bioworks.xml proph-prot.xml
2. To determine the false positive rate of a search (and fpr2 and precision)
using a normal and decoy database search, filter both the normal and decoy
datasets identically, export to xml and run like this (only works for
Bioworks xml export):
#{File.basename(__FILE__)} -tp -f decoy_bioworks.xml bioworks.xml
")
end
opts.parse!(argv)
if argv.size < 1
puts opts
exit
end
opt
end
# takes a comma separated list and extends the last to create an array of
# desired size
def prefixes(arg, desired_size)
arg_arr = arg.split(',')
new_arr = []
last_arg = arg_arr[0]
desired_size.times do |i|
if arg_arr[i]
new_arr[i] = arg_arr[i]
last_arg = new_arr[i]
else
new_arr[i] = last_arg
end
end
new_arr
end
## collapses arrays to one level deep so we can sync them up
def arrays_to_one_level_deep(all_arrs)
mostly_flat = []
all_arrs.each do |per_file|
per_file.each do |per_style|
mostly_flat << per_style[0]
mostly_flat << per_style[1]
end
end
mostly_flat
end
# prints rows and th for the data
def table_cells(all_arrs, key)
## columns specific headings:
all_string = ""
all_string << tr do
line = ""
key.each do |per_file|
per_file.each do |per_ds|
line << "#{per_ds[1][0]} | #{per_ds[1][1]} | "
end
end
line
end
mostly_flat = arrays_to_one_level_deep(all_arrs)
SyncEnumerator.new(*mostly_flat).each do |row|
all_string << tr do
string = row.map {|it|
sty="%d"
if it.class == Float ; sty="%.#{DATA_PREC}f" end
td{ sprintf(sty,it)}
}.join
end
end
all_string
end
def html_table_output(all_arrs, key, files, filename_noext)
num_datasets_per_file = all_arrs.first.size
num_cols_per_dataset = 2
big_colspan = num_datasets_per_file * num_cols_per_dataset
output = table do
tr do
files.map do |file|
"#{file} | "
end.join
end +
tr do
key.map do |arr|
arr.map do |ds|
"#{ds.first} | "
end
end
end +
table_cells(all_arrs, key)
end
"" + output + "
"
end
def y_axis_label(key)
## We only take the keys for the first file, as it's assumed that the major
## labels will be identical for all of them
labels = key.first.map {|tp| tp.first }
labels.join " | "
end
# escapes any ' chars
def escape_to_gnuplot(string)
# long way, but it works.
new_string = ""
string.split(//).each do |chr|
if chr == "'" ; new_string << "\\" end
new_string << chr
end
new_string
end
## outputs a .toplot file based on filename_noext, creates a png file, and
## writes html to fh that will load the png file up
## This is a self contained module that can be swapped out for a
## completely different plotting program if desired.
def plot_figure(all_arrs, key, files, filename_noext)
## CREATE the PLOT IMAGE:
to_plot = filename_noext+'.toplot'
png = filename_noext+'.png'
Gnuplot.open do |gp|
Gnuplot::Plot.new( gp ) do |plot|
plot.terminal "png noenhanced"
plot.output png
plot.title "Classification Analysis"
plot.xlabel 'Num True Positives'
plot.ylabel escape_to_gnuplot(y_axis_label(key))
plot.style "line 1 lt 1"
plot.style "line 2 lt 12"
#plot.style "line 1 lt 1 lw #{opts.lw} pt 7 ps #{opts.ps}",
plot.yrange "[-0.05:#{1.0 + 0.2*files.size}]"
files.each_with_index do |file,i|
key[i].each_with_index do |k,j|
plot.data << Gnuplot::DataSet.new( [ all_arrs[i][j][0], all_arrs[i][j][1] ] ) do |ds|
ds.with = "lines"
ds.title = escape_to_gnuplot("#{file}: #{k[1][1]}")
end
end
end
end
end
=begin
## CREATE the PLOT IMAGE:
to_plot = filename_noext+'.toplot'
png = filename_noext+'.png'
File.open(to_plot,'w') do |out|
out.puts 'XYData'
out.puts filename_noext
out.puts "Classification Analysis"
out.puts 'Num True Positives'
out.puts escape_to_gnuplot(y_axis_label(key))
files.each_with_index do |file,i|
#p key[i]
#p all_arrs[i]
key[i].each_with_index do |k,j|
out.puts(escape_to_gnuplot("#{file}: #{k[1][1]}"))
out.puts all_arrs[i][j][0].join(' ')
out.puts all_arrs[i][j][1].join(' ')
end
end
end
num_files = files.size
if $".include? 'plotter.rb'
cmd = "#{to_plot} --yrange n0.05:#{1.0 + 0.2*num_files} --noenhanced -w l"
plot_cmd = "plot.rb #{cmd}"
Plotter.new.plot_string "#{to_plot} --yrange n0.05:#{1.0 + 0.2*num_files} --noenhanced -w l"
unless File.file? png
abort "Fatal Error in plotting cmd=\"#{plot_cmd}\":\n#{reply}"
end
else
warn "plotter.rb not found, not png plot image available"
end
=end
## CREATE the HTML to load the plot:
basename_filename_noext = File.basename(filename_noext)
output = "\n"
#output << "Additional views of this data may be obtained by using the plot.rb command on '#{to_plot}' (type plot.rb for more details). Plot generated with command: #{plot_cmd}\n"
output << " |
\n"
output << "
\n"
output
end # plot_figure
def file_as_decoy(files, opt)
bio = SpecID::Bioworks.new
puts "Calculating false positive rates using '#{opt.f}' as decoy ..."
fps = bio.num_prots(opt.f)
out = ""
files.each do |file|
tps = bio.num_prots(file)
out << "*****************************************************\n"
out << sprintf("%-36s # TP : #{tps}\n", file)
out << sprintf("%-36s # FP : #{fps}\n", opt.f)
out << sprintf(" False Positive Rate [FP/(TP+FP)] : %.3f\n", fps.to_f/(tps+fps)) unless opt.n
out << sprintf(" Gygi's False Positive Rate 2*[FP/(TP+FP)] : %.3f\n", 2.0*fps/(tps+fps)) if opt.g
out << sprintf(" Precision [TP/(TP+FP)] : %.3f\n", tps.to_f/(tps+fps)) if opt.p
out << "*****************************************************\n"
end
out
end
def prefix_as_decoy(files, opt)
#puts "Calculating false positive rates using prefix #{opt.f} ..."
prefix_arr = prefixes(opt.f, files.size)
all_arrs = []
key = []
out_noext = outfile_noext(opt.o)
files.each_with_index do |file,i|
all_arrs[i] = []
key[i] = []
sp = SpecID.new(file)
#headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"]
(tp, prec, fpr2) = sp.tps_and_precision_and_fpr2_times2_for_prob(prefix_arr[i])
if opt.g
all_arrs[i] << [tp,fpr2]
key[i] << ["Gygi FPR", ["#TP", "Gygi FPR = 2*FP/(TP+FP)"]]
end
if opt.p
all_arrs[i] << [tp,prec]
key[i] << ["Prec", ["#TP", "Prec = TP/(TP+FP)"]]
end
unless opt.n
## Add the fpr datasets
fpr = fpr2.map {|v| v/2.0}
all_arrs[i] << [tp,fpr]
key[i] << ["FPR", ["#TP", "FPR = FP/(TP+FP)"]]
end
end
string = ''
if opt.a
roc = ROC.new
#string << "***********************************************************\n"
#string << "AREA UNDER CURVE:\n"
key.each_with_index do |file,i|
string << "#{files[i]} (area under curve)\n"
key[i].each_index do |j|
string << "#{key[i][j][0]} [#{ key[i][j][1]}]:\t"
tps = all_arrs[i][j][0]
oth = all_arrs[i][j][1]
string << roc.area_under_curve(tps, oth).to_s << "\n"
end
end
#string << "***********************************************************\n"
else
string = html do
header +
body do
plot_figure(all_arrs, key, files, out_noext) +
html_table_output(all_arrs, key, files, out_noext)
end
end
end
string
end
end # class FalsePositiveRate
end # class SpecID