# for false_positive_rate: require 'optparse' require 'ostruct' require 'generator' require 'gnuplot' require 'roc' class String def margin self.gsub(/^\s*\|/,'') end end class SpecID class FalsePositiveRate module HTML # html and body tags def html "| |#{yield} |\n".margin end def body "| | #{yield} |\n".margin end def header "| | #{style} |\n".margin end def td "#{yield}" end def style ' ' end def table "| | #{yield} |
\n".margin end def tr "| | #{yield} |\n".margin end end # module HTML end # class FalsePositiveRate end #class SpecID class SpecID class FalsePositiveRate ########################################################### # GLOBAL SETTINGS: DEF_PREFIX = "INV_" DATA_PREC = 4 # decimal places of precision or fpr data STDOUT_JTPLOT_BASE = "fpr" # if there is no outfile ########################################################### include SpecID::FalsePositiveRate::HTML ## returns either an ascii string (if file_as_decoy) or ## returns an html string def false_positive_rate(argv) opt = parse_args(argv) files = argv.to_a file_as_decoy = false if File.exist? opt.f file_as_decoy = true out_string = file_as_decoy(files, opt) else out_string = prefix_as_decoy(files, opt) end return [out_string, opt, file_as_decoy] end def run_cmd_line(argv) output_string, opt, file_as_decoy = false_positive_rate(argv) if file_as_decoy puts output_string else ## open file and write to it.. if opt.o == 'STDOUT' print output_string else File.open(opt.o,'w') do |fh| fh.print output_string end end end end # returns the outfile with no extension def outfile_noext(opt) if opt == 'STDOUT' "#{STDOUT_JTPLOT_BASE}" else opt.sub(/#{Regexp.escape(File.extname(opt))}$/, '') end end def file_noext(file) file.sub(/#{Regexp.escape(File.extname(file))}$/, '') end def parse_args(argv) opt = OpenStruct.new opt.f = DEF_PREFIX opt.o = 'STDOUT' opts = OptionParser.new do |op| op.banner = "Usage: #{File.basename(__FILE__)} [options] bioworks.xml|proph-prot.xml ..." op.separator "" op.separator "Abbreviations and Definitions:" op.separator " TP = True Positives" op.separator " FP = False Positive" op.separator " FPR = False Positive Rate = [FP/(TP+FP)] (between 0 and 1)" op.separator " FPR2 = Gygi's estimation of FPR = [2*FPR]" op.separator " Precision = [TP/(TP+FP)]" op.separator "" op.separator "Output: " op.separator " 1. Decoy as separate search: FPR to STDOUT" op.separator " 2. Decoy proteins from concatenated database: 'fpr.html'" op.separator "" op.separator "Options:" op.on("-f", "--fp_data ", "PREFIX (def: #{DEF_PREFIX}) -or- decoy FILE") {|v| opt.f = v } op.separator "" op.separator " If searched with a concatenated DB, give a PREFIX to decoy proteins." op.separator " If files have different prefixes, separate with commas." op.separator " If searched with a separate decoy DB, give the FILE name of decoy data" op.separator "" op.on("-g", "--gygi", "also show Gygi's estimate of FPR (2*FPR)") {|v| opt.g = v} ## NOT YET FUNCTIONAL: op.on("-e", "--peptides", "do peptides instead of proteins") op.on("-p", "--prec", "also show precision (TP/(TP+FP))") {|v| opt.p = v} op.on("-n", "--nofpr", "don't show FPR") {|v| opt.n = v} op.separator "" op.on("-o", "--outfile ", "write output to file (def: #{opt.o})") {|v| opt.o = v} op.on("-a", "--area", "output area under the curve instead of the plot") {|v| opt.a = v} op.on_tail(" Examples: 1. For a search on a concatenated database where the decoy proteins have been flagged with the prefix 'INV_' for both Bioworks and ProteinProphet output: #{File.basename(__FILE__)} -f INV_ bioworks.xml proph-prot.xml 2. To determine the false positive rate of a search (and fpr2 and precision) using a normal and decoy database search, filter both the normal and decoy datasets identically, export to xml and run like this (only works for Bioworks xml export): #{File.basename(__FILE__)} -tp -f decoy_bioworks.xml bioworks.xml ") end opts.parse!(argv) if argv.size < 1 puts opts exit end opt end # takes a comma separated list and extends the last to create an array of # desired size def prefixes(arg, desired_size) arg_arr = arg.split(',') new_arr = [] last_arg = arg_arr[0] desired_size.times do |i| if arg_arr[i] new_arr[i] = arg_arr[i] last_arg = new_arr[i] else new_arr[i] = last_arg end end new_arr end ## collapses arrays to one level deep so we can sync them up def arrays_to_one_level_deep(all_arrs) mostly_flat = [] all_arrs.each do |per_file| per_file.each do |per_style| mostly_flat << per_style[0] mostly_flat << per_style[1] end end mostly_flat end # prints rows and th for the data def table_cells(all_arrs, key) ## columns specific headings: all_string = "" all_string << tr do line = "" key.each do |per_file| per_file.each do |per_ds| line << "#{per_ds[1][0]}#{per_ds[1][1]}" end end line end mostly_flat = arrays_to_one_level_deep(all_arrs) SyncEnumerator.new(*mostly_flat).each do |row| all_string << tr do string = row.map {|it| sty="%d" if it.class == Float ; sty="%.#{DATA_PREC}f" end td{ sprintf(sty,it)} }.join end end all_string end def html_table_output(all_arrs, key, files, filename_noext) num_datasets_per_file = all_arrs.first.size num_cols_per_dataset = 2 big_colspan = num_datasets_per_file * num_cols_per_dataset output = table do tr do files.map do |file| "#{file}" end.join end + tr do key.map do |arr| arr.map do |ds| "#{ds.first}" end end end + table_cells(all_arrs, key) end "
" + output + "
" end def y_axis_label(key) ## We only take the keys for the first file, as it's assumed that the major ## labels will be identical for all of them labels = key.first.map {|tp| tp.first } labels.join " | " end # escapes any ' chars def escape_to_gnuplot(string) # long way, but it works. new_string = "" string.split(//).each do |chr| if chr == "'" ; new_string << "\\" end new_string << chr end new_string end ## outputs a .toplot file based on filename_noext, creates a png file, and ## writes html to fh that will load the png file up ## This is a self contained module that can be swapped out for a ## completely different plotting program if desired. def plot_figure(all_arrs, key, files, filename_noext) ## CREATE the PLOT IMAGE: to_plot = filename_noext+'.toplot' png = filename_noext+'.png' Gnuplot.open do |gp| Gnuplot::Plot.new( gp ) do |plot| plot.terminal "png noenhanced" plot.output png plot.title "Classification Analysis" plot.xlabel 'Num True Positives' plot.ylabel escape_to_gnuplot(y_axis_label(key)) plot.style "line 1 lt 1" plot.style "line 2 lt 12" #plot.style "line 1 lt 1 lw #{opts.lw} pt 7 ps #{opts.ps}", plot.yrange "[-0.05:#{1.0 + 0.2*files.size}]" files.each_with_index do |file,i| key[i].each_with_index do |k,j| plot.data << Gnuplot::DataSet.new( [ all_arrs[i][j][0], all_arrs[i][j][1] ] ) do |ds| ds.with = "lines" ds.title = escape_to_gnuplot("#{file}: #{k[1][1]}") end end end end end =begin ## CREATE the PLOT IMAGE: to_plot = filename_noext+'.toplot' png = filename_noext+'.png' File.open(to_plot,'w') do |out| out.puts 'XYData' out.puts filename_noext out.puts "Classification Analysis" out.puts 'Num True Positives' out.puts escape_to_gnuplot(y_axis_label(key)) files.each_with_index do |file,i| #p key[i] #p all_arrs[i] key[i].each_with_index do |k,j| out.puts(escape_to_gnuplot("#{file}: #{k[1][1]}")) out.puts all_arrs[i][j][0].join(' ') out.puts all_arrs[i][j][1].join(' ') end end end num_files = files.size if $".include? 'plotter.rb' cmd = "#{to_plot} --yrange n0.05:#{1.0 + 0.2*num_files} --noenhanced -w l" plot_cmd = "plot.rb #{cmd}" Plotter.new.plot_string "#{to_plot} --yrange n0.05:#{1.0 + 0.2*num_files} --noenhanced -w l" unless File.file? png abort "Fatal Error in plotting cmd=\"#{plot_cmd}\":\n#{reply}" end else warn "plotter.rb not found, not png plot image available" end =end ## CREATE the HTML to load the plot: basename_filename_noext = File.basename(filename_noext) output = "
\n" #output << "\n" output << "\n" output << "
Additional views of this data may be obtained by using the plot.rb command on '#{to_plot}' (type plot.rb for more details). Plot generated with command:    #{plot_cmd}
\n" output end # plot_figure def file_as_decoy(files, opt) bio = SpecID::Bioworks.new puts "Calculating false positive rates using '#{opt.f}' as decoy ..." fps = bio.num_prots(opt.f) out = "" files.each do |file| tps = bio.num_prots(file) out << "*****************************************************\n" out << sprintf("%-36s # TP : #{tps}\n", file) out << sprintf("%-36s # FP : #{fps}\n", opt.f) out << sprintf(" False Positive Rate [FP/(TP+FP)] : %.3f\n", fps.to_f/(tps+fps)) unless opt.n out << sprintf(" Gygi's False Positive Rate 2*[FP/(TP+FP)] : %.3f\n", 2.0*fps/(tps+fps)) if opt.g out << sprintf(" Precision [TP/(TP+FP)] : %.3f\n", tps.to_f/(tps+fps)) if opt.p out << "*****************************************************\n" end out end def prefix_as_decoy(files, opt) #puts "Calculating false positive rates using prefix #{opt.f} ..." prefix_arr = prefixes(opt.f, files.size) all_arrs = [] key = [] out_noext = outfile_noext(opt.o) files.each_with_index do |file,i| all_arrs[i] = [] key[i] = [] sp = SpecID.new(file) #headers = ["#{file_noext(file)} Precision [TP/(TP+FP)]", "#{file_noext(file)} FPR [FP/(FP+TP)]"] (tp, prec, fpr2) = sp.tps_and_precision_and_fpr2_times2_for_prob(prefix_arr[i]) if opt.g all_arrs[i] << [tp,fpr2] key[i] << ["Gygi FPR", ["#TP", "Gygi FPR = 2*FP/(TP+FP)"]] end if opt.p all_arrs[i] << [tp,prec] key[i] << ["Prec", ["#TP", "Prec = TP/(TP+FP)"]] end unless opt.n ## Add the fpr datasets fpr = fpr2.map {|v| v/2.0} all_arrs[i] << [tp,fpr] key[i] << ["FPR", ["#TP", "FPR = FP/(TP+FP)"]] end end string = '' if opt.a roc = ROC.new #string << "***********************************************************\n" #string << "AREA UNDER CURVE:\n" key.each_with_index do |file,i| string << "#{files[i]} (area under curve)\n" key[i].each_index do |j| string << "#{key[i][j][0]} [#{ key[i][j][1]}]:\t" tps = all_arrs[i][j][0] oth = all_arrs[i][j][1] string << roc.area_under_curve(tps, oth).to_s << "\n" end end #string << "***********************************************************\n" else string = html do header + body do plot_figure(all_arrs, key, files, out_noext) + html_table_output(all_arrs, key, files, out_noext) end end end string end end # class FalsePositiveRate end # class SpecID