require 'spec_id' require 'optparse' require 'ostruct' require 'spec_id/aa_freqs' require 'shuffle' require 'vec' require 'table' ######################################################## WRITE_CYS_FIND = false ######################################################## module SpecID attr_accessor :orig_peps, :passed_peps, :passed_prots # The filename passed in for filtering attr_accessor :passed_in_filename # returns the top peptide hits per file dta (first_scan + charge) # all hits with same score as top score are returned # assumes that all fields are strings... # converts xcorr, deltacn, deltamass, mass, and charge into numerical types # deletes the protein array (but not relevant proteins) # hashes on [pep.basename, pep.first_scan.to_i, pep.charge.to_i] # sets the @orig_peps attribute to those passing def top_peps_prefilter! ## Bioworks peps are text based and need to be transformed first if peps.first.is_a? Bioworks::Pep peps.each do |pep| pep.xcorr = pep.xcorr.to_f pep.deltacn = pep.deltacn.to_f pep.deltamass = pep.deltamass.to_f pep.mass = pep.mass.to_f pep.charge = pep.charge.to_i pep.first_scan = pep.first_scan.to_i end end ## Srf Peps need no transformation! # get the top peptide by firstscan/charge (equivalent to .out files) top_peps = [] self.peps.hash_by {|pep| [pep.base_name, pep.first_scan, pep.charge]}.values.map do |v| #self.peps.hash_by {|pep| [pep.aaseq, pep.charge]}.values.map do |v| best_to_worst = v.sort_by {|pep| pep.xcorr}.reverse top_score = best_to_worst.first.xcorr best_to_worst.each do |pep| if pep.xcorr == top_score top_peps << pep else ; break end end end @orig_peps = top_peps end # (xcorr1, xcorr2, xcorr3, deltacn, ppm) # interface very unstable. For now, keeping it very loose... # assumed that peptide xcorr, deltacn, deltamass, mass, ppm are Floats # assumed that peptide charge is Integer # returns peps_passed # must respond to 'peps' # DOES NOT UPDATE the prot.peps attribute!! def filter_sequest(args, include_deltacnstar=false) (x1, x2, x3, deltacn, ppm) = args self.peps.select do |pep| # have to add the upper limit to deltacn because the lowest score is often # assigned a 1.10 in bioworks! pep_deltacn = pep.deltacn pep_charge = pep.charge ## The outer parentheses are critical to getting the correct answer! passing = ( (pep_deltacn >= deltacn) and ((pep_charge == 1 && pep.xcorr >= x1) or (pep_charge == 2 && pep.xcorr >= x2) or (pep_charge == 3 && pep.xcorr >= x3)) and ( pep.ppm <= ppm )) if passing if !include_deltacnstar && pep_deltacn > 1.0 false else true end else false end end end # given some list of SpecID::Pep based objects, finds the list of proteins # associated with those peptides # update_prot_peps => when true, updates prot.peps attribute given the list # of pephits # kind = # :no_update (current proteins are returned, but their peps attribute # is not updated) # :update (current proteins returned with peps attribute updated) # :new (new proteins are created complete with peps attribute) def self.passing_proteins(pephits, kind=:no_update) orig_pephits_prts = [] if kind == :new new_prots = {} pephits.each_with_index do |pep,i| orig_pephits_prts[i] = pep.prots peps_new_prts = pep.prots.map do |prt| if new_prots.key? prt.reference already_exists = new_prots[prt.reference] else np = prt.dup np.peps = [] new_prots[np.reference] = np np end end pep.prots = peps_new_prts end end if kind == :update pephits.each do |pep| pep.prots.each do |prt| prt.peps = [] end end end prot_set = {} pephits.each do |pep| prts = pep.prots prts.each do |prt| prot_set[ prt.reference ] = prt end if (kind == :update || kind == :new) prts.each do |prt| prt.peps << pep end end end ## Reset the original protein hits if kind == :new pephits.each_with_index do |pep,i| pep.prots = orig_pephits_prts[i] end end prot_set.values end end class SpecID::Filter NUM_PROT_FPPR_ITERATIONS = 10 def self.run_from_argv(argv) obj = self.new obj.run_from_argv(argv) end def run_from_argv(argv) reply = get_options(argv) return unless reply files, opt = reply #files = ARGV.map {|file| file } #ARGV.clear $stderr.puts "reading files (can take a minute or two for large files)..." if $VERBOSE spec_ids = files.map do |file| spec_id = file_to_prefiltered_spec_id(file, opt) spec_id end ## the options hash hash = {} if opt.cys if opt.cys[1] opt.cys[1] = opt.cys[1].to_f else opt.cys[1] = 0.0 end hash[:cys] = opt.cys end hash[:tps] = if opt.tps Fasta.new.read_file(opt.tps).prots.map do |prot| prot.aaseq.chomp end end hash[:dcy] = if opt.false new_spec_ids = [] prefixes_or_files = SpecID.extend_args(opt.false, files.size) false_spec_ids = spec_ids.zip(prefixes_or_files).map do |spec_id, prefix_or_file| if File.exist? prefix_or_file new_spec_ids << spec_id file_to_prefiltered_spec_id(prefix_or_file, opt) else (tps, fps) = spec_id.classify_by_false_flag(:peps, prefix_or_file, true, opt.prefix) fps_specid = spec_id.class.new tps_specid = spec_id.class.new fps_specid.peps = fps tps_specid.peps = tps new_spec_ids << tps_specid fps_specid end end spec_ids = new_spec_ids false_spec_ids end defaults = { :dcy => nil, # { spec_id => false_spec_id } :cys => nil, # [cys_background_freq, cys_containing_freq] :tps => nil, :tmm => nil, :occams_razor => opt.occams_razor, } args = defaults.merge hash base_args = [opt.x1, opt.x2, opt.x3, opt.c, opt.ppm] #################################################### <-- @fppr_methods = [:tmm, :tps, :cys, :dcy].select do |x| args[x] end @groups_reporting = [:pephits, :aaseq, :prothits] @groups_reporting.push( :occams_razor ) if args[:occams_razor] @cat_labels = { :pephits => 'pep_hits', :prothits => 'prot_hits', :aaseq => 'uniq_aa_hits', :occams_razor => 'occams_prot_hits', } #################################################### <-- if opt.log @logfh = File.open(opt.log, 'w') else @logfh = nil end ######################################### # PRINT FILTER LEGEND out filter_legend(@fppr_methods) ######################################### if opt.filters_file lines = IO.readlines(opt.filters_file) lines.each do |line| line.chomp! answer = prep_reply(line, base_args) next if answer == false base_args = answer filter_round(spec_ids, base_args, args) end elsif opt.i ## CLEAR ARGV (since otherwise, gets reads it!) ARGV.clear out interactive_help reply = "nil" loop do b = base_args out "#{b[0]} #{b[1]} #{b[2]} dcn:#{b[3]} ppm:#{b[4]}" loop do reply = gets.chomp answer = prep_reply(reply, base_args) if answer == false out interactive_help else base_args = answer filter_round(spec_ids, base_args, args) break end end end else filter_round(spec_ids, base_args, args) end if opt.log @logfh.close end end def out(string) puts string if @logfh @logfh.puts string end end # takes a fasta file or a string ( to be cast as a float ) def get_cys_freq(arg) if File.exist? arg SpecID::AAFreqs.new(arg).aafreqs[:C] else arg.to_f end end # prints shortened number for display def short(num) sprintf( "%.3f",num) end # if good arguments, returns [files_array, options] # else prints an error argument and returns nil def get_options(argv) dup_argv = argv.dup opt = OpenStruct.new opt.x1 = 1.0 opt.x2 = 1.5 opt.x3 = 2.0 opt.c = 0.1 opt.ppm = 1000.0 opt.false = false opts = OptionParser.new do |op| op.banner = "usage: #{File.basename(__FILE__)} [OPTS] " op.separator("prints number of peptides/proteins ID'd at given thresholds") op.separator "only top hit (by xcorr) per scan+charge is considered" #op.separator("** 'dcn*' is the number of peptides with deltacn == 1.1") #op.separator(" (these are peptides who are the only hit with xcorr > 0)") op.separator "" op.on("-1", "--xcorr1 N", Float, "xcorr for +1 charge d: #{opt.x1}") {|v| opt.x1 = v} op.on("-2", "--xcorr2 N", Float, "xcorr for +2 charge d: #{opt.x2}") {|v| opt.x2 = v} op.on("-3", "--xcorr3 N", Float, "xcorr for +3 charge d: #{opt.x3}") {|v| opt.x3 = v} op.on("-c", "--deltacn N", Float, ">= deltacn d: #{opt.c}") {|v| opt.c = v} op.on("-p", "--ppm N", Float, "<= ppm d: #{opt.ppm}") {|v| opt.ppm = v} op.separator " if bioworks.xml, = 10^6deltamass/mass" op.on("-i", "--interactive", "interactive filtering") {|v| opt.i = v} op.on("-f", "--false a,b,c", Array, "flag for false proteins or filenames of decoys") {|v| opt.false = v} op.separator(" e.g., for Bioworks: 'REVERSE'") op.separator(" (last given will apply to remaining files)") op.on("--prefix", "match false flag for prefixes only") {|v| opt.prefix = v} op.on("-y", "--cys ", Array, "report fpr by expected cysteine freq") do |v| v[0] = get_cys_freq(v[0]) opt.cys = v end op.separator(" freq = freq of cysteine as amino acid") op.separator(" [bkg] = freq of cys containing peps d: 0.0") op.on("--filters_file ", "(no -i) file with list of interactive input") {|v| opt.filters_file = v} op.on("-t", "--tps ", "fasta file containing true hits") {|v| opt.tps = v } #op.on("--tmm ", "toppred.out file with transmembr. topology") {|v| opt.tps = v } op.on("--yaml", "spits out yaml-ized data") {|v| opt.tabulate = v } op.on("--combined_score", "shows the combined score") {|v| opt.combined_score = v } op.on("--marshal", "will write marshaled data or read existing") {|v| opt.marshal = v } op.on("--log ", "also writes all output to file") {|v| opt.log = v } ## NEED TO IMPLEMENT THIS: #op.on("--protein_summary", "writes passing proteins to .summary.html files") {|v| opt.protein_summary = v } op.on("-z", "--occams_razor", "will show minimal set of proteins") {|v| opt.occams_razor = v } end opts.parse!(dup_argv) if dup_argv.size < 1 puts opts return nil end [dup_argv, opt] end # (actual # with cys, expected # with cys, total#peptides, # mean_fraction_of_cysteines_true, std) # PepHit(C) = Peptide containing cysteine # # Total PepHit(C) # Observed Bad Pep (C) # ------------------ proportional_to ---------------------- # # Total PepHit # Total Bad PepHit (X) # returns the fppr and the total number false def fppr_by_cysteines(ac_num_with_cys, exp_num_with_cys, total_peptides, mean_fraction_true_cys=nil, std_fraction_true_cys=nil) # the number of bona fide BAD cysteine hits # (some of the cysteine hits (~5%) are true positives) ac_num_with_cys -= exp_num_with_cys * mean_fraction_true_cys if mean_fraction_true_cys if ac_num_with_cys < 0.0 ; ac_num_with_cys = 0.0 end total_number_false = (ac_num_with_cys * total_peptides).to_f/exp_num_with_cys fppr = total_number_false / total_peptides [fppr, total_number_false] end # num_peps_per_protein is an array of the number of peptides per protein hit # (these are the true hits) # assumes that the number follows a gaussian distribution (binomial # distributions tend toward gaussians, I believe, at large N) # returns [mean_num_wrong, mean_fppr, stdev_num_wrong, stdev_fppr] fppr def protein_fppr( num_peps_per_protein, number_false_peptides, num_iterations=10) ## Check for more false peptides than peptides in our proteins: total_protein_peps = 0 contained = num_peps_per_protein.each do |num| total_protein_peps += num end ## All peptides will be wrong every time! ## which means all proteins will be wrong every time! if number_false_peptides >= total_protein_peps # [all proteins wrong, fppr=1.0 return [num_peps_per_protein.size, 1.0, 0.0, 0.0] end num_prots = num_peps_per_protein.size sample = VecD.new(num_iterations) # indexed by peptide_number, pointing to a protein's peptide_count # we shuffle the indices and then walk along until we are finished # then we count how many proteins still have peptides # we create an array to hold the peptide number for each protein, then we # can reference the same entity when subtracting the peptides in the # algorithm cont_pep_num_per_prot_ars = (0...num_iterations).map do |i| total_protein_peps = 0 contained = num_peps_per_protein.map do |num| [num] end end cont_num_by_pep_index_ars = cont_pep_num_per_prot_ars.map do |ar| index_count = 0 pc_ar = [] ar.each do |contained_num| contained_num.first.times do pc_ar[index_count] = contained_num index_count += 1 end end pc_ar end indices = (0...(cont_num_by_pep_index_ars.first.size)).map {|x| x } (0...num_iterations).each do |i| num_false = 0 indices.shuffle! pc = cont_num_by_pep_index_ars[i] number_false_peptides.times do |shuffle_index| #big_i = indices[shuffle_index] pc[indices[shuffle_index]][0] -= 1 end cont_pep_num_per_prot_ars[i].each do |contained_pep_count| if contained_pep_count.first == 0 num_false += 1 end end sample[i] = num_false end (mean_num_wrong, stdev) = sample.sample_stats mean_fppr = mean_num_wrong / num_prots stdev_fppr = stdev / num_prots [mean_num_wrong, mean_fppr, stdev, stdev_fppr] end # returns [total_number_false, fppr, fraction_expected] # also takes a hash of pephits keyed on :aaseq def fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq) (ac, exp) = SpecID::AAFreqs.new.actual_and_expected_number_containing_cysteines(pephits, cys_bg_freq) fraction_of_expected = ac.to_f/exp (cys_fprate, total_num_false) = fppr_by_cysteines(ac, exp, pephits.size, cys_containing_freq) [total_num_false, cys_fprate, fraction_of_expected] end def report_cysteines #### UNDERWAY::: cys_tps = pep_nums[i] - total_num_false puts "CYSTEINE FPR: " puts " (# peps containing >= 1 cysteines)" puts " actual: #{ac}" puts "fraction of expected: #{short(fraction_of_expected)}" puts " expected # FP's: " + short(total_num_false) puts " estimated FPR: " + short( 100.0*cys_fprate ) + " % " puts "combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm)" puts "Combined Score & FPR" puts "#{combined_score}\t#{cys_fprate}" puts "Combined Score & fraction of expected" #puts "#{combined_score} #{fraction_of_expected}" to_write_cys_find = ["WRITE_CYS_FIND:", combined_score, fraction_of_expected] puts to_write_cys_find.join("\t") if WRITE_CYS_FIND puts(['TABULATE:', combined_score, pep_tps, pep_fpr, cys_tps, cys_fprate, '', x1, x2, x3, deltacn, ppm].join("\t")) if opt.tabulate end def filter_legend(fppr_methods) lines = [] lines << "Note: protein FPPR values are probably optimistic" lines << "[this implementation assumes an equal likelihood that a false peptide" lines << " comes from a protein with more hits as one with less (which is probably" lines << " not the case)]" lines << "* = deltacn_star = peptides with deltacn > 1.0 (no sibling hits)" if fppr_methods.size > 0 lines << "Following are methods for determining false identification rate:" lines << ['dcy=decoy', 'cys=cysteine', 'tps=known_true_positives'].join(" ") ## when tmm is implemented: #lines << ['dcy=decoy', 'cys=cysteine', 'tmm=transmembrane', 'tps=known_true_positives'].join(" ") end lines.join("\n") end # does this give aafreq from a fasta file? # freq = cysteines.aafreqs[:C] # returns [total_number_false, fppr] # pephits can be an array or a hash of peptides keyed on :aaseq def fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar) if pephits.is_a? Hash seqs = pephits.keys else seqs = pephits.map do |v| v.aaseq end end real_tps = 0 real_fps = 0 # could also do with partition seqs.each do |pep_aaseq| if true_pos_aaseqs_ar.any? {|prot_aaseq| prot_aaseq.include? pep_aaseq} real_tps += 1 else real_fps += 1 end end real_fppr = real_fps.to_f/pephits.size [real_fps, real_fppr] end def filter_spec_id(spec_id, filter_args, args) results_hash = {} # that second argument is to update protein peptides pephits = spec_id.filter_sequest(filter_args) results_hash[:prothits] = SpecID.passing_proteins(pephits, :no_update) results_hash[:pephits] = pephits results_hash[:dcn_cnt] = pephits.select{|v| v.deltacn > 1.0}.size # be aware that this is a hash keyed by aaseq and values of arrays of # peptides sharing the same aaseq! results_hash[:aaseq] = pephits.hash_by(:aaseq) results_hash end # returns [#FP, FPPR] def dcy_fppr(pephits, false_pephits) fps = false_pephits.size [fps, fps.to_f/pephits.size] end def tmm_fppr(pephits) abort "NEED TO IMPLEMENT" end # returns [#FP, FPPR] def cys_fppr(pephits, cys_bg_freq, cys_containing_freq) (total_num_false, cys_fprate, fraction_of_expected) = fraction_false_by_cysteines(pephits, cys_bg_freq, cys_containing_freq) [total_num_false, cys_fprate] end def tps_fppr(pephits, true_pos_aaseqs_ar) fraction_false_by_true_pos(pephits, true_pos_aaseqs_ar) end ## methods should be passed in like this 'cysteine' for cysteine_fppr ## all methods should return [number_false, fppr] ## returns a hash (by method) for each set of pephits ## if :dcy is given as a method, then expects the false pephits array def calculate_pep_fppr(pephits_ar, methods, args, false_pephits_ar=nil) cnt = 0 pephits_ar.map do |ph| hash = {} methods.each do |mth| case mth when :dcy hash[mth.to_sym] = send("#{mth}_fppr".to_sym, ph, false_pephits_ar[cnt]) when :cys hash[mth.to_sym] = send("#{mth}_fppr".to_sym, ph, *(args[:cys]) ) when :tps hash[mth.to_sym] = send("#{mth}_fppr".to_sym, ph, (args[:tps]) ) else hash[mth.to_sym] = send("#{mth}_fppr".to_sym, ph) end end cnt += 1 hash end end # fpr is a SpecID obj that is the false positives # cysteines holds an aafreqs object or nil def filter_round(spec_ids, filter_args, args) # push fpr on the end for the calculations ## FILTER the NORMAL spec_id objects little_tables = [] spec_ids.each_with_index do |spec_id, i| normal_results = filter_spec_id(spec_id, filter_args, args) ## FILTER the FALSE objects (if given) false_results = if args[:dcy] little_args_hash = args.dup false_results = filter_spec_id(args[:dcy][i], filter_args, little_args_hash) end ## HOW TO CALCULATE FPPR FOR EVERYTHING: # pephits Fpephits C/Tpephits TPpephits # uniqaa Funiqaa C/Tuniqaa TPuniqaa # prothits ProtFPR(Fpephits, prothits) ProtFPR(C/Tpephits, prothits) ProtFPR(total-TPpephits, prothits) # OccProthits ProtFPR(Funiqaa, OccProthits) ProtFPR(C/Tuniqaa, OccProthits) ProtFPR(total-TPuniqaa, OccProthits) # C/T = cystein or Transmembrane method ## set up false results array if args[:dcy] fr_ar = [false_results[:pephits], false_results[:aaseq]] else fr_ar = nil end (pephits_fppr_results, aaseq_fppr_results) = calculate_pep_fppr([normal_results[:pephits], normal_results[:aaseq]], @fppr_methods, args, fr_ar) ## NORMAL prothits ## update prothits peptides updated_proteins = SpecID.passing_proteins(normal_results[:pephits], :update) pep_cnt_arr = updated_proteins.map {|v| v.peps.size } ## update occams prothits if args[:occams_razor] updated_occams_protein_triplets = SpecID::occams_razor(updated_proteins, true) occams_pep_cnt_arr = updated_occams_protein_triplets.map {|v| v[1].size } occams_prots = updated_occams_protein_triplets.map {|v| v[0] } normal_results[:occams_razor] = occams_prots end ## note that the original prot.peps arrays are obliterated by this. ## we would need to re-update if someone wanted these prothits_fppr_results = {} occams_results = {} @fppr_methods.each do |mth| prothits_fppr_results[mth] = protein_fppr(pep_cnt_arr, pephits_fppr_results[mth].first.ceil.to_i, NUM_PROT_FPPR_ITERATIONS) occams_results[mth] = protein_fppr(occams_pep_cnt_arr, aaseq_fppr_results[mth].first.ceil.to_i, NUM_PROT_FPPR_ITERATIONS) if args[:occams_razor] end fppr_results = { :pephits => pephits_fppr_results, :aaseq => aaseq_fppr_results, :prothits => prothits_fppr_results, } fppr_results[:occams_razor] = occams_results if args[:occams_razor] ## CHANGE ALL RESULTS INTO PERCENTAGES: fppr_results.each do |bk,hash| hash.each do |k,val| hash[k][1] = 100.0 * val[1] end end little_tables[i] = to_table( spec_id, args, normal_results, fppr_results, @groups_reporting, @fppr_methods, @cat_labels) end out filter_params_string(filter_args, @fppr_methods) little_tables.each do |tbl| out tbl.to_formatted_string(nil, ' ') out "-----------------------------------------------\n" end #big_table(spec_ids, filter_args, args, normal_results, groups_reporting, fppr_results, cat_labels) end def filter_params_string(filter_args, fppr_methods) (x1, x2, x3, deltacn, ppm) = filter_args st = [] st << "==========================================================================" st << " xcorr(1,2,3) >= #{x1},#{x2},#{x3} || deltacn >= #{deltacn} || ppm <= #{ppm} " st << '' st.join("\n") #st = [] #st << ["xcorr(1,2,3) >= #{x1},#{x2},#{x3}", "deltacn >= #{deltacn}", "ppm <= #{ppm}"].join("\t") #st end def to_table(spec_id, args, normal_results, fppr_results, groups_reporting, fppr_methods, cat_labels) #table is in the form: { column heading => [ values ] } title = spec_id.passed_in_filename col_labels = ['num', *(fppr_methods.map{|v| "#{v}%" })] row_labels = groups_reporting.map {|grp| cat_labels[grp]} dt = groups_reporting.map do |grp| line = [normal_results[grp].size] fppr_methods.each do |mth| line << fppr_results[grp][mth][1] end line end Table.new(dt, row_labels, col_labels, title) #puts(['TABULATE:', combined_score, pep_tps, pep_fppr, real_tps, real_fppr, '', x1, x2, x3, deltacn, ppm].join("\t")) if opt.tabulate end def combined_score(filter_args) (x1, x2, x3, deltacn, ppm) = filter_args combined_score = x1 + x2 + x3 + 20.0*deltacn + 4000.0*(1.0/ppm) end # assumes its already chomped # updates the 5 globals def prep_reply(reply, base) if reply == 'q' ; exit ; end if reply =~ /^\s*$/ base elsif reply arr = reply.split(/\s+/) to_change = [] to_change_hash = {} arr.each do |it| if it.include? ':' (k,v) = it.split(':') to_change_hash[k] = v else to_change << it end end to_change.each_with_index do |tc,i| begin base[i] = tc.to_f rescue NoMethodError out "BAD ARG: #{tc}" return false end end to_change_hash.each do |k,v| case k when 'x1' ; base[0] = v when 'x2' ; base[1] = v when 'x3' ; base[2] = v when 'dcn' ; base[3] = v when 'ppm' ; base[4] = v else out "BAD ARG: #{k}:#{v}" end end base.map {|v| v.to_f } else false end end def file_to_prefiltered_spec_id(file, opt) spec_id = nil marshal_file = file + ".prefiltered.msh" if File.exist?(marshal_file) File.open(marshal_file) do |fh| spec_id = Marshal.load(fh) end else spec_id = SpecID.new(file) spec_id.passed_in_filename = file spec_id.top_peps_prefilter! ## marshal it! if opt.marshal File.open(marshal_file, "w") do |fh| Marshal.dump(spec_id,fh) end end end spec_id end def interactive_help string = [] string << "********************************************************" string << "INTERACTIVE FILTERING HELP:" string << "enter: " string << "or : x1: x2: x3: dcn: ppm:" string << "or : dcn:" string << "or : ppm:" string << "etc..." string << " to (re)run current values" string << "'q' to quit" string << "********************************************************" string.join("\n") end end