require 'ostruct' require 'set' require 'hash_by' require 'spec_id/precision' require 'roc' require 'sample_enzyme' # for others require 'spec_id/bioworks' require 'spec_id/sequest' require 'spec_id/proph' require 'spec_id_xml' class SampleEnzyme ; end class Mass # http://expasy.org/tools/findmod/findmod_masses.html # still need to add the modifications MONO = { :A => 71.03711, :R => 156.10111, :N => 114.04293, :D => 115.02694, :C => 103.00919, :E => 129.04259, :Q => 128.05858, :G => 57.02146, :H => 137.05891, :I => 113.08406, :L => 113.08406, :K => 128.09496, :M => 131.04049, :F => 147.06841, :P => 97.05276, :S => 87.03203, :T => 101.04768, :W => 186.07931, :Y => 163.06333, :V => 99.06841, :h => 1.00783, :h_plus => 1.00728, :o => 15.9949146, :h2o => 18.01056, } AVG = { :A => 71.0788, :R => 156.1875, :N => 114.1038, :D => 115.0886, :C => 103.1388, :E => 129.1155, :Q => 128.1307, :G => 57.0519, :H => 137.1411, :I => 113.1594, :L => 113.1594, :K => 128.1741, :M => 131.1926, :F => 147.1766, :P => 97.1167, :S => 87.0782, :T => 101.1051, :W => 186.2132, :Y => 163.1760, :V => 99.1326, :h => 1.00794, :h_plus => 1.00739, :o => 15.9994, :h2o => 18.01524, } end module SpecID ; end class GenericSpecID ; include SpecID ; end module SpecID MONO = Mass::MONO AVG = Mass::AVG attr_accessor :peps, :prots # True if a high protein/peptide score is better than low, false otherwise # This is set automatically for known file types attr_accessor :hi_prob_best # A relative pathname of the file the specid object is derived from attr_accessor :filename # tp = file_type # Will return a SpecID object (really, the object corresponding to the # file type which mixes in SpecID [is_a?(SpecID) == true]) # If no file is given, will return a GenericSpecID object. def self.new(file=nil, tp=nil) if file from_file(file, tp) else GenericSpecID.new end end # tp = file_type # only takes an array if they are srf files! def self.from_file(file, tp=nil) obj = nil unless tp tp = file_type(file) end obj = case tp when 'srg' @hi_prob_best = false SRFGroup.new(file) when 'bioworks' @hi_prob_best = false Bioworks.new(file) when 'protproph' @hi_prob_best = true Proph::ProtSummary.new(file) else abort "UNRECOGNIZED file type for #{file}" end obj end def inspect "<#{self.class} #peps=\"#{peps.size}\">" end # takes a comma separated list or array and extends the last to create an # array of desired size def self.extend_args(arg, desired_size) arg_arr = arg if arg.is_a? String arg_arr = arg.split(',') end new_arr = [] last_arg = arg_arr[0] desired_size.times do |i| if arg_arr[i] new_arr[i] = arg_arr[i] last_arg = new_arr[i] else new_arr[i] = last_arg end end new_arr end # takes an array of proteins, each having peps # peptide grouping is done # by- # the protein with the most unique peptides ends up taking any # degenerate peptides, tie goes to one with most hits total, then the one # that had the top xcorr(s) (before removing any peptides).All other # proteins with identical peptides will lose those peptides. So, the rich # stay rich, and the poor get poorer. # returns an array of triplets where each is [prot, pep_hits, # uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1 # peptide. The internal links (prot.peps and pep.prots) is NOT modified!! # update_prots == true will set each protein with the peptides found def self.occams_razor(array_of_prots, update_prots=false) peps_found = Set.new to_sort = array_of_prots.map do |prot| pps = prot.peps peps_by_uniq_aaseq = pps.hash_by(:aaseq) uniq_aaseqs = Set.new( pps.map {|pep| pep.aaseq } ) xcorrs = pps.map {|pep| pep.xcorr } silly = OpenStruct.new # 0 1 2 3 4 5 [uniq_aaseqs.size, pps.size, xcorrs.sort, prot, uniq_aaseqs, peps_by_uniq_aaseq] end prot_triplets = [] to_sort.sort.reverse.each do |ar| prot = ar[3] ## overlapping set: common = peps_found & ar[4] ## find the uniq ones in our little set of peptides: uniq = ar[4] - common pep_hits = [] if uniq.size != 0 ## add to the found list: peps_found.merge(uniq) uniq.each do |seq| pep_hits.push( *(ar[5][seq]) ) end prot_triplets << [prot, pep_hits, uniq.to_a] prot.peps = pep_hits if update_prots end end prot_triplets end ## basically, this is the command line wrapper def self.precision(argv) Prec.new.run_cmd_line(argv) end # returns number of true positives (array) and the specified output (as # parallel array). Requires the classification method and a sorted array of # tp values and an array fp values. # (This is simply a wrapper around ROC#by_tps method!) def by_tps(classification_method, tp, fp) ROC.new.by_tps(classification_method, tp, fp) end # from the unique set of peptide hits, create a separate peptide hit for # each protein reference where that peptide only references that protein # e.g. pep.prots = [(a single protein)] def pep_prots pps = [] peps.each do |pep| pep.prots.map do |prt| pep.dup pep.prots = [prt] pps << pep end end pps end def classify_by_regex(items, regex, fp_on_match=true) case items when :prots myproc = proc { |prt| if prt.reference =~ regex ; !fp_on_match else ; fp_on_match end } return classify(items, myproc) when :peps match = [] ; nomatch = [] peps.each do |pep| match_prots = [] ; nomatch_prots = [] (hit, nohit) = pep.prots.partition do |prot| prot.reference =~ regex end if hit.size == 0 nomatch << pep elsif nohit.size == 0 match << pep else ## both have hits pep.prots = match_prots nomatch_pep = pep.dup nomatch_pep.prots = nomatch_prots match << pep nomatch << pep end end if fp_on_match return [nomatch , match] else return [match, nomatch] end else abort "don't recognize " end end # returns [tp, fp] based on the protein prefix for items where items = # (:prot|:peps) # this may result in a duplication of some peptides if they match both # normal and decoy proteins. In this case, the protein arrays are split, # too, so that each points only to its breed of protein. def classify_by_false_flag(items, flag, fp_on_match=true, prefix=false) if prefix regex = /^#{Regexp.escape(flag)}/ else regex = /#{Regexp.escape(flag)}/ end classify_by_regex(items, regex, fp_on_match) end # Returns (match, nomatch) # items = symbol (:prots, :peps) # Returns two arrays, those returning true from classify_item_by and those # returning false def classify(items, classify_item_by) its = send(items) f = []; t = [] its.each do |it| if classify_item_by.call(it) t << it else f << it end end [t,f] end # returns two arrays, true positives and false positives (determined by proc # classify_item_by) sorted by proc rank_item_by. Items will be ranked from # lowest to highest based on the return value of rank_item_by. items is a # symbol (:prots or :peps) def rank_and_classify(items, rank_item_by, classify_item_by) its = send(items) #its.each do |it| puts it.probability.to_s ; puts it.reference end doublets = its.collect do |item| [ rank_item_by.call(item), classify_item_by.call(item) ] end roc = ROC.new tp, fp = roc.prep_list(doublets) return tp, fp end # returns a proc for getting all probabilities so that an ascending sort # will put the best scores first def probability_proc if hi_prob_best get_prob_proc = proc {|prt| prt.probability * -1 } else get_prob_proc = proc {|prt| prt.probability } end get_prob_proc end def separate_by_prefix(items, fp_prefix) its = send(items) if items == :prots elsif items == :peps abort "not implemented yet" else abort "no other items recognized yet" end end # sorts the probabilities and then # calcs predicted number hits and precision for protein probabilities # (summing probabilities) # one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly # called false positive rate # SUM(1-probX)/#prots def num_hits_and_ppv_for_protein_prophet_probabilities current_sum_one_minus_prob = 0.0 num_prots = [] ppv = [] prot_cnt = 0 probs = prots.map {|v| v.probability} sorted = probs.sort.reverse sorted.each do |prob| prot_cnt += 1 num_prots << prot_cnt current_sum_one_minus_prob += 1.0 - prob ppv << 1.0 - ( current_sum_one_minus_prob / prot_cnt ) # current_fpr_ratio = current_sum_one_minus_prob / prot_cnt end [num_prots, ppv] end # convenience method for the common task of determining precision for # proteins (with decoy proteins found by false_flag) # returns (num_hits, precision) def num_hits_and_ppv_for_prob(false_flag, prefix=false) if prefix regex = /^#{Regexp.escape(false_flag)}/ else regex = /#{Regexp.escape(false_flag)}/ end prob_proc = probability_proc myproc = proc { |prt| if prt.reference =~ regex ; false else ; true end } real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc) (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits) [num_hits, precision] end # # takes the existing spec_id object and marshals it into "file.msh" # # a new file will always look for a file.msh to load # def marshal(force=false) # if !(File.exist? @marshal_file)| force # File.open(@marshal_file, 'w') {|out| Marshal.dump(@obj, out) } # end # end # Returns 'bioworks' if bioworks xml, 'protproph' if Protein prophet # 'srf' if SRF file, 'srg' if search results group file. def self.file_type(file) if file =~ /\.srg$/ return 'srg' end if IO.read(file, 7,438) == 'Enzyme:' return 'srf' end File.open(file) do |fh| lines = "" 4.times { lines << fh.readline } if lines =~ // return 'bioworks' elsif lines =~ / b.peptide_probability } end #puts min_peptides[0] # -> Bioworks::Pep min_sorted_peps = sorted_probabilities(min_peptides) #puts min_sorted_peps[0] # -> probability (Float) peptides_by_tens = [] hash.each do |k,v| arr = v.sort_by {|pep| pep.peptide_probability }.slice(0,10) peptides_by_tens.push(*arr) end top_10_sorted_peps = sorted_probabilities(peptides_by_tens) #puts top_10_sorted_peps[0] # -> float #puts "size: top_10_sorted_peps.size : #{top_10_sorted_peps.size}" #puts "size: min_sorted_peps.size : #{min_sorted_peps.size}" #p top_10_sorted_peps #p min_sorted_peps return min_sorted_peps, top_10_sorted_peps end # Returns a list of sorted probabilities given the array of peptides def sorted_probabilities(peptides) #puts peptides.first.peptide_probability.class #peptides.each do |pep| print pep.class.to_s + " " end #puts peptides.first.is_a? Array #abort "DFHDFD" peptides.collect{|pep| pep.probability }.sort end # returns a sorted lists of probabilities based on all pepprots (a peptide # associated with a protein) def pep_probs_by_pep_prots sorted_probabilities(peps) end ########################################################################## # WARNING! These might be dangerous to your health if there are multiple # files collected in your bioworks file ########################################################################## # (prob_list_by_min, prob_list_by_best10) # returns 2 sorted lists of probabilities based on: # 1. best peptide hit # 2. top 10 peptide hits # on a per scan basis # NOTE: you may want to hash on base_name first! def pep_probs_by_scan hash = peps.hash_by(:first_scan, :last_scan) return min_and_best10(hash) end #(prob_list_by_min, prob_list_by_best10) # same as pep_probs_by_scan but per charge state # NOTE: you may want to hash on base_name first! def pep_probs_by_scan_charge hash = peps.hash_by(:first_scan, :last_scan, :charge) return min_and_best10(hash) end # (prob_list_by_min) # hashes on seq-charge and returns the sorted list of probabilities of top # hit per seq-charge # NOTE: you may want to hash on base_name first! def pep_probs_by_seq_charge hash = peps.hash_by(:sequence, :charge) min_peptides = hash.collect do |k,v| v.min {|a,b| a.peptide_probability <=> b.peptide_probability } end sorted_probabilities(min_peptides) end ########################################################################## # USE these if you have multiple files in your bioworks.xml file ########################################################################## # (prob_list_by_min, prob_list_by_best10) # returns 2 sorted lists of probabilities based on: # 1. best peptide hit # 2. top 10 peptide hits # on a per scan basis # NOTE: you may want to hash on base_name first! def pep_probs_by_bn_scan hash = peps.hash_by(:base_name, :first_scan, :last_scan) return min_and_best10(hash) end #(prob_list_by_min, prob_list_by_best10) # same as pep_probs_by_scan but per charge state # NOTE: you may want to hash on base_name first! def pep_probs_by_bn_scan_charge hash = peps.hash_by(:base_name, :first_scan, :last_scan, :charge) return min_and_best10(hash) end # (prob_list_by_min) # hashes on seq-charge and returns the sorted list of probabilities of top # hit per seq-charge # NOTE: you may want to hash on base_name first! def pep_probs_by_bn_seq_charge hash = peps.hash_by(:base_name, :sequence, :charge) min_peptides = hash.collect do |k,v| v.min {|a,b| a.peptide_probability <=> b.peptide_probability } end sorted_probabilities(min_peptides) end end # A Generic spectraID protein module SpecID::Prot # probability is always a float! attr_accessor :probability, :reference, :peps def <=> (other) self.reference <=> other.reference end end module SpecID::Pep Non_standard_amino_acid_char_re = /[^A-Z\.\-]/ attr_accessor :prots attr_accessor :probability # full sequence: (..) with '-' for no first # or last. attr_accessor :sequence # the basic amino acid sequence (no leading or trailing '.' or amino acids) # should not contain any special symbols, etc. attr_accessor :aaseq attr_accessor :charge # removes nonstandard chars with Non_standard_amino_acid_char_re # preserves A-Z and '.' and '-' def self.remove_non_amino_acids(sequence) sequence.gsub(Non_standard_amino_acid_char_re, '') end # remove_non_amino_acids && split_sequence def self.prepare_sequence(val) nv = remove_non_amino_acids(val) split_sequence(nv) end def <=>(other) aaseq <=> other.aaseq end # Returns prev, peptide, next from sequence. Parse errors return # nil,nil,nil # R.PEPTIDE.A # -> R, PEPTIDE, A # R.PEPTIDE.- # -> R, PEPTIDE, - # PEPTIDE.A # -> -, PEPTIDE, A # A.PEPTIDE # -> A, PEPTIDE, - # PEPTIDE # -> nil,nil,nil def self.split_sequence(val) peptide_prev_aa = ""; peptide = ""; peptide_next_aa = "" pieces = val.split('.') case pieces.size when 3 peptide_prev_aa, peptide, peptide_next_aa = *pieces when 2 if pieces[0].size > 1 ## N termini peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1] else ## C termini peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-' end when 1 ## this must be a parse error! peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil when 0 peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil end return peptide_prev_aa, peptide, peptide_next_aa end ## def self.sequence_to_aaseq(sequence) after_removed = remove_non_amino_acids(sequence) pieces = after_removed.split('.') case pieces.size when 3 pieces[1] when 2 if pieces[0].size > 1 ## N termini pieces[0] else ## C termini pieces[1] end when 1 ## this must be a parse error! pieces[0] ## which is the peptide itself else abort "bad peptide sequence: #{sequence}" end end # This will rapidly determine the list of proteins for which given # peptides belong. It is meant to be low level and fast (eventually), # so it asks for the data in a format amenable to this. # returns a mirror array where each entry is an array of Fasta::Prot # objects where each protein contains the sequence def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj) prots = fasta_obj.prots prot_seqs = prots.map do |prot| prot.aaseq end groups = peptide_strings_list.map do |pep_seq| prot_index = 0 protein_group = [] prot_seqs.each do |prot_seq| if prot_seq.include? pep_seq protein_group << prots[prot_index] end prot_index += 1 end protein_group end groups end # units can be :mmu, :amu, :ppm def mass_accuracy(pep, unit=:ppm, mono=true) # 10^6 * deltam accuracy/ m[measured] # i.e., theoretical mass 1000, measured 999.9: 100ppm # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD # pep.mass is the theoretical M+H of the peptide # this assumes that the deltacn value we're being told is correct, but I # have my suspicions (since the value is not accurate...) ######## TO COMPLETE (and add to spec_id..?) case unit when :ppm when :amu when :mmu end end end class SpecID::GenericProt include SpecID::Prot end class SpecID::GenericPep include SpecID::Pep end