require 'ostruct' require 'set' require 'hash_by' require 'roc' require 'sample_enzyme' # for others require 'spec_id/bioworks' require 'spec_id/sequest' require 'spec_id/proph/prot_summary' require 'spec_id/proph/pep_summary' require 'spec_id_xml' require 'spec_id/sqt' require 'spec_id/mass' require 'fasta' module ProteinReferenceable ; end class SampleEnzyme ; end module SpecID ; end class GenericSpecID ; include SpecID ; end module SpecID MONO = Mass::MONO AVG = Mass::AVG attr_accessor :peps, :prots # True if a high protein/peptide score is better than low, false otherwise # This is set automatically for known file types attr_accessor :hi_prob_best # A relative pathname of the file the specid object is derived from attr_accessor :filename # tp = file_type # Will return a SpecID object (really, the object corresponding to the # file type which mixes in SpecID [is_a?(SpecID) == true]) # If no file is given, will return a GenericSpecID object. # If file is an array, this is assumed to be a group of srf files which is # converted into an SRFGroup Ojbect and run. def self.new(file=nil, tp=nil) # this will need to be specialized for other groups later if file.is_a?(Array) # takes an array of srf filenames SRFGroup.new(file) elsif file from_file(file, tp) else GenericSpecID.new end end # tp = file_type # a single srf file will be packaged into an SRFGroup object def self.from_file(file, tp=nil) obj = nil unless tp tp = file_type(file) end obj = case tp when 'srf' #@hi_prob_best = false SRFGroup.new([file]) when 'srg' #@hi_prob_best = false SRFGroup.new(file) when 'bioworks' #@hi_prob_best = false Bioworks.new(file) when 'protproph' #@hi_prob_best = true Proph::ProtSummary.new(file) when 'pepproph' Proph::PepSummary.new(file) when 'sqg' SQTGroup.new(file) when 'sqt' SQTGroup.new([file]) else abort "UNRECOGNIZED file type for #{file}" end obj end def inspect peps_string = if peps "peps(#)=#{peps.size}" else "peps=(nil)" end "<#{self.class} #{peps_string}>" end # given some list of SpecID::Pep based objects, returns the list of proteins # associated with those peptides # kind must be a symbol: # :no_update (current proteins are returned, but their peps attribute # is not updated) # :update (current proteins returned with peps attribute updated) # :new (new proteins are created complete with peps attribute) def self.protein_list(pephits, kind=:no_update) orig_pephits_prts = [] if kind == :new new_prots = {} pephits.each_with_index do |pep,i| orig_pephits_prts[i] = pep.prots peps_new_prts = pep.prots.map do |prt| if new_prots.key? prt.reference already_exists = new_prots[prt.reference] else np = prt.dup np.peps = [] new_prots[np.reference] = np np end end pep.prots = peps_new_prts end end if kind == :update pephits.each do |pep| pep.prots.each do |prt| prt.peps = [] end end end prot_set = {} pephits.each do |pep| prts = pep.prots prts.each do |prt| prot_set[ prt.reference ] = prt end if (kind == :update || kind == :new) prts.each do |prt| prt.peps << pep end end end ## Reset the original protein hits if kind == :new pephits.each_with_index do |pep,i| pep.prots = orig_pephits_prts[i] end end prot_set.values end # takes a comma separated list or array and extends the last to create an # array of desired size def self.extend_args(arg, desired_size) arg_arr = arg if arg.is_a? String arg_arr = arg.split(',') end new_arr = [] last_arg = arg_arr[0] desired_size.times do |i| if arg_arr[i] new_arr[i] = arg_arr[i] last_arg = new_arr[i] else new_arr[i] = last_arg end end new_arr end # takes an array of proteins, each having peps # peptide grouping is done # by- # the protein with the most unique peptides ends up taking any # degenerate peptides, tie goes to one with most hits total, then the one # that had the top xcorr(s) (before removing any peptides).All other # proteins with identical peptides will lose those peptides. So, the rich # stay rich, and the poor get poorer. # returns an array of triplets where each is [prot, pep_hits, # uniq_aaseqs] (uniq_aaseqs is an array) where the protein contains >= 1 # peptide. The internal links (prot.peps and pep.prots) is NOT modified!! # update_prots == true will set each protein with the peptides found def self.occams_razor(array_of_prots, update_prots=false) peps_found = Set.new to_sort = array_of_prots.map do |prot| pps = prot.peps peps_by_uniq_aaseq = pps.hash_by(:aaseq) uniq_aaseqs = Set.new( pps.map {|pep| pep.aaseq } ) xcorrs = pps.map {|pep| pep.xcorr } silly = OpenStruct.new # 0 1 2 3 4 5 [uniq_aaseqs.size, pps.size, xcorrs.sort, prot, uniq_aaseqs, peps_by_uniq_aaseq] end prot_triplets = [] to_sort.sort.reverse.each do |ar| prot = ar[3] ## overlapping set: common = peps_found & ar[4] ## find the uniq ones in our little set of peptides: uniq = ar[4] - common pep_hits = [] if uniq.size != 0 ## add to the found list: peps_found.merge(uniq) uniq.each do |seq| pep_hits.push( *(ar[5][seq]) ) end prot_triplets << [prot, pep_hits, uniq.to_a] prot.peps = pep_hits if update_prots end end prot_triplets end # returns number of true positives (array) and the specified output (as # parallel array). Requires the classification method and a sorted array of # tp values and an array fp values. # (This is simply a wrapper around ROC#by_tps method!) def by_tps(classification_method, tp, fp) ROC.new.by_tps(classification_method, tp, fp) end # from the unique set of peptide hits, create a separate peptide hit for # each protein reference where that peptide only references that protein # e.g. pep.prots = [(a single protein)] def pep_prots pps = [] peps.each do |pep| pep.prots.map do |prt| pep.dup pep.prots = [prt] pps << pep end end pps end def self.prots?(ar) ar.first.is_a? SpecID::Prot end def self.peps?(ar) ar.first.is_a? SpecID::Pep end # for older stuff def classify_by_regex(items, regex, decoy_on_match=true, ties=:both) objects = case items when :prots prots when :peps peps end SpecID.classify_by_prot(objects, regex, decoy_on_match, ties) end # includes the peptide hit in both # returns (target, decoy) # (for peps) ties can be :both, true (target wins), false (decoy wins) # regardless of ties behavior, will partition out the proteins to be # appropriate for the peptide def self.classify_by_prot(items, regex, decoy_on_match=true, ties=:both) if items.size == 0 return [[],[]] elsif prots?(items) myproc = proc { |prt| if prt.reference =~ regex ; !decoy_on_match else ; decoy_on_match end } return classify(items, myproc) elsif peps?(items) match = [] ; nomatch = [] items.each do |pep| (match_prots, nomatch_prots) = pep.prots.partition do |prot| prot.reference =~ regex end if match_prots.size == 0 nomatch << pep elsif nomatch_prots.size == 0 match << pep else ## both have hits pep.prots = match_prots nomatch_pep = pep.dup nomatch_pep.prots = nomatch_prots # resolve ties case ties when true if decoy_on_match nomatch << pep else match << pep end when false if decoy_on_match match << pep else nomatch << pep end when :both match << pep nomatch << pep else ; raise ArgumentError end end end if decoy_on_match return [nomatch , match] else return [match, nomatch] end else raise ArgumentError, "arg1 is ar of objects descended from SpecID::Prot/Pep" end end # returns [tp, fp] based on the protein prefix for items where items = # (:prot|:peps) # this may result in a duplication of some peptides if they match both # normal and decoy proteins. In this case, the protein arrays are split, # too, so that each points only to its breed of protein. def classify_by_decoy_flag(items, flag, decoy_on_match=true, prefix=false) if prefix regex = /^#{Regexp.escape(flag)}/ else regex = /#{Regexp.escape(flag)}/ end classify_by_regex(items, regex, decoy_on_match) end # Returns (match, nomatch) # items = symbol (:prots, :peps) # Returns two arrays, those returning true from classify_item_by and those # returning false def classify(items, classify_item_by) its = send(items) f = []; t = [] its.each do |it| if classify_item_by.call(it) t << it else f << it end end [t,f] end # returns two arrays, true positives and false positives (determined by proc # classify_item_by) sorted by proc rank_item_by. Items will be ranked from # lowest to highest based on the return value of rank_item_by. items is a # symbol (:prots or :peps) def rank_and_classify(items, rank_item_by, classify_item_by) its = send(items) #its.each do |it| puts it.probability.to_s ; puts it.reference end doublets = its.collect do |item| [ rank_item_by.call(item), classify_item_by.call(item) ] end roc = ROC.new tp, fp = roc.doublets_to_separate(doublets) return tp, fp end # returns a proc for getting all probabilities so that an ascending sort # will put the best scores first def probability_proc if hi_prob_best get_prob_proc = proc {|prt| prt.probability * -1 } else get_prob_proc = proc {|prt| prt.probability } end get_prob_proc end def separate_by_prefix(items, fp_prefix) its = send(items) if items == :prots elsif items == :peps abort "not implemented yet" else abort "no other items recognized yet" end end # sorts the probabilities and then # calcs predicted number hits and precision for protein probabilities # (summing probabilities) # one_minus_ppv = SUM(1-probX)/#prots = what is commonly and mistakenly # called false positive rate # SUM(1-probX)/#prots def num_hits_and_ppv_for_protein_prophet_probabilities current_sum_one_minus_prob = 0.0 num_prots = [] ppv = [] prot_cnt = 0 probs = prots.map {|v| v.probability} sorted = probs.sort.reverse sorted.each do |prob| prot_cnt += 1 num_prots << prot_cnt current_sum_one_minus_prob += 1.0 - prob ppv << 1.0 - ( current_sum_one_minus_prob / prot_cnt ) # current_fpr_ratio = current_sum_one_minus_prob / prot_cnt end [num_prots, ppv] end # convenience method for the common task of determining precision for # proteins (with decoy proteins found by false_flag) # returns (num_hits, precision) def num_hits_and_ppv_for_prob(false_flag, prefix=false) if prefix regex = /^#{Regexp.escape(false_flag)}/ else regex = /#{Regexp.escape(false_flag)}/ end prob_proc = probability_proc myproc = proc { |prt| if prt.reference =~ regex ; false else ; true end } real_hits, decoy_hits = rank_and_classify(:prots, prob_proc, myproc) (num_hits, num_tps, precision) = DecoyROC.new.pred_and_tps_and_ppv(real_hits, decoy_hits) [num_hits, precision] end # # takes the existing spec_id object and marshals it into "file.msh" # # a new file will always look for a file.msh to load # def marshal(force=false) # if !(File.exist? @marshal_file)| force # File.open(@marshal_file, 'w') {|out| Marshal.dump(@obj, out) } # end # end # Returns 'bioworks' if bioworks xml, 'protproph' if Protein prophet # 'srf' if SRF file, 'srg' if search results group file. def self.file_type(file) if file =~ /\.srg$/ return 'srg' elsif file =~ /\.sqg$/ return 'sqg' end if IO.read(file, 7,438) == 'Enzyme:' return 'srf' end File.open(file) do |fh| lines = "" 8.times { lines << fh.readline } if lines =~ // return 'bioworks' elsif ((lines =~ / b.peptide_probability } end #puts min_peptides[0] # -> Bioworks::Pep min_sorted_peps = sorted_probabilities(min_peptides) #puts min_sorted_peps[0] # -> probability (Float) peptides_by_tens = [] hash.each do |k,v| arr = v.sort_by {|pep| pep.peptide_probability }.slice(0,10) peptides_by_tens.push(*arr) end top_10_sorted_peps = sorted_probabilities(peptides_by_tens) #puts top_10_sorted_peps[0] # -> float #puts "size: top_10_sorted_peps.size : #{top_10_sorted_peps.size}" #puts "size: min_sorted_peps.size : #{min_sorted_peps.size}" #p top_10_sorted_peps #p min_sorted_peps return min_sorted_peps, top_10_sorted_peps end # Returns a list of sorted probabilities given the array of peptides def sorted_probabilities(peptides) #puts peptides.first.peptide_probability.class #peptides.each do |pep| print pep.class.to_s + " " end #puts peptides.first.is_a? Array #abort "DFHDFD" peptides.collect{|pep| pep.probability }.sort end # returns a sorted lists of probabilities based on all pepprots (a peptide # associated with a protein) def pep_probs_by_pep_prots sorted_probabilities(peps) end ########################################################################## # WARNING! These might be dangerous to your health if there are multiple # files collected in your bioworks file ########################################################################## # (prob_list_by_min, prob_list_by_best10) # returns 2 sorted lists of probabilities based on: # 1. best peptide hit # 2. top 10 peptide hits # on a per scan basis # NOTE: you may want to hash on base_name first! def pep_probs_by_scan hash = peps.hash_by(:first_scan, :last_scan) return min_and_best10(hash) end #(prob_list_by_min, prob_list_by_best10) # same as pep_probs_by_scan but per charge state # NOTE: you may want to hash on base_name first! def pep_probs_by_scan_charge hash = peps.hash_by(:first_scan, :last_scan, :charge) return min_and_best10(hash) end # (prob_list_by_min) # hashes on seq-charge and returns the sorted list of probabilities of top # hit per seq-charge # NOTE: you may want to hash on base_name first! def pep_probs_by_seq_charge hash = peps.hash_by(:sequence, :charge) min_peptides = hash.collect do |k,v| v.min {|a,b| a.peptide_probability <=> b.peptide_probability } end sorted_probabilities(min_peptides) end ########################################################################## # USE these if you have multiple files in your bioworks.xml file ########################################################################## # (prob_list_by_min, prob_list_by_best10) # returns 2 sorted lists of probabilities based on: # 1. best peptide hit # 2. top 10 peptide hits # on a per scan basis # NOTE: you may want to hash on base_name first! def pep_probs_by_bn_scan hash = peps.hash_by(:base_name, :first_scan, :last_scan) return min_and_best10(hash) end #(prob_list_by_min, prob_list_by_best10) # same as pep_probs_by_scan but per charge state # NOTE: you may want to hash on base_name first! def pep_probs_by_bn_scan_charge hash = peps.hash_by(:base_name, :first_scan, :last_scan, :charge) return min_and_best10(hash) end # (prob_list_by_min) # hashes on seq-charge and returns the sorted list of probabilities of top # hit per seq-charge # NOTE: you may want to hash on base_name first! def pep_probs_by_bn_seq_charge hash = peps.hash_by(:base_name, :sequence, :charge) min_peptides = hash.collect do |k,v| v.min {|a,b| a.peptide_probability <=> b.peptide_probability } end sorted_probabilities(min_peptides) end end # A Generic spectraID protein module SpecID::Prot include ProteinReferenceable # probability is always a float! attr_accessor :probability, :reference, :peps def <=> (other) self.reference <=> other.reference end def inspect pep_string = if peps ", @peps(#)=#{peps.size}" end "<#{self.class} @probability=#{probability}, @reference=#{reference}#{pep_string}>" end end module SpecID::Pep Non_standard_amino_acid_char_re = /[^A-Z\.\-]/ attr_accessor :prots attr_accessor :probability # full sequence: (..) with '-' for no first # or last. attr_accessor :sequence # the basic amino acid sequence (no leading or trailing '.' or amino acids) # should not contain any special symbols, etc. attr_accessor :aaseq attr_accessor :charge # removes nonstandard chars with Non_standard_amino_acid_char_re # preserves A-Z and '.' and '-' def self.remove_non_amino_acids(sequence) sequence.gsub(Non_standard_amino_acid_char_re, '') end # remove_non_amino_acids && split_sequence def self.prepare_sequence(val) nv = remove_non_amino_acids(val) split_sequence(nv) end def <=>(other) aaseq <=> other.aaseq end # Returns prev, peptide, next from sequence. Parse errors return # nil,nil,nil # R.PEPTIDE.A # -> R, PEPTIDE, A # R.PEPTIDE.- # -> R, PEPTIDE, - # PEPTIDE.A # -> -, PEPTIDE, A # A.PEPTIDE # -> A, PEPTIDE, - # PEPTIDE # -> nil,nil,nil def self.split_sequence(val) peptide_prev_aa = ""; peptide = ""; peptide_next_aa = "" pieces = val.split('.') case pieces.size when 3 peptide_prev_aa, peptide, peptide_next_aa = *pieces when 2 if pieces[0].size > 1 ## N termini peptide_prev_aa, peptide, peptide_next_aa = '-', pieces[0], pieces[1] else ## C termini peptide_prev_aa, peptide, peptide_next_aa = pieces[0], pieces[1], '-' end when 1 ## this must be a parse error! peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil when 0 peptide_prev_aa, peptide, peptide_next_aa = nil,nil,nil end return peptide_prev_aa, peptide, peptide_next_aa end ## def self.sequence_to_aaseq(sequence) after_removed = remove_non_amino_acids(sequence) pieces = after_removed.split('.') case pieces.size when 3 pieces[1] when 2 if pieces[0].size > 1 ## N termini pieces[0] else ## C termini pieces[1] end when 1 ## this must be a parse error! pieces[0] ## which is the peptide itself else abort "bad peptide sequence: #{sequence}" end end # This will rapidly determine the list of proteins for which given # peptides belong. It is meant to be low level and fast (eventually), # so it asks for the data in a format amenable to this. # returns a mirror array where each entry is an array of Fasta::Prot # objects where each protein contains the sequence def self.protein_groups_by_sequence(peptide_strings_list, fasta_obj) prots = fasta_obj.prots prot_seqs = prots.map do |prot| prot.aaseq end groups = peptide_strings_list.map do |pep_seq| prot_index = 0 protein_group = [] prot_seqs.each do |prot_seq| if prot_seq.include? pep_seq protein_group << prots[prot_index] end prot_index += 1 end protein_group end groups end # units can be :mmu, :amu, :ppm def mass_accuracy(pep, unit=:ppm, mono=true) # 10^6 * deltam accuracy/ m[measured] # i.e., theoretical mass 1000, measured 999.9: 100ppm # http://www.waters.com/WatersDivision/ContentD.asp?watersit=EGOO-66LRQD # pep.mass is the theoretical M+H of the peptide # this assumes that the deltacn value we're being told is correct, but I # have my suspicions (since the value is not accurate...) ######## TO COMPLETE (and add to spec_id..?) case unit when :ppm when :amu when :mmu end end # calls the method associated with each key and returns the value def values_at(*args) args.map do |arg| send(arg) end end def inspect prot_string = if prots ", @prots(#)=#{prots.size}" end "<#{self.class} @probability=#{probability}, @sequence=#{sequence}, @aaseq=#{aaseq}, @charge=#{charge}#{prot_string}>" end end class SpecID::GenericProt include SpecID::Prot end class SpecID::GenericPep include SpecID::Pep end