require 'validator' require 'set' require 'group_by' require 'shuffle' # calculates protein hit precision based on peptide precision class Validator::ProtFromPep < Validator # calculate protein precision based on the number of false peptides # returns the precision based on the number of proteins *completely false* # calculates the worst precision by assuming that proteins with the fewest # peptides are all false (before prots with more pephits) # note that this approaches the worst, but is not guaranteed to be worst # unless each pephit maps to a single protein hit. # [worst, normal_mean, normal_stddev] # options # :num_its_normal => Integer, # num iterations for normal (d: 10) # :num_its_worstcase => Integer, # num iterations for worstcase (d: 10) # def prothit_precision(peps, num_false_pephits, opts={}) opts[:num_its_normal] ||= 10 opts[:num_its_worstcase] ||= 10 # get the num_peps_per_protein array worst = worstcase_prothit_precision(peps, num_false_pephits, :num_its => opts[:num_its_worstcase]) (normal_mean, normal_stdev) = normal_prothit_precision( peps, num_false_pephits, :num_its => opts[:num_its_normal]) [worst, normal_mean, normal_stdev] end # returns an array of the number of peptide hits in each protein def num_peps_per_protein(peps) num_pephits_by_prot = Hash.new { 0 } peps.each do |pep| pep.prots.each do |prot| num_pephits_by_prot[prot.reference] += 1 end end num_pephits_by_prot.values end # returns the worstcase precision. This assumes that every small protein # with the fewest peptide hits is completely 'filled' with incorrect hits in # preference to any higher hit protein. # Where each peptide hit maps to a single protein, this is guaranteed to be # worst-case. If this doesn't hold, there are some extreme cases where a # poorer precision could be generated, but this is still probably fairly # close. Thus, a slightly different answer may be generated each time. # ...variation is produced by shuffling the order of the proteins from which # peptides are removed within groups of proteins having the same number of # peptides. # This method does NOT require that the prothits be updated to reflect only # those pephits being passed in. # # validator.worstcase_prothit_precision(peps, 14, 1) # => 0.232111 # # options: # :num_its => Integer (default: 10) number of times to run (finds minimum) # :one_prot_per_pep => true | *false assumes each peptide maps to a # single protein def worstcase_prothit_precision(peps, num_false_pephits, opts = {}) num_its = opts[:num_its] || 10 one_prot_per_pep = opts[:one_prot_per_pep] # nil or false still == false one_prot_per_pep = false if one_prot_per_pep == nil ############################################## # The END Cases (can be dealt with quickly) ############################################## if num_false_pephits == 0 return 1.0 elsif num_false_pephits >= peps.size return 0.0 end if one_prot_per_pep num_peps_per_prot = num_peps_per_protein(peps) return worstcase_prothit_precision_by_numbers(num_peps_per_prot, num_false_pephits) else ##################################### # HERE's the basic plan!! ##################################### # order the proteins by num peptides # create a set of peptides # delete peptides from the proteins off the set o' peptides (ensuring that # a deleted one cannot be deleted twice) ##################################### # order the proteins by num peptides # and create a hash that holds the peptides (given here) in those proteins prots_to_peps_here = Hash.new {|h,k| h[k] = [] } prots_to_peps_size = Hash.new { 0 } pep_ids = [] pep_ids_to_prot_ids = Hash.new {|h,k| h[k] = [] } peps.each do |pep| #puts pep.prots.size pep.prots.each do |prot| #p prot.reference prots_to_peps_here[prot] << pep prots_to_peps_size[prot] += 1 pep_ids << pep pep_ids_to_prot_ids[pep] << prot end end prot_ids_listed_by_peps_size = prots_to_peps_size.keys tot_num_prots = prot_ids_listed_by_peps_size.size sample = Array.new(num_its) srand( 777 ) precision_sample = (0...num_its).to_a.map do num_false_pephits_counter = num_false_pephits # create a set of peptides pep_ids_set = pep_ids.to_set # shuffle the proteins within size groups finished = false prot_ids_listed_by_peps_size.group_by {|prot_id| prots_to_peps_size[prot_id] }.sort.each do |k,group_of_proteins_with_same_pep_size| group_of_proteins_with_same_pep_size.shuffle! group_of_proteins_with_same_pep_size.each do |prot_id| prots_to_peps_here[prot_id].each do |pep_id| if pep_ids_set.include?(pep_id) # if 1 # remove a peptide pep_ids_set.delete(pep_id) num_false_pephits_counter -= 1 if num_false_pephits_counter == 0 # if 2 finished = true end # close if 2 end # close if 1 break if finished # each pep end break if finished # each prot end break if finished # each group_of_proteins_with_same_pep_size end # each group_of_proteins_with_same_pep_size ## Figure out the number of proteins left! proteins_still_around = pep_ids_set.inject(Set.new) {|protset,pep_id| protset.merge( pep_ids_to_prot_ids[pep_id]) } proteins_still_around.size.to_f / tot_num_prots end # a sample return precision_sample.min end # FINAL else end # returns the precision of the worst possible outcome def worstcase_prothit_precision_by_numbers(num_peps_per_prot, num_false_pephits) completely_false_proteins = 0 num_peps_per_prot.sort.each do |num_peps| num_false_pephits -= num_peps if num_false_pephits >= 0 completely_false_proteins += 1 end if num_false_pephits <= 0 break end end num_prots = num_peps_per_prot.size (num_prots - completely_false_proteins).to_f/num_prots end # normal as in a standard normal distribution of peptide hits per protein # they are distributed randomly and the precision is assumed to take on a # standard normal distribution. # num_peps_per_protein is an array of the number of peptides per protein hit # (these are the true hits) # assumes that the number follows a gaussian distribution (binomial # distributions tend toward gaussians, I believe, at large N) # returns [mean_precision, stdev_precision] # options: # :num_its => Integer (default: 10) # # if num_iterations is set at 1, then only the precision will be returned # though random, the same seed is always used to start this process, meaning # that the same results will be produced on consecutive attempts. # # validator.normal_prothit_precision(peps, 13, :num_its => 1) # -> 0.95433 # validator.normal_prothit_precision(peps, 13, :num_its => 2) # -> [0.92002, 1.2223] def normal_prothit_precision( peps, num_false_pephits, opts={}) num_iterations = opts[:num_its] || 10 srand( 38272 ) ############################################## # The END Cases (can be dealt with quickly) ############################################## if num_false_pephits == 0 if num_iterations == 1 return 1.0 else return [1.0, 0.0] end elsif num_false_pephits >= peps.size if num_iterations == 1 return 0.0 else return [0.0, 0.0] end end ############################################## # Everything else: ############################################## sample = Array.new(num_iterations) base_indices = (0...(peps.size)).to_a ### ACUTALLY, I THINK WE WANT TO CREATE AND MERGE!!!! # This would mean that only a single hit would validate the protein # if we are subtracting, then we lose the protein on a single peptide!!!! prot_id_set = peps.inject(Set.new) do |prtset, pep| prtset.merge( pep.prots.map {|prot| prot } ) end tot_num_prots = prot_id_set.size # could also merge off the good indices # TODO: we should optimize based on how many false pephits given... precision_sample = (0...num_iterations).to_a.map do shuffled_indices = base_indices.map shuffled_indices.shuffle! good_indices = shuffled_indices[num_false_pephits..-1] still_remaining = Set.new peps.values_at(*good_indices).each do |pep| still_remaining.merge(pep.prots.map {|prot| prot }) end still_remaining.size.to_f / tot_num_prots end if num_iterations == 1 precision_sample.shift else #puts "PRECISION GROUP: " #p precision_sample sample_stats(precision_sample) end end end