require 'rsruby' require 'vec' require 'vec/r' require 'enumerator' module PiZero class << self # takes a sorted array of p-values (floats between 0 and 1 inclusive) # returns [thresholds_ar, instantaneous pi_0 calculations_ar] # evenly incremented values will be used by default: # :start=>0.0, :stop=>0.9, :step=>0.01 def pi_zero_hats(sorted_pvals, args={}) defaults = {:start => 0.0, :stop=>0.9, :step=>0.05 } margs = defaults.merge( args ) (start, stop, step) = margs.values_at(:start, :stop, :step) # From Storey et al. PNAS 2003: lambdas = [] # lambda pi_zeros = [] # pi_0 total = sorted_pvals.size # m # totally inefficient implementation (with correct logic): # TODO: implement this efficiently start.step(stop, step) do |lam| lambdas << lam (greater, less) = sorted_pvals.partition {|pval| pval > lam } pi_zeros.push( greater.size.to_f / ( total * (1.0 - lam) ) ) end [lambdas, pi_zeros] end =begin def plateau_height_with_gsl(x, y) require 'gsl' x_deltas = (0...(x.size-1)).to_a.map do |i| x[i+1] - x[i] end y_deltas = (0...(y.size-1)).to_a.map do |i| y[i+1] - y[i] end new_xs = x.dup new_ys = y.dup x_deltas.reverse.each do |delt| new_xs.push( new_xs.last + delt ) end y_cnt = y.size y_deltas.reverse.each do |delt| y_cnt -= 1 new_ys.push( y[y_cnt] - delt ) end x_vec = GSL::Vector.alloc(new_xs) y_vec = GSL::Vector.alloc(new_ys) coef, cov, chisq, status = GSL::Poly.fit(x_vec,y_vec, 3) coef.eval(x.last) #x2 = GSL::Vector::linspace(0,2.4,20) #graph([x_vec,y_vec], [x2, coef.eval(x2)], "-C -g 3 -S 4") end =end # expecting x and y to make a scatter plot descending to a plateau on the # right side (which is assumed to be of increasing noise as it goes to the # right) # returns the height of the plateau at the right edge # # * # * # * # ** # ** *** * * # ***** **** *** def plateau_height(x, y) r = RSRuby.instance answ = r.smooth_spline(x,y, :df => 3) ## to plot it! r.plot(x,y, :ylab=>"pi_zeros or frit") r.lines(answ['x'], answ['y']) r.points(answ['x'], answ['y']) sleep(4) answ['y'].last end def plateau_exponential(x,y) require 'gsl' xvec = GSL::Vector.alloc(x) yvec = GSL::Vector.alloc(y) a2, b2, = GSL::Fit.linear(xvec, GSL::Sf::log(yvec)) x2 = GSL::Vector.linspace(0, 1.2, 20) exp_a = GSL::Sf::exp(a2) out_y = exp_a*GSL::Sf::exp(b2*x2) raise NotImplementedError, "need to grab out the answer" #graph([xvec, yvec], [x2, exp_a*GSL::Sf::exp(b2*x2)], "-C -g 3 -S 4") end # returns a conservative (but close) estimate of pi_0 given p-values # following Storey et al. 2003, PNAS. def pi_zero(pvals) sorted_pvals = pvals.sort plateau_height( *(pi_zero_hats(sorted_pvals)) ) end # returns an array where the left values have been filled in using the # similar values on the right side of the distribution. These values are # pushed onto the end of the array in no guaranteed order. # extends a distribution on the left side where it is missing since # xcorr values <= 0.0 are not reported # ** # * * # * * # * # * # * # Grabs the right tail from above and inverts it to the left side (less # than zero), creating a more full distribution. raises an ArgumentError # if values_chopped_at_zero.size == 0 # this method would be more robust with some smoothing. # Method currently only meant for large amounts of data. # input data does not need to be sorted def extend_distribution_left_of_zero(values_chopped_at_zero) sz = values_chopped_at_zero.size raise ArgumentError, "array.size must be > 0" if sz == 0 num_bins = (Math.log10(sz) * 100).round vec = VecD.new(values_chopped_at_zero) (bins, freqs) = vec.histogram(num_bins) start_i = 0 freqs.each_with_index do |f,i| if f.is_a?(Numeric) && f > 0 start_i = i break end end match_it = freqs[start_i] # get the index of the first frequency value less than the zero frequency index_to_chop_at = -1 rev_freqs = freqs.reverse rev_freqs.each_with_index do |freq,rev_i| if match_it - rev_freqs[rev_i+1] <= 0 index_to_chop_at = freqs.size - 1 - rev_i break end end cut_point = bins[index_to_chop_at] values_chopped_at_zero + values_chopped_at_zero.select {|v| v >= cut_point }.map {|v| cut_point - v } end # assumes the decoy_vals follows a normal distribution def p_values(target_vals, decoy_vals) (mean, stdev) = VecD.new(decoy_vals).sample_stats r = RSRuby.instance vec = VecD.new(target_vals) right_tailed = true vec.p_value_normal(mean, stdev, right_tailed) end def p_values_for_sequest(target_hits, decoy_hits) dh_vals = decoy_hits.map {|v| v.xcorr } new_decoy_vals = PiZero.extend_distribution_left_of_zero(dh_vals) #File.open("target.yml", 'w') {|out| out.puts new_decoy_vals.join(" ") } #File.open("decoy.yml", 'w') {|out| out.puts target_hits.map {|v| v.xcorr }.join(" ") } #abort 'checking' p_values(target_hits.map {|v| v.xcorr}, new_decoy_vals ) end #### NEED TO VERIFY if this is PIT or PI_ZERO! =begin # takes a list of booleans with true being a target hit and false being a # decoy hit and returns the pi_zero using the smooth method # Should be ordered from best to worst (i.e., one expects more true values # at the beginning of the list) def pi_zero_from_booleans(booleans) targets = 0 decoys = 0 xs = [] ys = [] booleans.reverse.each_with_index do |v,index| if v targets += 1 else decoys += 1 end if decoys > 0 xs << index ys << targets.to_f / decoys end end ys.reverse! plateau_height(xs, ys) end =end # returns fraction of incorrect target hits (frit) (this is the percent # incorrect targets [PIT] expressed as a fraction rather than percent) # takes two parallel arrays consisting of the total number of hits (this # will typically be the total # target hits) at that point and the # precision (ranging from: [0,1]) (typically determined by counting the # number of decoy hits). Expects the number of total hits to be # monotonically increasing and the precision to roughly start high and # decrease as more hits (of lesser quality) are added. def frit_from_precision(total_num_hits_ar, precision_ar) instant_pi_zeros = [] total_num_hits_ar.reverse.zip(precision_ar.reverse).each_cons(2) do |dp1, dp0| (x1, y1) = dp1 (x0, y0) = dp0 instant_pi_zeros << ((x1 * (1.0 - y1)) - (x0 * (1.0 - y0) )) / (x1 - x0) end instant_pi_zeros.reverse! plateau_height(total_num_hits_ar[1..-1], instant_pi_zeros) end # Takes an array of doublets ([[int, int], [int, int]...]) where the first # value is the number of target hits and the second is the number of decoy # hits. Expects that best hits are at the beginning of the list. Assumes # that each sum is a subset of the following group (shown as actual hits # rather than number of hits): # # [[target, target, target, decoy], [target, target, target, decoy, # target, decoy, target], [target, target, target, decoy, target, # decoy, target, decoy, target, target]] # # This assumption may be relaxed somewhat and should still give good # results. def frit_from_groups(array_of_doublets) frits = [] array_of_doublets.reverse.each_cons(2) do |two_doublets| bigger, smaller = two_doublets num_targets = bigger[0] - smaller[0] num_decoy = bigger[1] - smaller[1] num_targets = 0 if num_targets < 0 num_decoy = 0 if num_targets < 0 if num_decoy > 0 frits << (num_targets.to_f / num_decoy) end end frits.reverse! xs = (0...(frits.size)).to_a plateau_height(xs, frits) end end end