pi_zero.rb in mspire-0.4.7

- old
+ new

@@ -1,7 +1,6 @@
 require 'rsruby'
-require 'gsl'
 require 'vec'
 require 'vec/r'
 require 'enumerator'
 
 
@@ -19,19 +18,50 @@
       # From Storey et al. PNAS 2003:
       lambdas = []                 # lambda
       pi_zeros = []                # pi_0
       total = sorted_pvals.size  # m 
 
-      # totally retarded implementation with correct logic:
+      # totally inefficient implementation (with correct logic):
+      # TODO: implement this efficiently
       start.step(stop, step) do |lam|
         lambdas << lam
         (greater, less) = sorted_pvals.partition {|pval| pval > lam }
         pi_zeros.push( greater.size.to_f / ( total * (1.0 - lam) ) )
       end
       [lambdas, pi_zeros]
     end
 
+=begin
+    def plateau_height_with_gsl(x, y)
+      require 'gsl'
+      x_deltas = (0...(x.size-1)).to_a.map do |i|
+        x[i+1] - x[i]
+      end
+      y_deltas = (0...(y.size-1)).to_a.map do |i|
+        y[i+1] - y[i]
+      end
+      new_xs = x.dup
+      new_ys = y.dup
+      x_deltas.reverse.each do |delt|
+        new_xs.push( new_xs.last + delt )
+      end
+
+      y_cnt = y.size
+      y_deltas.reverse.each do |delt|
+        y_cnt -= 1
+        new_ys.push( y[y_cnt] - delt )
+      end
+
+      x_vec = GSL::Vector.alloc(new_xs)
+      y_vec = GSL::Vector.alloc(new_ys)
+      coef, cov, chisq, status = GSL::Poly.fit(x_vec,y_vec, 3)
+      coef.eval(x.last)
+      #x2 = GSL::Vector::linspace(0,2.4,20)
+      #graph([x_vec,y_vec], [x2, coef.eval(x2)], "-C -g 3 -S 4")
+    end
+=end
+
     # expecting x and y to make a scatter plot descending to a plateau on the
     # right side (which is assumed to be of increasing noise as it goes to the
     # right)
     # returns the height of the plateau at the right edge
     #
@@ -40,50 +70,23 @@
     #     *
     #       **
     #          ** ***         *    *
     #                    ***** **** ***
     def plateau_height(x, y)
-=begin
-    require 'gsl'
-    x_deltas = (0...(x.size-1)).to_a.map do |i|
-      x[i+1] - x[i]
-    end
-    y_deltas = (0...(y.size-1)).to_a.map do |i|
-      y[i+1] - y[i]
-    end
-    new_xs = x.dup
-    new_ys = y.dup
-    x_deltas.reverse.each do |delt|
-      new_xs.push( new_xs.last + delt )
-    end
-
-    y_cnt = y.size
-    y_deltas.reverse.each do |delt|
-      y_cnt -= 1
-      new_ys.push( y[y_cnt] - delt )
-    end
-
-    x_vec = GSL::Vector.alloc(new_xs)
-    y_vec = GSL::Vector.alloc(new_ys)
-    coef, cov, chisq, status = GSL::Poly.fit(x_vec,y_vec, 3)
-    coef.eval(x.last)
-    #x2 = GSL::Vector::linspace(0,2.4,20)
-    #graph([x_vec,y_vec], [x2, coef.eval(x2)], "-C -g 3 -S 4")
-=end
-
       r = RSRuby.instance
       answ = r.smooth_spline(x,y, :df => 3)
       ## to plot it!
-      #r.plot(x,y, :ylab=>"instantaneous pi_zeros")
-      #r.lines(answ['x'], answ['y'])
-      #r.points(answ['x'], answ['y'])
-      #sleep(8)
+      r.plot(x,y, :ylab=>"pi_zeros or frit")
+      r.lines(answ['x'], answ['y'])
+      r.points(answ['x'], answ['y'])
+      sleep(4)
 
       answ['y'].last
     end
 
     def plateau_exponential(x,y)
+      require 'gsl'
       xvec = GSL::Vector.alloc(x)
       yvec = GSL::Vector.alloc(y)
       a2, b2, = GSL::Fit.linear(xvec, GSL::Sf::log(yvec))
       x2 = GSL::Vector.linspace(0, 1.2, 20)
       exp_a = GSL::Sf::exp(a2)
@@ -91,13 +94,14 @@
       raise NotImplementedError, "need to grab out the answer"
       #graph([xvec, yvec], [x2, exp_a*GSL::Sf::exp(b2*x2)], "-C -g 3 -S 4")
 
     end
 
-    # returns a conservative (but close) estimate of pi_0 given sorted p-values
+    # returns a conservative (but close) estimate of pi_0 given p-values
     # following Storey et al. 2003, PNAS.
-    def pi_zero(sorted_pvals)
+    def pi_zero(pvals)
+      sorted_pvals = pvals.sort
       plateau_height( *(pi_zero_hats(sorted_pvals)) )
     end
 
     # returns an array where the left values have been filled in using the
     # similar values on the right side of the distribution.  These values are
@@ -159,10 +163,12 @@
       #File.open("decoy.yml", 'w') {|out| out.puts target_hits.map {|v| v.xcorr }.join(" ") }
       #abort 'checking'
       p_values(target_hits.map {|v| v.xcorr}, new_decoy_vals )
     end
 
+#### NEED TO VERIFY if this is PIT or PI_ZERO!
+=begin
     # takes a list of booleans with true being a target hit and false being a
     # decoy hit and returns the pi_zero using the smooth method
     # Should be ordered from best to worst (i.e., one expects more true values
     # at the beginning of the list)
     def pi_zero_from_booleans(booleans)
@@ -182,46 +188,57 @@
         end
       end
       ys.reverse!
       plateau_height(xs, ys)
     end
+=end
 
+    # returns fraction of incorrect target hits (frit) (this is the percent
+    # incorrect targets [PIT] expressed as a fraction rather than percent)
+    # takes two parallel arrays consisting of the total number of hits (this
+    # will typically be the total # target hits) at that point and the
+    # precision (ranging from: [0,1]) (typically determined by counting the
+    # number of decoy hits).  Expects the number of total hits to be
+    # monotonically increasing and the precision to roughly start high and
+    # decrease as more hits (of lesser quality) are added.
+    def frit_from_precision(total_num_hits_ar, precision_ar)
+      instant_pi_zeros = []
+      total_num_hits_ar.reverse.zip(precision_ar.reverse).each_cons(2) do |dp1, dp0|
+        (x1, y1) = dp1
+        (x0, y0) = dp0
+        instant_pi_zeros << ((x1 * (1.0 - y1)) - (x0 * (1.0 - y0) )) / (x1 - x0)
+      end
+      instant_pi_zeros.reverse!
+      plateau_height(total_num_hits_ar[1..-1], instant_pi_zeros)
+    end
+
     # Takes an array of doublets ([[int, int], [int, int]...]) where the first
     # value is the number of target hits and the second is the number of decoy
     # hits.  Expects that best hits are at the beginning of the list.  Assumes
-    # that each sum is a subset
-    # of the following group (shown as actual hits rather than number of hits):
+    # that each sum is a subset of the following group (shown as actual hits
+    # rather than number of hits):
     #
     #    [[target, target, target, decoy], [target, target, target, decoy,
     #    target, decoy, target], [target, target, target, decoy, target,
     #    decoy, target, decoy, target, target]]
     #
     # This assumption may be relaxed somewhat and should still give good
     # results.
-    def pi_zero_from_groups(array_of_doublets)
-      pi_zeros = []
+    def frit_from_groups(array_of_doublets)
+      frits = []
       array_of_doublets.reverse.each_cons(2) do |two_doublets|
         bigger, smaller = two_doublets
-        bigger[0] = bigger[0] - smaller[0]
-        bigger[1] = bigger[1] - smaller[1]
-        bigger.map! {|v| v < 0 ? 0 : v }
-        if bigger[1] > 0
-          pi_zeros << (bigger[0].to_f / bigger[1])
+        num_targets = bigger[0] - smaller[0] 
+        num_decoy = bigger[1] - smaller[1]
+        num_targets = 0 if num_targets < 0
+        num_decoy = 0 if num_targets < 0
+        if num_decoy > 0
+          frits << (num_targets.to_f / num_decoy)
         end
       end
-      pi_zeros.reverse!
-      xs = (0...(pi_zeros.size)).to_a
-      plateau_height(xs, pi_zeros)
+      frits.reverse!
+      xs = (0...(frits.size)).to_a
+      plateau_height(xs, frits)
     end
 
   end
-
-
-end
-
-if $0 == __FILE__
-  #xcorrs = IO.readlines("/home/jtprince/xcorr_hist/all_xcorrs.yada").first.chomp.split(/\s+/).map {|v| v.to_f }
-  #PiZero.p_values_for_sequest(
-  #File.open("newtail.yada", 'w') {|out| out.puts new_dist.join(" ") }
-  
-
 end