counting.rb in macroape-3.3.8

- old
+ new

@@ -2,31 +2,52 @@
 
 module Bioinform
   class PWM
     # Sets or gets the size limit of the calculation hash. This guards against excessive CPU usage caused by inappropriate data.
     make_parameters :max_hash_size
-    
+
     # Returns the threshold that #thresholds yields for the single given pvalue.
     # The +return+ inside the block exits this method as soon as the first
     # triple is yielded.
     def threshold(pvalue)
       thresholds(pvalue) do |_requested, cutoff, _actual_pvalue|
         return cutoff
       end
     end
     # Returns a two-element array [threshold, real_pvalue] taken from the first
     # triple that #thresholds yields for the given pvalue.
     def threshold_and_real_pvalue(pvalue)
       thresholds(pvalue) do |_requested, cutoff, actual_pvalue|
         return [cutoff, actual_pvalue]
       end
     end
+    def weak_threshold(pvalue) # same shape as #threshold, but reads from weak_thresholds
+      weak_thresholds(pvalue){|_, thresh, _| return thresh } # return inside the block exits the method with the first yielded threshold
+    end
+    def weak_threshold_and_real_pvalue(pvalue) # same shape as #threshold_and_real_pvalue, but reads from weak_thresholds
+      weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv } # returns the pair [thresh, real_pv] from the first yielded triple
+    end
 
     # Yields (pvalue, threshold, real_pvalue) once per entry produced by
     # thresholds_by_pvalues(*pvalues).
     #
     # Each entry pairs a pvalue with two objects answering #begin/#end: a score
     # interval and a count interval. The threshold is placed 10% of the way from
     # the interval's begin towards its end; the real pvalue is the upper count
     # divided by vocabulary_volume.
     def thresholds(*pvalues)
       thresholds_by_pvalues(*pvalues).each do |requested_pvalue, (score_range, count_range)|
         lower, upper = score_range.begin, score_range.end
         cutoff = lower + 0.1 * (upper - lower)
         yield requested_pvalue, cutoff, count_range.end.to_f / vocabulary_volume
       end
     end
 
+    # A "weak" threshold has a real pvalue that is not less than the requested pvalue, whereas a usual threshold has a real pvalue that is not greater.
+    def weak_thresholds(*pvalues) # yields (pvalue, threshold, real_pvalue) once per entry of thresholds_by_pvalues
+      thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
+        threshold = thresholds.begin.to_f # takes the begin of the score interval as-is (no 10% offset as in #thresholds)
+        real_pvalue = counts.begin.to_f / vocabulary_volume # uses counts.begin, where #thresholds uses counts.end
+        yield pvalue, threshold, real_pvalue
+      end
+    end
+
+
     # Returns a distribution (iterated as score/count pairs) whose counts sum to
     # at least max_pvalue * vocabulary_volume. The distribution is recomputed
     # with a progressively larger pvalue until that total is reached.
     def count_distribution_under_pvalue(max_pvalue)
       cnt_distribution = {}
       # target total; computed once, so the doubling of max_pvalue below does not raise it
       look_for_count = max_pvalue * vocabulary_volume
       until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
-        cnt_distribution = count_distribution_after_threshold(threshold_gauss_estimation(max_pvalue))
+        begin
+          approximate_threshold = threshold_gauss_estimation(max_pvalue)
+        rescue # NOTE(review): bare rescue catches every StandardError raised by the estimation - confirm that is intended
+          approximate_threshold = worst_score # fall back to worst_score when the estimation raises
+        end
+        cnt_distribution = count_distribution_after_threshold(approximate_threshold)
         max_pvalue *=2 # if the estimation covered too few words, lower the threshold estimate by doubling the pvalue
       end
 
       cnt_distribution
     end
@@ -81,15 +102,20 @@
       new_scores
     end
 
     # For each given threshold, sums the counts of all scores that are >= that
     # threshold. The old version returned an Array of sums in argument order;
     # the new version returns a Hash keyed by threshold.
     def counts_by_thresholds(*thresholds)
       # one distribution, computed from the smallest threshold, is reused for every threshold
       scores = count_distribution_after_threshold(thresholds.min)
-      thresholds.map{ |threshold|
-        scores.inject(0.0){|sum,(score,count)|  (score >= threshold) ? sum + count : sum}
+      thresholds.inject({}){ |hsh, threshold|
+        hsh[threshold] = scores.inject(0.0){|sum,(score,count)|  (score >= threshold) ? sum + count : sum}
+        hsh # inject needs the accumulator hash as the block's value
       }
     end
 
+    def count_by_threshold(threshold) # single-threshold wrapper around counts_by_thresholds
+      counts_by_thresholds(threshold)[threshold] # picks this threshold's entry out of the returned hash
+    end
+
     # Returns the count obtained for the given threshold divided by
     # vocabulary_volume.
     def pvalue_by_threshold(threshold)
-      counts_by_thresholds(threshold).first / vocabulary_volume
+      count_by_threshold(threshold) / vocabulary_volume
     end
   end
 end
\ No newline at end of file