lib/macroape/counting.rb in macroape-3.3.7 vs lib/macroape/counting.rb in macroape-3.3.8

- old
+ new

@@ -2,31 +2,52 @@
 module Bioinform
   class PWM
     # sets or gets limit size of calculation hash. It's a defence against overuse CPU resources by non-appropriate data
     make_parameters :max_hash_size
-
+
     def threshold(pvalue)
       thresholds(pvalue){|_, thresh, _| return thresh }
     end
     def threshold_and_real_pvalue(pvalue)
       thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
     end
+    def weak_threshold(pvalue)
+      weak_thresholds(pvalue){|_, thresh, _| return thresh }
+    end
+    def weak_threshold_and_real_pvalue(pvalue)
+      weak_thresholds(pvalue){|_, thresh, real_pv| return thresh, real_pv }
+    end
     def thresholds(*pvalues)
       thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
         threshold = thresholds.begin + 0.1 * (thresholds.end - thresholds.begin)
         real_pvalue = counts.end.to_f / vocabulary_volume
         yield pvalue, threshold, real_pvalue
       end
     end
+    # "weak" means that threshold has real pvalue not less than given pvalue, while usual threshold not greater
+    def weak_thresholds(*pvalues)
+      thresholds_by_pvalues(*pvalues).each do |pvalue,(thresholds, counts)|
+        threshold = thresholds.begin.to_f
+        real_pvalue = counts.begin.to_f / vocabulary_volume
+        yield pvalue, threshold, real_pvalue
+      end
+    end
+
+
     def count_distribution_under_pvalue(max_pvalue)
       cnt_distribution = {}
       look_for_count = max_pvalue * vocabulary_volume
       until cnt_distribution.inject(0.0){|sum,(score,count)| sum + count} >= look_for_count
-        cnt_distribution = count_distribution_after_threshold(threshold_gauss_estimation(max_pvalue))
+        begin
+          approximate_threshold = threshold_gauss_estimation(max_pvalue)
+        rescue
+          approximate_threshold = worst_score
+        end
+        cnt_distribution = count_distribution_after_threshold(approximate_threshold)
         max_pvalue *=2 # if estimation counted too small amount of words - try to lower threshold estimation by doubling pvalue
       end
       cnt_distribution
     end
@@ -81,15 +102,20 @@
       new_scores
     end
     def counts_by_thresholds(*thresholds)
       scores = count_distribution_after_threshold(thresholds.min)
-      thresholds.map{ |threshold|
-        scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
+      thresholds.inject({}){ |hsh, threshold|
+        hsh[threshold] = scores.inject(0.0){|sum,(score,count)| (score >= threshold) ? sum + count : sum}
+        hsh
       }
     end
+    def count_by_threshold(threshold)
+      counts_by_thresholds(threshold)[threshold]
+    end
+
+
     def pvalue_by_threshold(threshold)
-      counts_by_thresholds(threshold).first / vocabulary_volume
+      count_by_threshold(threshold) / vocabulary_volume
     end
   end
 end
\ No newline at end of file