#! /usr/bin/env ruby ########################## #LIBRARIES ########################## require 'optparse' ########################## #METHODS ########################## @success_percentage_distribution = [] @prediction_vector = [] @rankings = { :in => [], :out => []} @genome_fraction_predicted = 0 #All positive cases @good_predicted_subregions = 0 #True positive cases def compute_rankings if !@prediction_vector.empty? n_preds = @prediction_vector.length.fdiv(100) @prediction_vector.each_with_index do |in_control, i| ranking = (i + 1).fdiv(n_preds) if in_control @rankings[:in] << ranking else @rankings[:out] << ranking end end @prediction_vector = [] end end def load_prediction_file(input_file) predicted_regions = [] File.open(input_file).each do |line| line.chomp! predicted_info = line.split("\t") profile_index = predicted_info[0].gsub('ProfID:','').to_i if predicted_info[1] != 'Results not found' predicted_hpos_number = predicted_info[4].split(',').length predicted_regions << [profile_index, predicted_info[1], predicted_info[2].to_i, predicted_info[3].to_i, predicted_info[6].to_f, predicted_hpos_number] else predicted_regions << [profile_index] end end return predicted_regions # profile_index, pred_chr, pred_start, pred_stop, score end def get_imputation_scores(predicted_regions, integration_method) # predicted_regions.map{|pred_reg| pred_reg.pop} # STDERR.puts predicted_regions.inspect selected_regions = predicted_regions.select{|r| r[4].class == Float } score_regionLength_pairs = selected_regions.map{|r| [r[4], r[3] - r[2]]} #Get combined score and region length score_regionLength_pairs.sort!{|p1, p2| p1.first <=> p2.first} score_regionLength_pairs.reverse! if integration_method == 'fisher' total_region_length = score_regionLength_pairs.map{|p| p.last }.inject{|sum, l| sum + l } length2inspect = total_region_length/1000 acumulated_score = 0 inspected_length = 0 score_regionLength_pairs.each do |score, length| acumulated_score += score * length inspected_length += length break if inspected_length >= length2inspect end return acumulated_score.fdiv(inspected_length) end def generate_random_imp_score(imputation_score, desv) range = imputation_score * desv * rand() if [true, false].sample final_score = imputation_score - range else final_score = imputation_score + range end return final_score end def load_patient_data(input_data_file) patient_data = [] File.open(input_data_file).each do |line| line.chomp! mutation_coords, hpo_profile = line.split("\t") number_of_phenotypes = hpo_profile.split('|').length chr, start_pos, stop_pos = mutation_coords.split(':') patient_data << [chr, start_pos.to_i, stop_pos.to_i, number_of_phenotypes] end return patient_data #ctrl_chr, ctrl_start, ctrl_stop, #number_of_phens end def get_perfomance_table(ctrl_regions, predicted_regions, scale, imputation_score, hpo_min_recovery, apply_imputation) table = [] last_profile_id = ctrl_chr = ctrl_start = ctrl_stop = predicted_hpos_number = number_of_phenotypes = nil in_out_regions = [] predicted_regions.each do |profile_index, pred_chr, pred_start, pred_stop, score, predicted_hpos_number| if last_profile_id != profile_index && !last_profile_id.nil? table.concat(process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation)) @genome_fraction_predicted = 0 @good_predicted_subregions = 0 in_out_regions = [] compute_rankings end ctrl_chr, ctrl_start, ctrl_stop, number_of_phenotypes = ctrl_regions[profile_index] #get position in array, for each prediction unless predicted_hpos_number.nil? || number_of_phenotypes.nil? hpo_recovery_percentage = ( predicted_hpos_number / number_of_phenotypes.to_f ) * 100 #STDERR.puts "#{predicted_hpos_number}\t#{number_of_phenotypes}" if hpo_recovery_percentage > hpo_min_recovery in_out_regions.concat(get_in_out_regions(ctrl_chr, ctrl_start, ctrl_stop, pred_chr, pred_start, pred_stop, score)) last_profile_id = profile_index end end end table.concat(process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation)) return table end def process_in_out_regions(ctrl_start, ctrl_stop, scale, in_out_regions, imputation_score, apply_imputation) @success_percentage_distribution << get_sucess_percentage(in_out_regions) table = [] ctrl_length = ctrl_stop - ctrl_start non_predicted_regions = ctrl_length - @good_predicted_subregions if non_predicted_regions > 0 #total_predicted_region_length = in_out_regions.map{|s| s.last}.inject(0){|i, sum| i + sum} #imputation_score = in_out_regions.map{|s| s[1] * s.last}.inject(0){|i, sum| i + sum}.fdiv(total_predicted_region_length) #imputation_score += 0.25 * imputation_score #imputation_score = in_out_regions.map{|s| s[1]}.max #index = (9 * in_out_regions.length).fdiv(10).ceil #imputation_score = in_out_regions.map{|s| s[1]}.sort[index] #in_out_regions << ["in", generate_random_imp_score(0.764, 0.35), non_predicted_regions] if apply_imputation in_out_regions << ["in", generate_random_imp_score(imputation_score, 0.35), non_predicted_regions] if apply_imputation end evaluated_genome_fraction = @genome_fraction_predicted + ctrl_length in_out_regions.each do |group, score, region_length| list_entries = (region_length.fdiv(evaluated_genome_fraction) * scale).ceil table.concat(Array.new(list_entries, [group, score])) end return table end def get_in_out_regions(ctrl_chr, ctrl_start, ctrl_stop, pred_chr, pred_start, pred_stop, score) in_out_regions = [] if ctrl_chr == pred_chr if pred_start < ctrl_start && pred_stop > ctrl_stop # predicted region larger than ctrl region region_length = ctrl_stop - ctrl_start in_out_regions << ["in", score, region_length] @good_predicted_subregions += region_length region_length = ctrl_start - pred_start in_out_regions << ["out", score, region_length] @genome_fraction_predicted += region_length region_length = pred_stop - ctrl_stop in_out_regions << ["out", score, region_length] @genome_fraction_predicted += region_length elsif pred_start >= ctrl_start && pred_stop <= ctrl_stop #within ctrl region region_length = pred_stop - pred_start in_out_regions << ["in", score, region_length] @good_predicted_subregions += region_length elsif ctrl_start < pred_stop && ctrl_stop >= pred_stop #upstream region out of ctrl region region_length = pred_stop - ctrl_start in_out_regions << ["in", score, region_length] @good_predicted_subregions += region_length region_length = ctrl_start - pred_start in_out_regions << ["out", score, region_length] @genome_fraction_predicted += region_length elsif ctrl_start <= pred_start && ctrl_stop > pred_start #downstream region out of ctrl region region_length = ctrl_stop - pred_start in_out_regions << ["in", score, region_length] @good_predicted_subregions += region_length region_length = pred_stop - ctrl_stop in_out_regions << ["out", score, region_length] @genome_fraction_predicted += region_length else #in same chr but not in ctrl region region_length = pred_stop - pred_start in_out_regions << ["out", score, region_length] @genome_fraction_predicted += region_length end elsif !pred_chr.nil? #in different chr region_length = pred_stop - pred_start in_out_regions << ["out", score, region_length] @genome_fraction_predicted += region_length end if in_out_regions.map{|reg| reg.first }.include?('in') @prediction_vector << true else @prediction_vector << false end return in_out_regions end def get_sucess_percentage(in_out_regions) percentage = 0 if !in_out_regions.empty? count = 0 total = 0 in_out_regions.each do |group, score, reg_length| count += reg_length if group == 'in' total += reg_length end percentage = count.fdiv(total) end return percentage end ########################## #OPT-PARSE ########################## options = {} OptionParser.new do |opts| opts.banner = "Usage: #{__FILE__} [options]" options[:input_prediction] = nil opts.on("-i", "--input_prediction PATH", "Input prediction file for checking") do |input_prediction| options[:input_prediction] = input_prediction end options[:meth] = nil opts.on("-m", "--meth STRING", "Method used in score integration calculation, affects to the imputation algorithm (if used)") do |meth| options[:meth] = meth end options[:output_file] = 'final_values_for_pr_curve.txt' opts.on("-o", "--output_file PATH", "Output results for PR curve") do |output_file| options[:output_file] = output_file end options[:hpo_recovery] = 0 opts.on("-p", "--hpo_recovery INTEGER", "Minimum percentage of HPO terms to consider predictions") do |hpo_recovery| options[:hpo_recovery] = hpo_recovery.to_f abort("Please, choose a recovery value higher than 0") if options[:hpo_recovery] <= 0 end options[:input_regions] = nil opts.on("-r", "--input_regions PATH", "Input patients true affected regions (ctrl file)") do |input_regions| options[:input_regions] = input_regions end options[:success_percentage] = 'success_percentage' opts.on("-s", "--success_percentage PATH", "Output results with success percentage for each prediction") do |success_percentage| options[:success_percentage] = success_percentage end options[:apply_imputation] = false opts.on("-y", "--apply_imputation", "Activates imputation") do options[:apply_imputation] = true end options[:scale_size] = 100 opts.on("-z", "--scale_size INTEGER", "Scale region size to avoid long range regions") do |scale_size| options[:scale_size] = scale_size.to_i abort("Please, choose a scale value higher than 0") if options[:scale_size] <= 0 end opts.on_tail("-h", "--help", "Show this message") do puts opts exit end end.parse! ########################## #MAIN ########################## predicted_regions = load_prediction_file(options[:input_prediction]) imputation_score = get_imputation_scores(predicted_regions, options[:meth]) patient_data = load_patient_data(options[:input_regions]) table = get_perfomance_table(patient_data, predicted_regions, options[:scale_size], imputation_score, options[:hpo_recovery], options[:apply_imputation]) File.open(options[:output_file], 'w') do |f| f.puts "group\tscore" table.each do |output, score| if options[:apply_imputation] #score = generate_random_imp_score(0.764, 0.35) if score.nil? score = generate_random_imp_score(imputation_score, 0.35) if score.nil? else next if score.nil? #when no imputation end f.puts "#{output}\t#{score}" end end File.open(options[:success_percentage], 'w') do |f| f.puts 'perc' @success_percentage_distribution.each do |pg| f.puts pg end end File.open('ranking', 'w') do |f| f.puts "reg\tranking" @rankings.each do |reg, ranks| ranks.each do |rank| f.puts "#{reg.to_s}\t#{rank}" end end end