bin/phen2reg.rb in pets-0.1.4 vs bin/phen2reg.rb in pets-0.2.3

- old
+ new

@@ -3,53 +3,19 @@ # Program to predict the position from given HPO codes, sorted by their association values. REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates')) ROOT_PATH = File.dirname(__FILE__) $: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets')) -require 'net/ftp' -require 'net/http' -require 'zlib' -require 'json' + require 'generalMethods.rb' require 'phen2reg_methods.rb' require 'optparse' require 'report_html' +require 'semtools' ########################## -#METHODS -########################## - -def calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number) - records_to_delete = [] - counter = 0 - adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score| - hpo_coincidences = patient_original_phenotypes & hpo_list - original_hpo_recovery_percentage = hpo_coincidences.length / patient_original_phenotypes.length.to_f * 100 - records_to_delete << counter if original_hpo_recovery_percentage < min_hpo_recovery_percentage - query = predicted_hpo_percentage[patient_number] - if query.nil? - predicted_hpo_percentage[patient_number] = [original_hpo_recovery_percentage] - else - query << original_hpo_recovery_percentage - end - counter += 1 - end - records_to_delete.reverse_each do |record_number| - adjacent_regions_joined.delete_at(record_number) - end -end - -def download(ftp_server, path, name) - ftp = Net::FTP.new() - ftp.connect(ftp_server) - ftp.login - ftp.getbinaryfile(path, name) - ftp.close -end - -########################## #OPT-PARSER ########################## options = {} OptionParser.new do |opts| @@ -77,15 +43,21 @@ options[:information_coefficient] = nil opts.on("-i", "--information_coefficient PATH", "Input file with information coefficients") do |information_coefficient| options[:information_coefficient] = information_coefficient end + options[:join_adyacent_regions] = false + opts.on('-j', "--join_adyacent_regions", "When a group of regions are adyacent they are merged in a single one with averaged parameters. The phenotypes in the regions must be shared across them.") do + options[:join_adyacent_regions] = true + end + options[:retrieve_kegg_data] = false opts.on('-k', "--retrieve_kegg_data", "Add KEGG data to prediction report") do options[:retrieve_kegg_data] = true end + options[:print_matrix] = false opts.on('-m', "--print_matrix", "Print output matrix") do options[:print_matrix] = true end @@ -107,10 +79,15 @@ options[:output_matrix] = 'output_matrix.txt' opts.on("-o", "--output_matrix PATH", "Output matrix file, with associations for each input HPO") do |output_matrix| options[:output_matrix] = output_matrix end + options[:output_path] = './' + opts.on("-O", "--output_path PATH", "General output folder path, takes precedence for other options") do |output| + options[:output_path] = output + end + options[:prediction_data] = nil #chr\tstart\tstop opts.on("-p", "--prediction_file PATH", "Input data with HPO codes for predicting their location. It can be either, a file path or string with HPO separated by pipes (|)") do |input_path| options[:prediction_data] = input_path end @@ -172,214 +149,170 @@ all_paths[:biosystems_gene] = File.join(all_paths[:external_data], 'biosystems_gene.gz') all_paths[:biosystems_info] = File.join(all_paths[:external_data], 'bsid2info.gz') all_paths[:gene_data_with_pathways] = File.join(all_paths[:external_data], 'gene_data_with_pathways.gz') all_paths[:gene_location] = File.join(all_paths[:external_data], 'gene_location.gz') -########################## -#DOWNLOADS -########################## -sources = [ - ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]], - ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]], - ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]] -] -sources.each do |server, path, output| - download(server, path, output) if !File.exists?(output) -end +output_folder = File.expand_path(options[:output_path]) +Dir.mkdir(output_folder) if !File.exists?(output_folder) ########################## #MAIN ########################## -if File.exist?(options[:prediction_data]) +#- Loading patient profiles +#------------------------------ +if File.exist?(options[:prediction_data]) # From file if !options[:multiple_profile] options[:prediction_data] = [File.open(options[:prediction_data]).readlines.map!{|line| line.chomp}] - #STDERR.puts options[:prediction_data].inspect else - multiple_profiles = [] - File.open(options[:prediction_data]).each do |line| - line.chomp! - multiple_profiles << line.split('|') - end - options[:prediction_data] = multiple_profiles + options[:prediction_data] = File.open(options[:prediction_data]).readlines.map!{|line| line.chomp.split('|')} end -else - # if you want to add phenotypes through the terminal +else # if you want to add phenotypes through the terminal if !options[:multiple_profile] options[:prediction_data] = [options[:prediction_data].split('|')] else options[:prediction_data] = options[:prediction_data].split('!').map{|profile| profile.split('|')} end end - -########################## #- Loading data +#------------------------------ +# hpo = Ontology.new +# hpo.load_data(options[:hpo_file]) +hpo = Ontology.new(file: options[:hpo_file], load_file: true) +trainingData = load_training_file4HPO(options[:training_file], options[:best_thresold]) +hpos_ci_values = load_hpo_ci_values(options[:information_coefficient]) if options[:quality_control] -hpo_storage = load_hpo_file(options[:hpo_file]) -if options[:quality_control] - hpo_child_metadata = get_child_parent_relations(hpo_storage) - hpos_ci_values = load_hpo_ci_values(options[:information_coefficient]) -end - genes_with_kegg = {} gene_location = {} if options[:retrieve_kegg_data] - if !File.exists?(all_paths[:gene_data_with_pathways]) || !File.exists?(all_paths[:gene_location]) - gene_list, gene_location = load_gene_data(all_paths[:gene_data]) - ### kegg_data = parse_kegg_data(genes_found_attributes.keys) - kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info]) - genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data) - write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways]) - write_compressed_plain_file(gene_location, all_paths[:gene_location]) - else - gene_location = read_compressed_json(all_paths[:gene_location]) - genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways]) - end + gene_location, genes_with_kegg = get_and_parse_external_data(all_paths) end -# hpo_dictionary = load_hpo_dictionary_name2code(options[:hpo2name_file]) if options[:hpo_is_name] -trainingData = load_training_file4HPO(options[:training_file], options[:best_thresold]) - -########################## #- HPO PROFILE ANALYSIS - +#--------------------------------- phenotypes_by_patient = {} predicted_hpo_percentage = {} options[:prediction_data].each_with_index do |patient_hpo_profile, patient_number| - phenotypes_by_patient[patient_number] = patient_hpo_profile - # STDERR.puts patient_hpo_profile.inspect - if options[:hpo_is_name] - translated_hpos = [] - hpo_dictionary = create_hpo_dictionary(hpo_storage) - patient_hpo_profile.each_with_index do |name, i| - hpo_code = hpo_dictionary[name] - if hpo_code.nil? - #STDERR.puts "Warning! Invalid HPO name: #{name}" - hpo_code = nil + if options[:hpo_is_name] + # patient_hpo_profile, rejected = hpo.translate_names2codes(patient_hpo_profile) + patient_hpo_profile, rejected = hpo.translate_names(patient_hpo_profile) + STDERR.puts "Phenotypes #{rejected.join(",")} in patient #{patient_number} not exist" end - patient_hpo_profile[i] = hpo_code - end - patient_hpo_profile.compact! - end - #HPO quality control - #--------------------------- - characterised_hpos = [] - #hpo_metadata = [] - if options[:quality_control] - #characterised_hpos, hpo_metadata = hpo_quality_control(options[:prediction_data], options[:hpo2name_file], options[:information_coefficient]) - # characterised_hpos, hpo_storage = hpo_quality_control(patient_hpo_profile, hpo_storage, hpo_child_metadata, hpos_ci_values) - characterised_hpos = hpo_quality_control(patient_hpo_profile, hpo_storage, hpo_child_metadata, hpos_ci_values) - output_quality_control = File.open(options[:output_quality_control], "w") - header = ["HPO name", "HPO code", "Exists?", "CI value", "Is child of", "Childs"] - output_quality_control.puts Terminal::Table.new :headings => header, :rows => characterised_hpos - output_quality_control.close - end - - #Prediction steps - #--------------------------- - hpo_regions = search4HPO(patient_hpo_profile, trainingData) - if hpo_regions.empty? - puts "ProfID:#{patient_number}\tResults not found" - elsif options[:group_by_region] == false - hpo_regions.each do |hpo, regions| - regions.each do |region| - puts "ProfID:#{patient_number}\t#{hpo}\t#{region.join("\t")}" - end - end - elsif options[:group_by_region] == true - region2hpo, regionAttributes, association_scores = group_by_region(hpo_regions) - #STDERR.puts patient_hpo_profile.inspect - #add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata) - #STDERR.puts patient_hpo_profile.inspect - null_value = 0 - hpo_region_matrix = generate_hpo_region_matrix(region2hpo, association_scores, patient_hpo_profile, null_value) - if options[:print_matrix] - output_matrix = File.open(options[:output_matrix] + "_#{patient_number}", "w") - output_matrix.puts "Region\t#{patient_hpo_profile.join("\t")}" - regionAttributes_array = regionAttributes.values - hpo_region_matrix.each_with_index do |association_values, i| - chr, start, stop = regionAttributes_array[i] - output_matrix.puts "#{chr}:#{start}-#{stop}\t#{association_values.join("\t")}" - end - output_matrix.close - end - - - scoring_regions(regionAttributes, hpo_region_matrix, options[:ranking_style], options[:pvalue_cutoff], options[:freedom_degree], null_value) - if regionAttributes.empty? - puts "ProfID:#{patient_number}\tResults not found" - else - adjacent_regions_joined = [] - regionAttributes.each do |regionID, attributes| - chr, start, stop, patient_ID, region_length, score = attributes - association_values = association_scores[regionID] - adjacent_regions_joined << [chr, start, stop, association_values.keys, association_values.values, score] - end - adjacent_regions_joined = join_regions(adjacent_regions_joined) # MOVER A ANTES DE CONSTRUIR LA MATRIZ + patient_hpo_profile, rejected_hpos = hpo.check_ids(patient_hpo_profile.map{|h| h.to_sym}) + STDERR.puts "WARNING: unknown CODES #{rejected_hpos.join(',')}" if !rejected_hpos.empty? - #Ranking - if options[:ranking_style] == 'fisher' - adjacent_regions_joined.sort!{|r1, r2| r1.last <=> r2.last} - else - adjacent_regions_joined.sort!{|r1, r2| r2.last <=> r1.last} + characterised_hpos = [] + if options[:quality_control] + characterised_hpos = hpo_quality_control(patient_hpo_profile, hpos_ci_values, hpo) + File.open(File.join(output_folder, options[:output_quality_control]), "w") do |f| + header = ["HPO name", "HPO code", "Exists?", "CI value", "Is child of", "Childs"] + f.puts Terminal::Table.new :headings => header, :rows => characterised_hpos + end end - patient_original_phenotypes = phenotypes_by_patient[patient_number] - calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, options[:hpo_recovery], patient_number) - if adjacent_regions_joined.empty? + patient_hpo_profile, parental_hpo = hpo.remove_ancestors_from_profile(patient_hpo_profile) + patient_hpo_profile.map!{|h| h.to_s} # Convert codes to string to be compatible with search4HPO like methods + phenotypes_by_patient[patient_number] = patient_hpo_profile + + #Prediction steps + #--------------------------- + hpo_regions = search4HPO(patient_hpo_profile, trainingData) + if hpo_regions.empty? puts "ProfID:#{patient_number}\tResults not found" - else - adjacent_regions_joined = adjacent_regions_joined[0..options[:max_number]-1] if !options[:max_number].nil? - adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score| - puts "ProfID:#{patient_number}\t#{chr}\t#{start}\t#{stop}\t#{hpo_list.join(',')}\t#{association_values.join(',')}\t#{score}" + elsif options[:group_by_region] == false + hpo_regions.each do |hpo, regions| + regions.each do |region| + puts "ProfID:#{patient_number}\t#{hpo}\t#{region.join("\t")}" + end end - end - end - end #elsif + elsif options[:group_by_region] == true + region2hpo, regionAttributes, association_scores = group_by_region(hpo_regions) + #STDERR.puts patient_hpo_profile.inspect + #add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata) + #STDERR.puts patient_hpo_profile.inspect + null_value = 0 + hpo_region_matrix = generate_hpo_region_matrix(region2hpo, association_scores, patient_hpo_profile, null_value) + if options[:print_matrix] + mat_output = File.join(output_folder, options[:output_matrix]) + "_#{patient_number}" + save_patient_matrix(mat_output, patient_hpo_profile, regionAttributes, hpo_region_matrix) + end - pathway_stats = {} - if options[:retrieve_kegg_data] - genes_found = [] - genes_found_attributes = {} - adjacent_regions_joined.each do |adjacent_region| - ref_chr, ref_start, ref_stop = adjacent_region - chr_genes = gene_location[ref_chr] - genes = [] - chr_genes.each do |gene_name, gene_start, gene_stop| - if (ref_start > gene_start && ref_stop < gene_stop) || - (ref_start < gene_start && ref_stop > gene_stop) || - (ref_start < gene_start && ref_stop > gene_start) || - (ref_start < gene_stop && ref_stop > gene_stop) - genes << gene_name + adjacent_regions_joined = [] + scoring_regions(regionAttributes, hpo_region_matrix, options[:ranking_style], options[:pvalue_cutoff], options[:freedom_degree], null_value) + if regionAttributes.empty? + puts "ProfID:#{patient_number}\tResults not found" + else + regionAttributes.each do |regionID, attributes| + chr, start, stop, patient_ID, region_length, score = attributes + association_values = association_scores[regionID] + adjacent_regions_joined << [chr, start, stop, association_values.keys, association_values.values, score] end - end - genes_found << genes - end + adjacent_regions_joined = join_regions(adjacent_regions_joined) if options[:join_adyacent_regions] # MOVER A ANTES DE CONSTRUIR LA MATRIZ + + #Ranking + if options[:ranking_style] == 'fisher' + adjacent_regions_joined.sort!{|r1, r2| r1.last <=> r2.last} + else + adjacent_regions_joined.sort!{|r1, r2| r2.last <=> r1.last} + end + patient_original_phenotypes = phenotypes_by_patient[patient_number] + calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, options[:hpo_recovery], patient_number) + if adjacent_regions_joined.empty? + puts "ProfID:#{patient_number}\tResults not found" + else + adjacent_regions_joined = adjacent_regions_joined.shift(options[:max_number]) if !options[:max_number].nil? + adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score| + puts "ProfID:#{patient_number}\t#{chr}\t#{start}\t#{stop}\t#{hpo_list.join(',')}\t#{association_values.join(',')}\t#{score}" + end + end + end + end #elsif - genes_with_kegg_data = [] - genes_found.each do |genes| - genes_cluster = [] - genes.each do |gene| - query = genes_with_kegg[gene] - genes_cluster << [gene, query] + pathway_stats = {} + if options[:retrieve_kegg_data] + genes_found = [] + genes_found_attributes = {} + adjacent_regions_joined.each do |adjacent_region| + ref_chr, ref_start, ref_stop = adjacent_region + chr_genes = gene_location[ref_chr] + genes = [] + chr_genes.each do |gene_name, gene_start, gene_stop| + genes << gene_name if coor_overlap?(ref_start, ref_stop, gene_start, gene_stop) + end + genes_found << genes + end + + genes_with_kegg_data = [] + genes_found.each do |genes| + genes_cluster = [] + genes.each do |gene| + query = genes_with_kegg[gene] + genes_cluster << [gene, query] + end + genes_with_kegg_data << genes_cluster + end + pathway_stats = compute_pathway_enrichment(genes_with_kegg_data, genes_with_kegg) + pathway_stats.sort!{|p1, p2| p1.last <=> p2.last} end - genes_with_kegg_data << genes_cluster - end - pathway_stats = compute_pathway_enrichment(genes_with_kegg_data, genes_with_kegg) - pathway_stats.sort!{|p1, p2| p1.last <=> p2.last} - end - #Creating html report - #------------------- - ####PLEASE CHECK THIS METHOD! - report_data(characterised_hpos, adjacent_regions_joined, options[:html_file], hpo_storage, genes_with_kegg_data, pathway_stats) if options[:html_reporting] + #Creating html report + #------------------- + + report_data( + characterised_hpos, + adjacent_regions_joined, + File.join(output_folder, options[:html_file]), + hpo, + genes_with_kegg_data, + pathway_stats + ) if options[:html_reporting] end # end each_with_index if options[:write_hpo_recovery_file] - handler = File.open('output_profile_recovery', 'w') - predicted_hpo_percentage.each do |patient, percentage| - percentage.each do |perc| - handler.puts "ProfID:#{patient}\t#{perc.inspect}" + File.open(File.join(output_folder, 'output_profile_recovery'), 'w') do |f| + predicted_hpo_percentage.each do |patient, percentage| + percentage.each do |perc| + f.puts "ProfID:#{patient}\t#{perc.inspect}" + end end end - handler.close end