bin/phen2reg.rb in pets-0.1.4 vs bin/phen2reg.rb in pets-0.2.3
- old
+ new
@@ -3,53 +3,19 @@
# Program to predict the position from given HPO codes, sorted by their association values.
REPORT_FOLDER=File.expand_path(File.join(File.dirname(__FILE__), '..', 'templates'))
ROOT_PATH = File.dirname(__FILE__)
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
-require 'net/ftp'
-require 'net/http'
-require 'zlib'
-require 'json'
+
require 'generalMethods.rb'
require 'phen2reg_methods.rb'
require 'optparse'
require 'report_html'
+require 'semtools'
##########################
-#METHODS
-##########################
-
-def calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, min_hpo_recovery_percentage, patient_number)
- records_to_delete = []
- counter = 0
- adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score|
- hpo_coincidences = patient_original_phenotypes & hpo_list
- original_hpo_recovery_percentage = hpo_coincidences.length / patient_original_phenotypes.length.to_f * 100
- records_to_delete << counter if original_hpo_recovery_percentage < min_hpo_recovery_percentage
- query = predicted_hpo_percentage[patient_number]
- if query.nil?
- predicted_hpo_percentage[patient_number] = [original_hpo_recovery_percentage]
- else
- query << original_hpo_recovery_percentage
- end
- counter += 1
- end
- records_to_delete.reverse_each do |record_number|
- adjacent_regions_joined.delete_at(record_number)
- end
-end
-
-def download(ftp_server, path, name)
- ftp = Net::FTP.new()
- ftp.connect(ftp_server)
- ftp.login
- ftp.getbinaryfile(path, name)
- ftp.close
-end
-
-##########################
#OPT-PARSER
##########################
options = {}
OptionParser.new do |opts|
@@ -77,15 +43,21 @@
options[:information_coefficient] = nil
opts.on("-i", "--information_coefficient PATH", "Input file with information coefficients") do |information_coefficient|
options[:information_coefficient] = information_coefficient
end
+ options[:join_adyacent_regions] = false
+ opts.on('-j', "--join_adyacent_regions", "When a group of regions are adyacent they are merged in a single one with averaged parameters. The phenotypes in the regions must be shared across them.") do
+ options[:join_adyacent_regions] = true
+ end
+
options[:retrieve_kegg_data] = false
opts.on('-k', "--retrieve_kegg_data", "Add KEGG data to prediction report") do
options[:retrieve_kegg_data] = true
end
+
options[:print_matrix] = false
opts.on('-m', "--print_matrix", "Print output matrix") do
options[:print_matrix] = true
end
@@ -107,10 +79,15 @@
options[:output_matrix] = 'output_matrix.txt'
opts.on("-o", "--output_matrix PATH", "Output matrix file, with associations for each input HPO") do |output_matrix|
options[:output_matrix] = output_matrix
end
+ options[:output_path] = './'
+ opts.on("-O", "--output_path PATH", "General output folder path, takes precedence for other options") do |output|
+ options[:output_path] = output
+ end
+
options[:prediction_data] = nil
#chr\tstart\tstop
opts.on("-p", "--prediction_file PATH", "Input data with HPO codes for predicting their location. It can be either, a file path or string with HPO separated by pipes (|)") do |input_path|
options[:prediction_data] = input_path
end
@@ -172,214 +149,170 @@
all_paths[:biosystems_gene] = File.join(all_paths[:external_data], 'biosystems_gene.gz')
all_paths[:biosystems_info] = File.join(all_paths[:external_data], 'bsid2info.gz')
all_paths[:gene_data_with_pathways] = File.join(all_paths[:external_data], 'gene_data_with_pathways.gz')
all_paths[:gene_location] = File.join(all_paths[:external_data], 'gene_location.gz')
-##########################
-#DOWNLOADS
-##########################
-sources = [
- ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
- ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
- ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
-]
-sources.each do |server, path, output|
- download(server, path, output) if !File.exists?(output)
-end
+output_folder = File.expand_path(options[:output_path])
+Dir.mkdir(output_folder) if !File.exists?(output_folder)
##########################
#MAIN
##########################
-if File.exist?(options[:prediction_data])
+#- Loading patient profiles
+#------------------------------
+if File.exist?(options[:prediction_data]) # From file
if !options[:multiple_profile]
options[:prediction_data] = [File.open(options[:prediction_data]).readlines.map!{|line| line.chomp}]
- #STDERR.puts options[:prediction_data].inspect
else
- multiple_profiles = []
- File.open(options[:prediction_data]).each do |line|
- line.chomp!
- multiple_profiles << line.split('|')
- end
- options[:prediction_data] = multiple_profiles
+ options[:prediction_data] = File.open(options[:prediction_data]).readlines.map!{|line| line.chomp.split('|')}
end
-else
- # if you want to add phenotypes through the terminal
+else # if you want to add phenotypes through the terminal
if !options[:multiple_profile]
options[:prediction_data] = [options[:prediction_data].split('|')]
else
options[:prediction_data] = options[:prediction_data].split('!').map{|profile| profile.split('|')}
end
end
-
-##########################
#- Loading data
+#------------------------------
+# hpo = Ontology.new
+# hpo.load_data(options[:hpo_file])
+hpo = Ontology.new(file: options[:hpo_file], load_file: true)
+trainingData = load_training_file4HPO(options[:training_file], options[:best_thresold])
+hpos_ci_values = load_hpo_ci_values(options[:information_coefficient]) if options[:quality_control]
-hpo_storage = load_hpo_file(options[:hpo_file])
-if options[:quality_control]
- hpo_child_metadata = get_child_parent_relations(hpo_storage)
- hpos_ci_values = load_hpo_ci_values(options[:information_coefficient])
-end
-
genes_with_kegg = {}
gene_location = {}
if options[:retrieve_kegg_data]
- if !File.exists?(all_paths[:gene_data_with_pathways]) || !File.exists?(all_paths[:gene_location])
- gene_list, gene_location = load_gene_data(all_paths[:gene_data])
- ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
- kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
- genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
- write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
- write_compressed_plain_file(gene_location, all_paths[:gene_location])
- else
- gene_location = read_compressed_json(all_paths[:gene_location])
- genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
- end
+ gene_location, genes_with_kegg = get_and_parse_external_data(all_paths)
end
-# hpo_dictionary = load_hpo_dictionary_name2code(options[:hpo2name_file]) if options[:hpo_is_name]
-trainingData = load_training_file4HPO(options[:training_file], options[:best_thresold])
-
-##########################
#- HPO PROFILE ANALYSIS
-
+#---------------------------------
phenotypes_by_patient = {}
predicted_hpo_percentage = {}
options[:prediction_data].each_with_index do |patient_hpo_profile, patient_number|
- phenotypes_by_patient[patient_number] = patient_hpo_profile
- # STDERR.puts patient_hpo_profile.inspect
- if options[:hpo_is_name]
- translated_hpos = []
- hpo_dictionary = create_hpo_dictionary(hpo_storage)
- patient_hpo_profile.each_with_index do |name, i|
- hpo_code = hpo_dictionary[name]
- if hpo_code.nil?
- #STDERR.puts "Warning! Invalid HPO name: #{name}"
- hpo_code = nil
+ if options[:hpo_is_name]
+ # patient_hpo_profile, rejected = hpo.translate_names2codes(patient_hpo_profile)
+ patient_hpo_profile, rejected = hpo.translate_names(patient_hpo_profile)
+ STDERR.puts "Phenotypes #{rejected.join(",")} in patient #{patient_number} not exist"
end
- patient_hpo_profile[i] = hpo_code
- end
- patient_hpo_profile.compact!
- end
- #HPO quality control
- #---------------------------
- characterised_hpos = []
- #hpo_metadata = []
- if options[:quality_control]
- #characterised_hpos, hpo_metadata = hpo_quality_control(options[:prediction_data], options[:hpo2name_file], options[:information_coefficient])
- # characterised_hpos, hpo_storage = hpo_quality_control(patient_hpo_profile, hpo_storage, hpo_child_metadata, hpos_ci_values)
- characterised_hpos = hpo_quality_control(patient_hpo_profile, hpo_storage, hpo_child_metadata, hpos_ci_values)
- output_quality_control = File.open(options[:output_quality_control], "w")
- header = ["HPO name", "HPO code", "Exists?", "CI value", "Is child of", "Childs"]
- output_quality_control.puts Terminal::Table.new :headings => header, :rows => characterised_hpos
- output_quality_control.close
- end
-
- #Prediction steps
- #---------------------------
- hpo_regions = search4HPO(patient_hpo_profile, trainingData)
- if hpo_regions.empty?
- puts "ProfID:#{patient_number}\tResults not found"
- elsif options[:group_by_region] == false
- hpo_regions.each do |hpo, regions|
- regions.each do |region|
- puts "ProfID:#{patient_number}\t#{hpo}\t#{region.join("\t")}"
- end
- end
- elsif options[:group_by_region] == true
- region2hpo, regionAttributes, association_scores = group_by_region(hpo_regions)
- #STDERR.puts patient_hpo_profile.inspect
- #add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata)
- #STDERR.puts patient_hpo_profile.inspect
- null_value = 0
- hpo_region_matrix = generate_hpo_region_matrix(region2hpo, association_scores, patient_hpo_profile, null_value)
- if options[:print_matrix]
- output_matrix = File.open(options[:output_matrix] + "_#{patient_number}", "w")
- output_matrix.puts "Region\t#{patient_hpo_profile.join("\t")}"
- regionAttributes_array = regionAttributes.values
- hpo_region_matrix.each_with_index do |association_values, i|
- chr, start, stop = regionAttributes_array[i]
- output_matrix.puts "#{chr}:#{start}-#{stop}\t#{association_values.join("\t")}"
- end
- output_matrix.close
- end
-
-
- scoring_regions(regionAttributes, hpo_region_matrix, options[:ranking_style], options[:pvalue_cutoff], options[:freedom_degree], null_value)
- if regionAttributes.empty?
- puts "ProfID:#{patient_number}\tResults not found"
- else
- adjacent_regions_joined = []
- regionAttributes.each do |regionID, attributes|
- chr, start, stop, patient_ID, region_length, score = attributes
- association_values = association_scores[regionID]
- adjacent_regions_joined << [chr, start, stop, association_values.keys, association_values.values, score]
- end
- adjacent_regions_joined = join_regions(adjacent_regions_joined) # MOVER A ANTES DE CONSTRUIR LA MATRIZ
+ patient_hpo_profile, rejected_hpos = hpo.check_ids(patient_hpo_profile.map{|h| h.to_sym})
+ STDERR.puts "WARNING: unknown CODES #{rejected_hpos.join(',')}" if !rejected_hpos.empty?
- #Ranking
- if options[:ranking_style] == 'fisher'
- adjacent_regions_joined.sort!{|r1, r2| r1.last <=> r2.last}
- else
- adjacent_regions_joined.sort!{|r1, r2| r2.last <=> r1.last}
+ characterised_hpos = []
+ if options[:quality_control]
+ characterised_hpos = hpo_quality_control(patient_hpo_profile, hpos_ci_values, hpo)
+ File.open(File.join(output_folder, options[:output_quality_control]), "w") do |f|
+ header = ["HPO name", "HPO code", "Exists?", "CI value", "Is child of", "Childs"]
+ f.puts Terminal::Table.new :headings => header, :rows => characterised_hpos
+ end
end
- patient_original_phenotypes = phenotypes_by_patient[patient_number]
- calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, options[:hpo_recovery], patient_number)
- if adjacent_regions_joined.empty?
+ patient_hpo_profile, parental_hpo = hpo.remove_ancestors_from_profile(patient_hpo_profile)
+ patient_hpo_profile.map!{|h| h.to_s} # Convert codes to string to be compatible with search4HPO like methods
+ phenotypes_by_patient[patient_number] = patient_hpo_profile
+
+ #Prediction steps
+ #---------------------------
+ hpo_regions = search4HPO(patient_hpo_profile, trainingData)
+ if hpo_regions.empty?
puts "ProfID:#{patient_number}\tResults not found"
- else
- adjacent_regions_joined = adjacent_regions_joined[0..options[:max_number]-1] if !options[:max_number].nil?
- adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score|
- puts "ProfID:#{patient_number}\t#{chr}\t#{start}\t#{stop}\t#{hpo_list.join(',')}\t#{association_values.join(',')}\t#{score}"
+ elsif options[:group_by_region] == false
+ hpo_regions.each do |hpo, regions|
+ regions.each do |region|
+ puts "ProfID:#{patient_number}\t#{hpo}\t#{region.join("\t")}"
+ end
end
- end
- end
- end #elsif
+ elsif options[:group_by_region] == true
+ region2hpo, regionAttributes, association_scores = group_by_region(hpo_regions)
+ #STDERR.puts patient_hpo_profile.inspect
+ #add_parentals_of_not_found_hpos_in_regions(patient_hpo_profile, trainingData, region2hpo, regionAttributes, association_scores, hpo_metadata)
+ #STDERR.puts patient_hpo_profile.inspect
+ null_value = 0
+ hpo_region_matrix = generate_hpo_region_matrix(region2hpo, association_scores, patient_hpo_profile, null_value)
+ if options[:print_matrix]
+ mat_output = File.join(output_folder, options[:output_matrix]) + "_#{patient_number}"
+ save_patient_matrix(mat_output, patient_hpo_profile, regionAttributes, hpo_region_matrix)
+ end
- pathway_stats = {}
- if options[:retrieve_kegg_data]
- genes_found = []
- genes_found_attributes = {}
- adjacent_regions_joined.each do |adjacent_region|
- ref_chr, ref_start, ref_stop = adjacent_region
- chr_genes = gene_location[ref_chr]
- genes = []
- chr_genes.each do |gene_name, gene_start, gene_stop|
- if (ref_start > gene_start && ref_stop < gene_stop) ||
- (ref_start < gene_start && ref_stop > gene_stop) ||
- (ref_start < gene_start && ref_stop > gene_start) ||
- (ref_start < gene_stop && ref_stop > gene_stop)
- genes << gene_name
+ adjacent_regions_joined = []
+ scoring_regions(regionAttributes, hpo_region_matrix, options[:ranking_style], options[:pvalue_cutoff], options[:freedom_degree], null_value)
+ if regionAttributes.empty?
+ puts "ProfID:#{patient_number}\tResults not found"
+ else
+ regionAttributes.each do |regionID, attributes|
+ chr, start, stop, patient_ID, region_length, score = attributes
+ association_values = association_scores[regionID]
+ adjacent_regions_joined << [chr, start, stop, association_values.keys, association_values.values, score]
end
- end
- genes_found << genes
- end
+ adjacent_regions_joined = join_regions(adjacent_regions_joined) if options[:join_adyacent_regions] # MOVER A ANTES DE CONSTRUIR LA MATRIZ
+
+ #Ranking
+ if options[:ranking_style] == 'fisher'
+ adjacent_regions_joined.sort!{|r1, r2| r1.last <=> r2.last}
+ else
+ adjacent_regions_joined.sort!{|r1, r2| r2.last <=> r1.last}
+ end
+ patient_original_phenotypes = phenotypes_by_patient[patient_number]
+ calculate_hpo_recovery_and_filter(adjacent_regions_joined, patient_original_phenotypes, predicted_hpo_percentage, options[:hpo_recovery], patient_number)
+ if adjacent_regions_joined.empty?
+ puts "ProfID:#{patient_number}\tResults not found"
+ else
+ adjacent_regions_joined = adjacent_regions_joined.shift(options[:max_number]) if !options[:max_number].nil?
+ adjacent_regions_joined.each do |chr, start, stop, hpo_list, association_values, score|
+ puts "ProfID:#{patient_number}\t#{chr}\t#{start}\t#{stop}\t#{hpo_list.join(',')}\t#{association_values.join(',')}\t#{score}"
+ end
+ end
+ end
+ end #elsif
- genes_with_kegg_data = []
- genes_found.each do |genes|
- genes_cluster = []
- genes.each do |gene|
- query = genes_with_kegg[gene]
- genes_cluster << [gene, query]
+ pathway_stats = {}
+ if options[:retrieve_kegg_data]
+ genes_found = []
+ genes_found_attributes = {}
+ adjacent_regions_joined.each do |adjacent_region|
+ ref_chr, ref_start, ref_stop = adjacent_region
+ chr_genes = gene_location[ref_chr]
+ genes = []
+ chr_genes.each do |gene_name, gene_start, gene_stop|
+ genes << gene_name if coor_overlap?(ref_start, ref_stop, gene_start, gene_stop)
+ end
+ genes_found << genes
+ end
+
+ genes_with_kegg_data = []
+ genes_found.each do |genes|
+ genes_cluster = []
+ genes.each do |gene|
+ query = genes_with_kegg[gene]
+ genes_cluster << [gene, query]
+ end
+ genes_with_kegg_data << genes_cluster
+ end
+ pathway_stats = compute_pathway_enrichment(genes_with_kegg_data, genes_with_kegg)
+ pathway_stats.sort!{|p1, p2| p1.last <=> p2.last}
end
- genes_with_kegg_data << genes_cluster
- end
- pathway_stats = compute_pathway_enrichment(genes_with_kegg_data, genes_with_kegg)
- pathway_stats.sort!{|p1, p2| p1.last <=> p2.last}
- end
- #Creating html report
- #-------------------
- ####PLEASE CHECK THIS METHOD!
- report_data(characterised_hpos, adjacent_regions_joined, options[:html_file], hpo_storage, genes_with_kegg_data, pathway_stats) if options[:html_reporting]
+ #Creating html report
+ #-------------------
+
+ report_data(
+ characterised_hpos,
+ adjacent_regions_joined,
+ File.join(output_folder, options[:html_file]),
+ hpo,
+ genes_with_kegg_data,
+ pathway_stats
+ ) if options[:html_reporting]
end # end each_with_index
if options[:write_hpo_recovery_file]
- handler = File.open('output_profile_recovery', 'w')
- predicted_hpo_percentage.each do |patient, percentage|
- percentage.each do |perc|
- handler.puts "ProfID:#{patient}\t#{perc.inspect}"
+ File.open(File.join(output_folder, 'output_profile_recovery'), 'w') do |f|
+ predicted_hpo_percentage.each do |patient, percentage|
+ percentage.each do |perc|
+ f.puts "ProfID:#{patient}\t#{perc.inspect}"
+ end
end
end
- handler.close
end