bin/get_network_nodes.rb in pets-0.1.4 vs bin/get_network_nodes.rb in pets-0.2.3
- old
+ new
@@ -1,67 +1,58 @@
#! /usr/bin/env ruby
# Rojano E. & Seoane P., March 2019
# Code to prepare data to get the associations between pathological phenotypes (HPO) and genomic regions (SOR)
-
ROOT_PATH = File.dirname(__FILE__)
+EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
+HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
##############################
#LIBRARIES
##############################
require 'generalMethods.rb'
require 'optparse'
-require "benchmark"
+require 'semtools'
###############################
#METHODS
###############################
-def loadPatientFile(patient_file, hpo_storage, hpo_dictionary, add_parents)
+def loadPatientFile(patient_file, hpo, add_parents)
patient2phenotype = {}
hpo_count = {}
not_found = []
patients_genomic_region_by_chr = {}
File.open(patient_file).each do |line|
line.chomp!
next if line.include?("#")
patient, chr, start, stop, phenotype_profile = line.split("\t", 5)
next if phenotype_profile.nil? #For skipping patients without phenotypes
phenotypes = phenotype_profile.split('|')
- phenotypes.each do |hpo_name|
- hpo_code = hpo_dictionary[hpo_name]
- if hpo_code.nil?
- not_found << hpo_name if !not_found.include?(hpo_name)
- else
- get_all_hpos(patient, hpo_code, patient2phenotype, hpo_storage, hpo_count, add_parents)
- end
+ # phenotypes, rejected = hpo.translate_names2codes(phenotypes)
+ phenotypes, rejected = hpo.translate_names(phenotypes)
+ not_found = not_found | rejected
+ phenotypes.each do |hpo_code|
+ get_all_hpos(patient, hpo_code, patient2phenotype, hpo, hpo_count, add_parents) if !hpo.is_removable(hpo_code)
end
info = [patient, start.to_i, stop.to_i]
add_record(patients_genomic_region_by_chr, chr, info)
end
- if add_parents == 'coh'
- general_parents_in_cohort = get_parents_in_patients(patient2phenotype, hpo_storage)
- parent_patient2phenotype = {} # For new parent hpo added to patients.
- end
return patient2phenotype, hpo_count, not_found, patients_genomic_region_by_chr
end
-def get_parents_in_patients(patient2phenotype, hpo_storage)
- all_hpo_codes = []
- patient2phenotype.each do |patient, hpo_codes|
- all_hpo_codes = all_hpo_codes | hpo_codes
- end
-end
-def get_all_hpos(patient, hpo_code, patient2phenotype, hpo_storage, hpo_count, add_parents)
+def get_all_hpos(patient, hpo_code, patient2phenotype, hpo, hpo_count, add_parents)
add_record(hpo_count, hpo_code, patient)
add_record(patient2phenotype, patient, hpo_code)
if add_parents == 'root'
- hpo_parent_codes = hpo_storage[hpo_code][2]
+ # hpo_parent_codes = hpo.get_parents(hpo_code)
+ hpo_parent_codes = hpo.get_ancestors(hpo_code)
hpo_parent_codes.each do |parent_code|
- get_all_hpos(patient, parent_code, patient2phenotype, hpo_storage, hpo_count, add_parents)
+ add_record(hpo_count, parent_code, patient)
+ add_record(patient2phenotype, patient, parent_code)
end
end
end
def build_tripartite_network(patients2hpo, hpo_stats, ic_threshold, patients_by_cluster)
@@ -91,11 +82,11 @@
hpo_stats[hpo_code] = [hpo_freq, hpo_ic]
patient_ids.each do |patient_id|
patient_hpo_ic << [patient_id, hpo_code, hpo_ic]
end
end
- return hpo_stats, patient_hpo_ic.sort{|a,b| a.first.to_i <=> b.first.to_i}
+ return hpo_stats, patient_hpo_ic.sort{|a,b| a.first <=> b.first}
end
def write_hash(hash, file_path, header = [])
File.open(file_path, 'w') do |handler|
handler.puts header.join("\t") if !header.empty?
@@ -126,11 +117,11 @@
OptionParser.new do |opts|
opts.banner = "Usage: #{__FILE__} [options]"
options[:cluster_file] = 'cluster_coords.txt'
opts.on("-c", "--cluster_file PATH", "Cluster coords output file that will be used to translate SOR nodes") do |value|
- options[:cluster_file] = value
+ options[:cluster_file] = File.basename(value)
end
options[:excluded_hpo] = nil
opts.on("-e", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
options[:excluded_hpo] = excluded_hpo
@@ -155,17 +146,17 @@
opts.on("-p", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value|
options[:hpo_file] = value
end
options[:add_parents] = nil
- opts.on("-r", "--parents STRING", "'root' to add all parents until the ontology root. 'coh' to add parents until the most general term in the cohort.") do |value|
+ opts.on("-r", "--parents STRING", "'root' to add all parents until the ontology root. 'coh' to add parents until the most general term in the cohort(TODO).") do |value|
options[:add_parents] = value
end
options[:hpo_stat_file] = 'hpo_stats.txt'
opts.on("-s", "--hpo_stat_file PATH", "Output file with HPO codes, their frequency and CI") do |value|
- options[:hpo_stat_file] = value
+ options[:hpo_stat_file] = File.basename(value)
end
options[:thresold] = 0
opts.on("-t", "--info_thresold FLOAT", "IC thresold to discard non informative hpo. Default: 0.") do |thresold|
options[:thresold] = thresold.to_f
@@ -179,19 +170,33 @@
end.parse!
###############################
#MAIN
###############################
-hpo_black_list = load_hpo_black_list(options[:excluded_hpo])
-hpo_storage = load_hpo_file(options[:hpo_file], hpo_black_list)
-hpo_dictionary = create_hpo_dictionary(hpo_storage)
-patients2hpo, hpo_count, not_found, chr_patients_genomic_region = loadPatientFile(options[:patient_file], hpo_storage, hpo_dictionary, options[:add_parents])
+output_folder = File.dirname(File.expand_path(options[:output_file]))
+Dir.mkdir(output_folder) if !File.exists?(output_folder)
+
+hpo_file = options[:hpo_file]
+hpo_file = ENV['hpo_file'] if hpo_file.nil?
+hpo_file = HPO_FILE if hpo_file.nil?
+
+# hpo = Ontology.new
+# hpo.load_black_list(options[:excluded_hpo]) if !options[:excluded_hpo].nil?
+# hpo.load_data(hpo_file)
+if !options[:excluded_hpo].nil?
+ hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(options[:excluded_hpo]))
+else
+ hpo = Ontology.new(file: hpo_file, load_file: true)
+end
+patients2hpo, hpo_count, not_found, chr_patients_genomic_region = loadPatientFile(options[:patient_file], hpo, options[:add_parents])
+
hpo_stats, patient_hpo_ic = compute_hpo_stats(hpo_count, patients2hpo.length)
patients_by_cluster, sors = generate_cluster_regions(chr_patients_genomic_region, options[:mutation_type])
+
tripartite_network = build_tripartite_network(patients2hpo, hpo_stats, options[:thresold], patients_by_cluster)
-write_array(not_found - hpo_black_list, 'missing_hpo_names')
-write_array(sors, options[:cluster_file])
-write_hash(hpo_stats.select{|hp_code, stats| stats.last > options[:thresold]}, options[:hpo_stat_file], %w[HPOcode Frequency IC])
+# write_array(not_found - hpo.excluded_codes, File.join(output_folder, 'missing_hpo_names'))
+write_array(not_found - hpo.removable_terms, File.join(output_folder, 'missing_hpo_names'))
+write_array(sors, File.join(output_folder, options[:cluster_file]))
+write_hash(hpo_stats.select{|hp_code, stats| stats.last > options[:thresold]}, File.join(output_folder, options[:hpo_stat_file]), %w[HPOcode Frequency IC])
write_array(tripartite_network, options[:output_file])
-write_array(patient_hpo_ic, 'filtered_hpo.txt')
-
+write_array(patient_hpo_ic, File.join(output_folder, 'filtered_hpo.txt'))
\ No newline at end of file