bin/get_network_nodes.rb in pets-0.2.3 vs bin/get_network_nodes.rb in pets-0.2.4
- old
+ new
@@ -1,169 +1,123 @@
#! /usr/bin/env ruby
# Rojano E. & Seoane P., March 2019
# Code to prepare data to get the associations between pathological phenotypes (HPO) and genomic regions (SOR)
ROOT_PATH = File.dirname(__FILE__)
-EXTERNAL_DATA = File.expand_path(File.join(ROOT_PATH, '..', 'external_data'))
-HPO_FILE = File.join(EXTERNAL_DATA, 'hp.obo')
$: << File.expand_path(File.join(ROOT_PATH, '..', 'lib', 'pets'))
##############################
#LIBRARIES
##############################
-require 'generalMethods.rb'
require 'optparse'
-require 'semtools'
+require 'pets'
###############################
#METHODS
###############################
-def loadPatientFile(patient_file, hpo, add_parents)
- patient2phenotype = {}
- hpo_count = {}
- not_found = []
- patients_genomic_region_by_chr = {}
- File.open(patient_file).each do |line|
- line.chomp!
- next if line.include?("#")
- patient, chr, start, stop, phenotype_profile = line.split("\t", 5)
- next if phenotype_profile.nil? #For skipping patients without phenotypes
- phenotypes = phenotype_profile.split('|')
- # phenotypes, rejected = hpo.translate_names2codes(phenotypes)
- phenotypes, rejected = hpo.translate_names(phenotypes)
- not_found = not_found | rejected
- phenotypes.each do |hpo_code|
- get_all_hpos(patient, hpo_code, patient2phenotype, hpo, hpo_count, add_parents) if !hpo.is_removable(hpo_code)
- end
- info = [patient, start.to_i, stop.to_i]
- add_record(patients_genomic_region_by_chr, chr, info)
- end
- return patient2phenotype, hpo_count, not_found, patients_genomic_region_by_chr
-end
-
-
-def get_all_hpos(patient, hpo_code, patient2phenotype, hpo, hpo_count, add_parents)
- add_record(hpo_count, hpo_code, patient)
- add_record(patient2phenotype, patient, hpo_code)
- if add_parents == 'root'
- # hpo_parent_codes = hpo.get_parents(hpo_code)
- hpo_parent_codes = hpo.get_ancestors(hpo_code)
- hpo_parent_codes.each do |parent_code|
- add_record(hpo_count, parent_code, patient)
- add_record(patient2phenotype, patient, parent_code)
- end
- end
-end
-
-def build_tripartite_network(patients2hpo, hpo_stats, ic_threshold, patients_by_cluster)
+def build_tripartite_network(patient_data, patients_by_cluster, add_parents, ont)
tripartite_network = []
patients_by_cluster.each do |patient, node_ids|
node_ids.each do |node_id|
tripartite_network << [node_id, patient]
end
end
- patients_list = patients_by_cluster.keys
- patients2hpo.each do |patient, code|
- if patients_list.include?(patient)
- code.each do |c|
- tripartite_network << [c, patient] if hpo_stats[c].last >= ic_threshold
- end
- end
+ patient_data.each_profile do |id, profile|
+ profile = profile.map{|term| ont.get_ancestors(term)}.flatten.uniq if add_parents == 'root'
+ profile.each do |term|
+ tripartite_network << [term, id]
+ end
end
return tripartite_network
end
-def compute_hpo_stats(hpo_count, patient_number)
- hpo_stats = {}
- patient_hpo_ic = []
- hpo_count.each do |hpo_code, patient_ids|
- hpo_freq = patient_ids.length.fdiv(patient_number) #hpo frequency in patients
- hpo_ic = -Math.log10(hpo_freq)
- hpo_stats[hpo_code] = [hpo_freq, hpo_ic]
- patient_ids.each do |patient_id|
- patient_hpo_ic << [patient_id, hpo_code, hpo_ic]
- end
- end
- return hpo_stats, patient_hpo_ic.sort{|a,b| a.first <=> b.first}
-end
-
-def write_hash(hash, file_path, header = [])
- File.open(file_path, 'w') do |handler|
- handler.puts header.join("\t") if !header.empty?
- hash.each do |key, array|
- handler.puts "#{key}\t#{array.join("\t")}"
- end
- end
-end
-
-def write_array(array, file_path)
- File.open(file_path, 'w') do |handler|
- array.each do |record|
- if record.class == String
- line = record
- else
- line = record.join("\t")
- end
- handler.puts line
- end
- end
-end
-
##############################
#OPTPARSE
##############################
options = {}
OptionParser.new do |opts|
opts.banner = "Usage: #{__FILE__} [options]"
- options[:cluster_file] = 'cluster_coords.txt'
- opts.on("-c", "--cluster_file PATH", "Cluster coords output file that will be used to translate SOR nodes") do |value|
- options[:cluster_file] = File.basename(value)
- end
+ options[:chromosome_col] = nil
+ opts.on("-c", "--chromosome_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the chromosome") do |data|
+ options[:chromosome_col] = data
+ end
- options[:excluded_hpo] = nil
- opts.on("-e", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
- options[:excluded_hpo] = excluded_hpo
+ options[:id_col] = nil
+ opts.on("-d", "--pat_id_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the patient id") do |data|
+ options[:id_col] = data
end
- options[:patient_file] = nil
- opts.on("-i", "--input_file PATH", "Input file with patients for parsing phenotypes to HPO codes") do |value|
- options[:patient_file] = value
+ options[:end_col] = nil
+ opts.on("-e", "--end_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the end mutation coordinate") do |data|
+ options[:end_col] = data
end
+
+ options[:ont_col] = nil
+ opts.on("-p", "--hpo_term_col INTEGER/STRING", "Column name if header true or 0-based position of the column with the HPO terms") do |data|
+ options[:ont_col] = data
+ end
- options[:mutation_type] = 'A'
- opts.on("-m", "--mutation_type STRING", "Type of patient mutation, either it is a deletion (d) or duplication (D)") do |type|
- options[:mutation_type] = type
+ options[:start_col] = nil
+ opts.on("-s", "--start_col INTEGER/STRING", "Column name if header is true, otherwise 0-based position of the column with the start mutation coordinate") do |data|
+ options[:start_col] = data
end
+ options[:separator] = '|'
+ opts.on("-S", "--hpo_separator STRING", "Set which character must be used to split the HPO profile. Default '|'") do |data|
+ options[:separator] = data
+ end
+
+ options[:names] = false
+ opts.on("-n", "--hpo_names", "Define if the input HPO are human readable names. Default false") do
+ options[:names] = true
+ end
+
+ options[:header] = true
+ opts.on("-H", "--header", "File has a line header. Default true") do
+ options[:header] = false
+ end
+
+ #===================================================================
+
+ options[:input_file] = nil
+ opts.on("-i", "--input_file PATH", "Input file with patients for parsing phenotypes to HPO codes") do |value|
+ options[:input_file] = value
+ end
+
options[:output_file] = 'tripartite_network.txt'
opts.on("-o", "--output_file PATH", "Output file for the tripartite network") do |value|
options[:output_file] = value
+ end
+
+ options[:cluster_file] = 'cluster_coords.txt'
+ opts.on("-u", "--cluster_file PATH", "Cluster coords output file that will be used to translate SOR nodes") do |value|
+ options[:cluster_file] = File.basename(value)
end
+ options[:excluded_hpo] = nil
+ opts.on("-x", "--excluded_hpo PATH", "List of HPO phenotypes to exclude (low informative)") do |excluded_hpo|
+ options[:excluded_hpo] = excluded_hpo
+ end
+
+ options[:tag] = 'A'
+ opts.on("-m", "--mutation_type STRING", "Type of patient mutation, either it is a deletion (d) or duplication (D)") do |type|
+ options[:tag] = type
+ end
+
options[:hpo_file] = nil
- opts.on("-p", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value|
+ opts.on("-O", "--hpo_file PATH", "Input HPO file for extracting HPO codes") do |value|
options[:hpo_file] = value
end
options[:add_parents] = nil
opts.on("-r", "--parents STRING", "'root' to add all parents until the ontology root. 'coh' to add parents until the most general term in the cohort(TODO).") do |value|
options[:add_parents] = value
end
- options[:hpo_stat_file] = 'hpo_stats.txt'
- opts.on("-s", "--hpo_stat_file PATH", "Output file with HPO codes, their frequency and CI") do |value|
- options[:hpo_stat_file] = File.basename(value)
- end
-
- options[:thresold] = 0
- opts.on("-t", "--info_thresold FLOAT", "IC thresold to discard non informative hpo. Default: 0.") do |thresold|
- options[:thresold] = thresold.to_f
- end
-
opts.on_tail("-h", "--help", "Show this message") do
puts opts
exit
end
@@ -174,29 +128,22 @@
###############################
output_folder = File.dirname(File.expand_path(options[:output_file]))
Dir.mkdir(output_folder) if !File.exists?(output_folder)
hpo_file = options[:hpo_file]
-hpo_file = ENV['hpo_file'] if hpo_file.nil?
-hpo_file = HPO_FILE if hpo_file.nil?
+hpo_file = !ENV['hpo_file'].nil? ? ENV['hpo_file'] : HPO_FILE if hpo_file.nil?
+Cohort.load_ontology(:hpo, hpo_file, options[:excluded_hpo])
+Cohort.act_ont = :hpo
+hpo = Cohort.get_ontology(Cohort.act_ont)
-# hpo = Ontology.new
-# hpo.load_black_list(options[:excluded_hpo]) if !options[:excluded_hpo].nil?
-# hpo.load_data(hpo_file)
-if !options[:excluded_hpo].nil?
- hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(options[:excluded_hpo]))
-else
- hpo = Ontology.new(file: hpo_file, load_file: true)
-end
-patients2hpo, hpo_count, not_found, chr_patients_genomic_region = loadPatientFile(options[:patient_file], hpo, options[:add_parents])
+patient_data, rejected_hpos_L, rejected_patients_L = Cohort_Parser.load(options)
+rejected_hpos_C, rejected_patients_C = patient_data.check
+rejected_hpos = rejected_hpos_L | rejected_hpos_C
+rejected_patients = rejected_patients_L + rejected_patients_C
+patient_data.remove_incomplete_records
+patient_data.index_vars
+patients_by_cluster, sors = patient_data.generate_cluster_regions(:reg_overlap, options[:tag], 1)
+tripartite_network = build_tripartite_network(patient_data, patients_by_cluster, options[:add_parents], hpo)
-hpo_stats, patient_hpo_ic = compute_hpo_stats(hpo_count, patients2hpo.length)
-patients_by_cluster, sors = generate_cluster_regions(chr_patients_genomic_region, options[:mutation_type])
-
-tripartite_network = build_tripartite_network(patients2hpo, hpo_stats, options[:thresold], patients_by_cluster)
-
-# write_array(not_found - hpo.excluded_codes, File.join(output_folder, 'missing_hpo_names'))
-write_array(not_found - hpo.removable_terms, File.join(output_folder, 'missing_hpo_names'))
+write_array(rejected_hpos, File.join(output_folder, 'missing_hpo_names'))
write_array(sors, File.join(output_folder, options[:cluster_file]))
-write_hash(hpo_stats.select{|hp_code, stats| stats.last > options[:thresold]}, File.join(output_folder, options[:hpo_stat_file]), %w[HPOcode Frequency IC])
-write_array(tripartite_network, options[:output_file])
-write_array(patient_hpo_ic, File.join(output_folder, 'filtered_hpo.txt'))
+write_array(tripartite_network, options[:output_file])
\ No newline at end of file