lib/pets/generalMethods.rb in pets-0.1.4 vs lib/pets/generalMethods.rb in pets-0.2.3
- old
+ new
@@ -1,9 +1,41 @@
require 'uri'
+require 'net/ftp'
+require 'net/http'
+require 'zlib'
+require 'json'
+require 'benchmark'
+
# Runs an external script located under code_folder, echoing the full command
# line and printing wall-clock timing for the run via Benchmark.
# code_folder:: directory that contains the script
# script:: script file name
# args_string:: pre-formatted argument string appended to the command
def system_call(code_folder, script, args_string)
  cmd = "#{File.join(code_folder, script)} #{args_string}"
  puts "==> #{cmd}"
  Benchmark.bm { |bench| bench.report { system(cmd) } }
end
+
# Reads a plain-text file with one excluded HPO term per line.
# file:: path to the exclusion list
# Returns an Array of chomped lines (term codes).
# Fixed: the original used File.open without a block, which never closed the
# file handle; File.foreach reads line-by-line and closes the file itself.
def read_excluded_hpo_file(file)
  excluded_hpo = []
  File.foreach(file) do |line|
    excluded_hpo << line.chomp
  end
  return excluded_hpo
end
+
#Common methods for predictors
#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5
#1. Indexing by chr (region)
# Tests whether the interval [start, stop] overlaps the reference interval
# [ref_start, ref_stop]. Covers partial overlaps on either side, full
# containment of the reference, and full containment within the reference.
# NOTE(review): boundary handling mixes strict and non-strict comparisons
# (e.g. stop == ref_start is NOT an overlap but stop == ref_stop is) —
# preserved exactly as written.
def coor_overlap?(ref_start, ref_stop, start, stop)
  return true if stop > ref_start && stop <= ref_stop   # right end falls inside reference
  return true if start >= ref_start && start < ref_stop # left end falls inside reference
  return true if start <= ref_start && stop >= ref_stop # query contains reference
  return true if start > ref_start && stop < ref_stop   # query inside reference
  false
end
def load_training_file4regions(training_file)
training_set = {}
posInfo = loadFile(training_file)
posInfo.each do |info|
@@ -55,95 +87,11 @@
end
end
return information
end
-def add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
- if !hpo_black_list.include?(id)
- attributes = [id, name, is_a - hpo_black_list, syn]
- hpo_storage[id] = attributes
- alt_ids.each do |altid|
- hpo_storage[altid] = attributes
- end
- end
-end
-def load_hpo_file(hpo_file, hpo_black_list=[])
- hpo_storage = {}
- id = nil
- name = nil
- alt_ids = []
- syn = []
- is_a = []
- File.open(hpo_file).each do |line|
- line.chomp!
- tag, info = line.split(': ')
- if tag == 'id' || tag == 'name' || tag == 'is_a' || tag == 'synonym' || tag == 'alt_id'
- if tag == 'id'
- add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list) if !name.nil?
- id = info
- name = nil
- alt_id = []
- syn = []
- is_a = []
- end
- if tag == 'alt_id'
- alt_ids << info
- elsif tag == 'is_a'
- is_a << info.split(' ! ')[0]
- elsif tag == 'synonym'
- syn << info.split('"')[1] #to keep only the name of the synonym
- else
- name = info
- end
- end
- end
- add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
- # STDERR.puts hpo_storage.inspect
- # Process.exit
- return hpo_storage
-end
-
-def load_hpo_black_list(excluded_hpo_file)
- excluded_hpos = []
- File.open(excluded_hpo_file).each do |line|
- line.chomp!
- excluded_hpos << line
- end
- return excluded_hpos
-end
-
-def create_hpo_dictionary(hpo_storage)
- hpo_dictionary = {}
- hpo_storage.each do |hpo, metadata|
- hpo_code, hpo_name, hpo_parents, hpo_synonyms = metadata
- hpo_dictionary[hpo_name] = hpo_code
- hpo_synonyms.each do |syn|
- hpo_dictionary[syn] = hpo_code
- end
- end
- return hpo_dictionary
-end
-
-def get_child_parent_relations(hpo_storage)
- # for getting hpo childs
- storage_child = {}
- hpo_storage.each do |hpo_code, hpo_data|
- id, name, is_a, syn = hpo_data
- hpo_child = [id, name]
- is_a.each do |par_hpo_code|
- query = storage_child[par_hpo_code]
- if query.nil?
- storage_child[par_hpo_code] = [hpo_child]
- else
- query << hpo_child
- end
- end
- end
- return storage_child
-end
-
def compute_IC_values(patient_data, total_patients)
patients_per_hpo = Hash.new(0)
last_patient_ID = ''
patient_data.each do |patient_ID, metadata|
patient, count = patient_ID.split('_i')
@@ -161,38 +109,16 @@
patients_per_hpo[hpo] = -Math.log10(patient_number.fdiv(total_patients))
end
return patients_per_hpo
end
-# def get_child_parent_relations(hpo_storage)
-# # for getting hpo childs
-# storage_child = {}
-# hpo_storage.each do |hpo_code, hpo_data|
-# STDERR.puts hpo_data[3].inspect
-# Process.exit
-# main_code, hpo_name, synonyms, parents = hpo_data
-# parents.each do |par_hpo_code, par_hpo_name|
-# query = storage_child[par_hpo_code]
-# hpo_child = [main_code, hpo_name]
-# if query.nil?
-# storage_child[par_hpo_code] = [par_hpo_name, [hpo_child]]
-# else
-# query.last << hpo_child
-# end
-# end
-# end
-
-# return storage_child
-# end
-
-
# Loads per-HPO information-coefficient values from a two-column TSV file
# (HPO code <TAB> coefficient).
# Returns a Hash mapping HPO code (Symbol) => coefficient (Float).
# Fixed: the original used File.open without a block, leaking the file
# handle; File.foreach closes the file automatically.
def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.foreach(information_coefficient_file) do |line|
    line.chomp!
    hpo_code, ci = line.split("\t")
    hpos_ci_values[hpo_code.to_sym] = ci.to_f
  end
  return hpos_ci_values
end
def load_clustered_patients(file)
@@ -230,17 +156,18 @@
attributes = {}
fields[8].split(';').each do |pair|
key, value = pair.split('=')
attributes[key] = value
end
- geneNames = []
- geneNames << attributes['gene'] if !attributes['gene'].nil?
- geneNames.concat(attributes['gene_synonym'].split(',')) if !attributes['gene_synonym'].nil?
+ geneName = nil
+ geneName = attributes['gene'] if !attributes['gene'].nil?
+ geneSyns = []
+ geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
description = attributes['description']
description = URI.unescape(description) if !description.nil?
attributes['Dbxref'] =~ /GeneID:(\d+)/
- gene_list[$1] = [geneNames, description]
+ gene_list[$1] = [geneName, geneSyns, description]
genes << [$1, fields[3].to_i, fields[4].to_i]
end
end
gene_location[current_chr] = genes
return gene_list, gene_location
@@ -332,18 +259,15 @@
return gene2kegg
end
# Attaches KEGG annotations to each gene record.
# gene_list:: Hash of geneID => [geneName, geneSyns, description]
# kegg_data:: Hash of geneID => KEGG entry (missing genes get an empty Array)
# Returns a Hash of geneID => [geneName, description, kegg_entry, geneSyns].
def merge_genes_with_kegg_data(gene_list, kegg_data)
  gene_list.each_with_object({}) do |(geneID, values), merged_data|
    geneName, geneSyn, description = values
    kegg_entry = kegg_data[geneID]
    kegg_entry = [] if kegg_entry.nil?
    merged_data[geneID] = [geneName, description, kegg_entry, geneSyn]
  end
end
def write_compressed_plain_file(data, path)
@@ -364,11 +288,11 @@
def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
pathways_genes_in_predictions = {}
genes_in_predictions = []
genes_clusters.each do |cluster|
cluster.each do |geneID, data|
- geneNames, description, pathways = data
+ geneName, description, pathways, geneSyns = data
pathways.each do |pathway|
query = pathways_genes_in_predictions[pathway]
if query.nil?
pathways_genes_in_predictions[pathway] = [geneID]
else
@@ -508,35 +432,39 @@
# Loads a patient cohort from the tab-separated file in options[:input_file].
# Column positions come from get_fields2extract(options); when
# options[:header] is set, the first line is parsed to resolve column names
# into indices via get_field_numbers2extract.
# Per patient record:
#  - pat_record[0] (HPO terms) is split on options[:hpo_separator] (or [] if absent)
#  - pat_record[2]/[3] (start/end coordinates) are cast to Integer when the
#    corresponding column option is given
# Patient ids are generated ("pat_N") when no id column is configured,
# otherwise made unique by suffixing "_iN". Sets options[:pat_id_col] to
# 'generated' in the generated-id case.
# Returns the Hash pat_id => pat_record.
# Fixed: the original used File.open without a block, leaking the file
# handle; File.foreach closes the file automatically.
def load_patient_cohort(options)
  patient_data = {}
  count = 0
  fields2extract = get_fields2extract(options)
  field_numbers = fields2extract.values
  File.foreach(options[:input_file]) do |line|
    line.chomp!
    if options[:header] && count == 0
      line.gsub!(/#\s*/,'') # correct comment like headers
      field_names = line.split("\t")
      get_field_numbers2extract(field_names, fields2extract)
      field_numbers = fields2extract.values
    else
      fields = line.split("\t")
      pat_record = field_numbers.map{|n| fields[n]}
      if fields2extract[:pat_id_col].nil?
        pat_id = "pat_#{count}" # generate ids
      else
        original_id = pat_record.shift
        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
      end
      if !pat_record[0].nil?
        pat_record[0] = pat_record[0].split(options[:hpo_separator])
      else
        pat_record[0] = []
      end
      pat_record[2] = pat_record[2].to_i if !options[:start_col].nil?
      pat_record[3] = pat_record[3].to_i if !options[:end_col].nil?
      patient_data[pat_id] = pat_record
    end
    count += 1
  end
  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
  return patient_data
end
def get_fields2extract(options)
fields2extract = {}
[:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
@@ -551,6 +479,87 @@
# Resolves the column NAMES stored as values in fields2extract into their
# numeric positions within field_names, mutating fields2extract in place.
# Names not present in field_names become nil (Array#index semantics).
def get_field_numbers2extract(field_names, fields2extract)
  fields2extract.each_key do |field|
    fields2extract[field] = field_names.index(fields2extract[field])
  end
end
+
# Fetches a remote file over FTP with anonymous login.
# ftp_server:: host name
# path:: remote file path
# name:: local output path
# Fixed: the original opened the connection manually and only closed it on
# the success path, leaking the connection if getbinaryfile raised.
# Net::FTP.open closes the connection even when the block raises.
def download(ftp_server, path, name)
  Net::FTP.open(ftp_server) do |ftp|
    ftp.login
    ftp.getbinaryfile(path, name)
  end
end
+
# Downloads (when not already on disk) the NCBI gene annotation and
# biosystems files, then builds — or reloads from the cached compressed
# files — the gene location index and the gene/KEGG-pathway table.
# all_paths:: Hash with keys :gene_data, :biosystems_gene, :biosystems_info,
#             :gene_data_with_pathways, :gene_location (local file paths)
# Returns [gene_location, genes_with_kegg].
# Fixed: File.exists? is deprecated and was removed in Ruby 3.2; File.exist?
# is the supported predicate.
def get_and_parse_external_data(all_paths)
  sources = [
    ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
    ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
    ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
  ]
  sources.each do |server, path, output|
    download(server, path, output) if !File.exist?(output)
  end

  genes_with_kegg = {}
  gene_location = {}
  if !File.exist?(all_paths[:gene_data_with_pathways]) || !File.exist?(all_paths[:gene_location])
    gene_list, gene_location = load_gene_data(all_paths[:gene_data])
    ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
    kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
    genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
    write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
    write_compressed_plain_file(gene_location, all_paths[:gene_location])
  else
    gene_location = read_compressed_json(all_paths[:gene_location])
    genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
  end
  return gene_location, genes_with_kegg
end
+
# Builds a term-by-candidate similarity matrix for a reference profile.
# profile:: array of HPO terms (matrix rows, in profile order)
# candidates:: array of [candidate_id, similarity] pairs (matrix columns)
# evidences:: Hash of candidate_id => array of evidence terms
# hpo:: ontology object answering compare(a, b, sim_type:, bidirectional:)
# For each candidate, every (profile term, evidence term) pair is scored,
# then pairs are greedily matched best-score-first so each profile term and
# each evidence term is used at most once; unmatched cells stay 0.
def get_detailed_similarity(profile, candidates, evidences, hpo)
  n_terms = profile.length
  matrix = Array.new(n_terms) { Array.new(candidates.length, 0) }
  candidates.each_with_index do |(candidate_id, _similarity), col|
    candidate_evidence = evidences[candidate_id]
    # score every profile-term / evidence-term pair
    pair_scores = profile.flat_map do |profile_term|
      candidate_evidence.map do |candidate_term|
        [profile_term, candidate_term, hpo.compare([candidate_term], [profile_term], sim_type: :lin, bidirectional: false)]
      end
    end
    pair_scores.sort! { |a, b| b.last <=> a.last }

    # greedy one-to-one matching, best pairs first
    best_pairs = []
    used_profile = []
    used_candidate = []
    pair_scores.each do |pr_term, cd_term, sim|
      if !used_profile.include?(pr_term) && !used_candidate.include?(cd_term)
        best_pairs << [pr_term, cd_term, sim]
        used_profile << pr_term
        used_candidate << cd_term
      end
      break if n_terms == used_profile.length
    end
    best_pairs.each do |pr_term, _cd_term, sim|
      matrix[profile.index(pr_term)][col] = sim
    end
  end
  return matrix
end
+
# Produces a trimmed, human-readable similarity matrix for a reference
# profile against the top candidates.
# Keeps the candidate_limit best entries from similarities, computes the
# detailed term-level matrix, prefixes each row with the translated term
# name (hpo.translate_id), sorts rows by descending row-sum of the numeric
# cells and keeps the top term_limit rows.
# Returns [matrix, candidates, candidates_ids].
def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit)
  candidates = similarities.to_a.sort { |a, b| b.last <=> a.last }.first(candidate_limit)
  candidates_ids = candidates.map(&:first)
  matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
  matrix.each_with_index do |row, i|
    row.unshift(hpo.translate_id(reference_prof[i]))
  end
  # sum of every cell after the leading label column
  row_total = lambda { |row| row[1..row.length].inject(0) { |sum, n| sum + n } }
  matrix.sort! { |r1, r2| row_total.call(r2) <=> row_total.call(r1) }
  matrix = matrix.first(term_limit)
  return matrix, candidates, candidates_ids
end
\ No newline at end of file