lib/pets/generalMethods.rb in pets-0.1.4 vs lib/pets/generalMethods.rb in pets-0.2.3

- old
+ new

@@ -1,9 +1,41 @@
 require 'uri'
+require 'net/ftp'
+require 'net/http'
+require 'zlib'
+require 'json'
+require 'benchmark'
+
+def system_call(code_folder, script, args_string)
+  cmd = File.join(code_folder, script) + ' ' + args_string
+  puts "==> #{cmd}"
+  Benchmark.bm do |x|
+    x.report{ system(cmd) }
+  end
+end
+
+def read_excluded_hpo_file(file)
+  excluded_hpo = []
+  File.open(file).each do |line|
+    excluded_hpo << line.chomp
+  end
+  return excluded_hpo
+end
+
 #Common methods for predictors
 #Training file example = 9  131371492  131375954  HP:0010974  2.41161970596  9.3.A.5
 #1. Indexing by chr (region)
+def coor_overlap?(ref_start, ref_stop, start, stop)
+  overlap = false
+  if (stop > ref_start && stop <= ref_stop) ||
+     (start >= ref_start && start < ref_stop) ||
+     (start <= ref_start && stop >= ref_stop) ||
+     (start > ref_start && stop < ref_stop)
+    overlap = true
+  end
+  return overlap
+end
 def load_training_file4regions(training_file)
   training_set = {}
   posInfo = loadFile(training_file)
   posInfo.each do |info|
@@ -55,95 +87,11 @@
     end
   end
   return information
 end
-def add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
-  if !hpo_black_list.include?(id)
-    attributes = [id, name, is_a - hpo_black_list, syn]
-    hpo_storage[id] = attributes
-    alt_ids.each do |altid|
-      hpo_storage[altid] = attributes
-    end
-  end
-end
-def load_hpo_file(hpo_file, hpo_black_list=[])
-  hpo_storage = {}
-  id = nil
-  name = nil
-  alt_ids = []
-  syn = []
-  is_a = []
-  File.open(hpo_file).each do |line|
-    line.chomp!
-    tag, info = line.split(': ')
-    if tag == 'id' || tag == 'name' || tag == 'is_a' || tag == 'synonym' || tag == 'alt_id'
-      if tag == 'id'
-        add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list) if !name.nil?
-        id = info
-        name = nil
-        alt_id = []
-        syn = []
-        is_a = []
-      end
-      if tag == 'alt_id'
-        alt_ids << info
-      elsif tag == 'is_a'
-        is_a << info.split(' ! ')[0]
-      elsif tag == 'synonym'
-        syn << info.split('"')[1] #to keep only the name of the synonym
-      else
-        name = info
-      end
-    end
-  end
-  add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
-  # STDERR.puts hpo_storage.inspect
-  # Process.exit
-  return hpo_storage
-end
-
-def load_hpo_black_list(excluded_hpo_file)
-  excluded_hpos = []
-  File.open(excluded_hpo_file).each do |line|
-    line.chomp!
-    excluded_hpos << line
-  end
-  return excluded_hpos
-end
-
-def create_hpo_dictionary(hpo_storage)
-  hpo_dictionary = {}
-  hpo_storage.each do |hpo, metadata|
-    hpo_code, hpo_name, hpo_parents, hpo_synonyms = metadata
-    hpo_dictionary[hpo_name] = hpo_code
-    hpo_synonyms.each do |syn|
-      hpo_dictionary[syn] = hpo_code
-    end
-  end
-  return hpo_dictionary
-end
-
-def get_child_parent_relations(hpo_storage)
-  # for getting hpo childs
-  storage_child = {}
-  hpo_storage.each do |hpo_code, hpo_data|
-    id, name, is_a, syn = hpo_data
-    hpo_child = [id, name]
-    is_a.each do |par_hpo_code|
-      query = storage_child[par_hpo_code]
-      if query.nil?
-        storage_child[par_hpo_code] = [hpo_child]
-      else
-        query << hpo_child
-      end
-    end
-  end
-  return storage_child
-end
-
 def compute_IC_values(patient_data, total_patients)
   patients_per_hpo = Hash.new(0)
   last_patient_ID = ''
   patient_data.each do |patient_ID, metadata|
     patient, count = patient_ID.split('_i')
@@ -161,38 +109,16 @@
     patients_per_hpo[hpo] = -Math.log10(patient_number.fdiv(total_patients))
   end
   return patients_per_hpo
 end
-# def get_child_parent_relations(hpo_storage)
-#   # for getting hpo childs
-#   storage_child = {}
-#   hpo_storage.each do |hpo_code, hpo_data|
-#     STDERR.puts hpo_data[3].inspect
-#     Process.exit
-#     main_code, hpo_name, synonyms, parents = hpo_data
-#     parents.each do |par_hpo_code, par_hpo_name|
-#       query = storage_child[par_hpo_code]
-#       hpo_child = [main_code, hpo_name]
-#       if query.nil?
-#         storage_child[par_hpo_code] = [par_hpo_name, [hpo_child]]
-#       else
-#         query.last << hpo_child
-#       end
-#     end
-#   end
-
-#   return storage_child
-# end
-
-
 def load_hpo_ci_values(information_coefficient_file)
   hpos_ci_values = {}
   File.open(information_coefficient_file).each do |line|
     line.chomp!
     hpo_code, ci = line.split("\t")
-    hpos_ci_values[hpo_code] = ci.to_f
+    hpos_ci_values[hpo_code.to_sym] = ci.to_f
   end
   return hpos_ci_values
 end
 
 def load_clustered_patients(file)
@@ -230,17 +156,18 @@
       attributes = {}
       fields[8].split(';').each do |pair|
        key, value = pair.split('=')
        attributes[key] = value
       end
-      geneNames = []
-      geneNames << attributes['gene'] if !attributes['gene'].nil?
-      geneNames.concat(attributes['gene_synonym'].split(',')) if !attributes['gene_synonym'].nil?
+      geneName = nil
+      geneName = attributes['gene'] if !attributes['gene'].nil?
+      geneSyns = []
+      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
       description = attributes['description']
       description = URI.unescape(description) if !description.nil?
       attributes['Dbxref'] =~ /GeneID:(\d+)/
-      gene_list[$1] = [geneNames, description]
+      gene_list[$1] = [geneName, geneSyns, description]
       genes << [$1, fields[3].to_i, fields[4].to_i]
     end
   end
   gene_location[current_chr] = genes
   return gene_list, gene_location
@@ -332,18 +259,15 @@
   return gene2kegg
 end
 
 def merge_genes_with_kegg_data(gene_list, kegg_data)
   merged_data = {}
-  gene_list.each do |geneID, attributes|
-    query = kegg_data[geneID]
-    if query.nil?
-      attributes << []
-    else
-      attributes << query
-    end
-    merged_data[geneID] = attributes
+  gene_list.each do |geneID, values|
+    geneName, geneSyn, description = values
+    kegg_entry = kegg_data[geneID]
+    kegg_entry = [] if kegg_entry.nil?
+    merged_data[geneID] = [geneName, description, kegg_entry, geneSyn]
   end
   return merged_data
 end
 
 def write_compressed_plain_file(data, path)
@@ -364,11 +288,11 @@
 def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
   pathways_genes_in_predictions = {}
   genes_in_predictions = []
   genes_clusters.each do |cluster|
     cluster.each do |geneID, data|
-      geneNames, description, pathways = data
+      geneName, description, pathways, geneSyns = data
       pathways.each do |pathway|
         query = pathways_genes_in_predictions[pathway]
         if query.nil?
           pathways_genes_in_predictions[pathway] = [geneID]
         else
@@ -508,35 +432,39 @@
 def load_patient_cohort(options)
   patient_data = {}
   count = 0
   fields2extract = get_fields2extract(options)
   field_numbers = fields2extract.values
-  original_ids = []
-  File.open(options[:input_file]).each do |line|
-    line.chomp!
-    if options[:header] && count == 0
-      line.gsub!(/#\s*/,'') # correct comment like headers
-      field_names = line.split("\t")
-      get_field_numbers2extract(field_names, fields2extract)
-      field_numbers = fields2extract.values
-    else
-      fields = line.split("\t")
-      pat_record = field_numbers.map{|n| fields[n]}
-      if fields2extract[:pat_id_col].nil?
-        pat_id = "pat_#{count}" #generate ids
-      else
-        original_id = pat_record.shift
-        original_ids << original_id
-        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
-      end
-      patient_data[pat_id] = pat_record
-    end
-    count +=1
-  end
-  fields2extract[:pat_id_col].nil? ? patient_number = count : patient_number = original_ids.uniq.length
-  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
-  return patient_data, patient_number
+  File.open(options[:input_file]).each do |line|
+    line.chomp!
+    if options[:header] && count == 0
+      line.gsub!(/#\s*/,'') # correct comment like headers
+      field_names = line.split("\t")
+      get_field_numbers2extract(field_names, fields2extract)
+      field_numbers = fields2extract.values
+    else
+      fields = line.split("\t")
+      pat_record = field_numbers.map{|n| fields[n]}
+      if fields2extract[:pat_id_col].nil?
+        pat_id = "pat_#{count}" #generate ids
+      else
+        original_id = pat_record.shift
+        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
+      end
+      if !pat_record[0].nil?
+        pat_record[0] = pat_record[0].split(options[:hpo_separator])
+      else
+        pat_record[0] = []
+      end
+      pat_record[2] = pat_record[2].to_i if !options[:start_col].nil?
+      pat_record[3] = pat_record[3].to_i if !options[:end_col].nil?
+      patient_data[pat_id] = pat_record
+    end
+    count +=1
+  end
+  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
+  return patient_data
 end
 
 def get_fields2extract(options)
   fields2extract = {}
   [:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
@@ -551,6 +479,87 @@
 def get_field_numbers2extract(field_names, fields2extract)
   fields2extract.each do |field, name|
    fields2extract[field] = field_names.index(name)
   end
-end
\ No newline at end of file
+end
+
+def download(ftp_server, path, name)
+  ftp = Net::FTP.new()
+  ftp.connect(ftp_server)
+  ftp.login
+  ftp.getbinaryfile(path, name)
+  ftp.close
+end
+
+def get_and_parse_external_data(all_paths)
+  sources = [
+    ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
+    ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
+    ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
+  ]
+  sources.each do |server, path, output|
+    download(server, path, output) if !File.exists?(output)
+  end
+
+  genes_with_kegg = {}
+  gene_location = {}
+  if !File.exists?(all_paths[:gene_data_with_pathways]) || !File.exists?(all_paths[:gene_location])
+    gene_list, gene_location = load_gene_data(all_paths[:gene_data])
+    ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
+    kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
+    genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
+    write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
+    write_compressed_plain_file(gene_location, all_paths[:gene_location])
+  else
+    gene_location = read_compressed_json(all_paths[:gene_location])
+    genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
+  end
+  return gene_location, genes_with_kegg
+end
+
+def get_detailed_similarity(profile, candidates, evidences, hpo)
+  profile_length = profile.length
+  matrix = []
+  profile_length.times{ matrix << Array.new(candidates.length, 0)}
+  cand_number = 0
+  candidates.each do |candidate_id, similarity|
+    local_sim = []
+    candidate_evidence = evidences[candidate_id]
+    profile.each do |profile_term|
+      candidate_evidence.each do |candidate_term|
+        term_sim = hpo.compare([candidate_term], [profile_term], sim_type: :lin, bidirectional: false)
+        local_sim << [profile_term, candidate_term, term_sim]
+      end
+    end
+    local_sim.sort!{|s1, s2| s2.last <=> s1.last}
+
+    final_pairs = []
+    processed_profile_terms = []
+    processed_candidate_terms = []
+    local_sim.each do |pr_term, cd_term, sim|
+      if !processed_profile_terms.include?(pr_term) && !processed_candidate_terms.include?(cd_term)
+        final_pairs << [pr_term, cd_term, sim]
+        processed_profile_terms << pr_term
+        processed_candidate_terms << cd_term
+      end
+      break if profile_length == processed_profile_terms.length
+    end
+    final_pairs.each do |pr_term, cd_term, similarity|
+      matrix[profile.index(pr_term)][cand_number] = similarity
+    end
+    cand_number += 1
+  end
+  return matrix
+end
+
+def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit)
+  candidates = similarities.to_a.sort{|s1, s2| s2.last <=> s1.last}.first(candidate_limit)
+  candidates_ids = candidates.map{|c| c.first}
+  candidate_similarity_matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
+  candidate_similarity_matrix.each_with_index do |row, i|
+    row.unshift(hpo.translate_id(reference_prof[i]))
+  end
+  candidate_similarity_matrix.sort!{|r1,r2| r2[1..r2.length].inject(0){|sum,n| sum +n} <=> r1[1..r1.length].inject(0){|sum,n| sum +n}}
+  candidate_similarity_matrix = candidate_similarity_matrix.first(term_limit)
+  return candidate_similarity_matrix, candidates, candidates_ids
+end
\ No newline at end of file
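
The new coor_overlap? predicate returns true only when the two ranges share a segment of positive length: taken together, its four clauses are equivalent to start < ref_stop && stop > ref_start (the fourth clause is already implied by the first two), so ranges that merely touch at an endpoint do not count as overlapping. A minimal sketch of the expected behaviour, with made-up coordinates:

    coor_overlap?(100, 200, 150, 250) #=> true   (partial overlap)
    coor_overlap?(100, 200, 120, 180) #=> true   (query inside reference)
    coor_overlap?(100, 200,  50, 300) #=> true   (query contains reference)
    coor_overlap?(100, 200, 200, 300) #=> false  (single-point contact only)
    coor_overlap?(100, 200, 250, 300) #=> false  (disjoint)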
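load_gene_data now keeps the primary gene name and its synonyms separate, and merge_genes_with_kegg_data reorders the merged tuple to [geneName, description, kegg_entry, geneSyn], which is exactly the order compute_pathway_enrichment destructures (geneName, description, pathways, geneSyns = data). For a hypothetical GeneID the two shapes look like this (the values are illustrative, not taken from the real annotation files):

    gene_list['1956']   #=> ['EGFR', ['ERBB', 'HER1'], 'epidermal growth factor receptor']
    merged_data['1956'] #=> ['EGFR', 'epidermal growth factor receptor', kegg_pathways, ['ERBB', 'HER1']]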
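load_patient_cohort now normalises each record while reading the cohort file: the HPO column is split into an array on options[:hpo_separator], the start/end coordinates are cast to integers when those columns are configured, and the second return value (the patient count) of the old version is gone. Assuming a tab-separated cohort file, and with hypothetical column names and separator chosen only for illustration, each stored record is shaped as [hpo_terms, chromosome, start, stop]:

    # Hypothetical option values, shown only to illustrate the record layout
    options = {
      input_file: 'cohort.txt', header: true,
      pat_id_col: 'patient', hpo_col: 'phenotypes', chromosome_col: 'chr',
      start_col: 'start', end_col: 'stop', hpo_separator: '|'
    }
    patient_data = load_patient_cohort(options)
    # patient_data['P001_i1'] #=> [['HP:0010974', 'HP:0001250'], '9', 131371492, 131375954]
    #                              HPO terms (split)              chr  start.to_i  stop.to_i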
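get_detailed_similarity scores every profile term against every candidate evidence term (Lin similarity computed through the hpo ontology object passed in, treated here as an opaque dependency) and then greedily keeps the best-scoring one-to-one pairs. The pairing step can be separated from the ontology dependency; a self-contained sketch of the same greedy selection over precomputed scores (the helper name and the data are invented for illustration):

    # Greedy one-to-one pairing over (profile_term, candidate_term, score) triples,
    # mirroring the selection loop inside get_detailed_similarity.
    def greedy_best_pairs(scored_pairs, profile_length)
      final_pairs = []
      used_profile = []
      used_candidate = []
      scored_pairs.sort_by{|triple| -triple.last}.each do |pr_term, cd_term, sim|
        next if used_profile.include?(pr_term) || used_candidate.include?(cd_term)
        final_pairs << [pr_term, cd_term, sim]
        used_profile << pr_term
        used_candidate << cd_term
        break if used_profile.length == profile_length
      end
      return final_pairs
    end

    pairs = [['HP:A', 'HP:X', 0.9], ['HP:A', 'HP:Y', 0.8], ['HP:B', 'HP:X', 0.7], ['HP:B', 'HP:Y', 0.2]]
    greedy_best_pairs(pairs, 2) #=> [['HP:A', 'HP:X', 0.9], ['HP:B', 'HP:Y', 0.2]]

Greedy matching is fast but not globally optimal: in the example above, pairing HP:A with HP:Y and HP:B with HP:X would give a higher total (1.5 vs 1.1), yet the greedy pass commits to the single best pair first.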