require 'uri'
require 'net/ftp'
require 'net/http'
require 'zlib'
require 'json'
require 'benchmark'

def system_call(code_folder, script, args_string)
  cmd = File.join(code_folder, script) + ' ' + args_string
  puts "==> #{cmd}"
  Benchmark.bm do |x|
    x.report{ system(cmd) }
  end
end

def read_excluded_hpo_file(file)
  excluded_hpo = []
  File.open(file).each do |line|
    excluded_hpo << line.chomp
  end
  return excluded_hpo
end

# Common methods for predictors
# Training file example: 9	131371492	131375954	HP:0010974	2.41161970596	9.3.A.5
# 1. Indexing by chr (region)
def coor_overlap?(ref_start, ref_stop, start, stop)
  overlap = false
  if (stop > ref_start && stop <= ref_stop) ||
     (start >= ref_start && start < ref_stop) ||
     (start <= ref_start && stop >= ref_stop) ||
     (start > ref_start && stop < ref_stop)
    overlap = true
  end
  return overlap
end

def load_training_file4regions(training_file)
  training_set = {}
  posInfo = loadFile(training_file)
  posInfo.each do |info|
    chr = info.shift
    query = training_set[chr]
    if query.nil?
      training_set[chr] = [info]
    else
      query << info
    end
  end
  return training_set
end

# 2. Indexing by hpo (code)
# Prepare training file for analysis using phenotype2region prediction
def load_training_file4HPO(training_file, threshold=0)
  training_set = {}
  information = loadFile(training_file, threshold)
  information.each do |info|
    hpoCode = info.delete_at(4)
    query = training_set[hpoCode]
    if query.nil?
      training_set[hpoCode] = [info]
    else
      query << info
    end
  end
  # STDERR.puts training_set.keys.inspect
  return training_set
end

# 3. Load training info file:
# Chr;Start;Stop;HPO;Association;node
def loadFile(file, threshold=0)
  information = []
  File.open(file).each do |line|
    line.chomp!
    allInfo = line.split("\t")
    associationValue = allInfo[4].to_f
    if associationValue >= threshold
      chr = allInfo[0]
      startPos = allInfo[1].to_i
      stopPos = allInfo[2].to_i
      hpoCode = allInfo[3]
      nodeID = allInfo[5]
      information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
    end
  end
  return information
end

def compute_IC_values(patient_data, total_patients)
  patients_per_hpo = Hash.new(0)
  last_patient_ID = ''
  patient_data.each do |patient_ID, metadata|
    patient, count = patient_ID.split('_i')
    if patient != last_patient_ID
      hpos, chr, start, stop = metadata
      hpos.each do |h|
        patients_per_hpo[h] += 1
      end
    end
    last_patient_ID = patient
  end
  # STDERR.puts patients_per_hpo.inspect
  # Process.exit
  patients_per_hpo.each do |hpo, patient_number|
    patients_per_hpo[hpo] = -Math.log10(patient_number.fdiv(total_patients))
  end
  return patients_per_hpo
end

def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.open(information_coefficient_file).each do |line|
    line.chomp!
    hpo_code, ci = line.split("\t")
    hpos_ci_values[hpo_code.to_sym] = ci.to_f
  end
  return hpos_ci_values
end

def load_clustered_patients(file)
  clusters = {}
  File.open(file).each do |line|
    line.chomp!
    pat_id, cluster_id = line.split("\t")
    query = clusters[cluster_id]
    if query.nil?
      clusters[cluster_id] = [pat_id]
    else
      query << pat_id
    end
  end
  return clusters
end

def load_gene_data(gene_data_path)
  gene_list = {} # geneID => attr
  gene_location = {} # chr => gene
  infile = open(gene_data_path)
  gz = Zlib::GzipReader.new(infile)
  current_chr = nil
  genes = []
  gz.each_line do |line|
    line.chomp!
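    # GFF3 records carry 9 tab-separated fields; the attribute string in the
    # last field ('key=value' pairs joined by ';') is parsed below. Region
    # lines tagged 'genome=chromosome' open a new chromosome block, so genes
    # can be grouped per chromosome as the file is read sequentially.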
    next if line =~ /^#/
    fields = line.split("\t")
    if fields[8].include?('genome=chromosome')
      chr = fields[8].split(';')[1].split('=').last
      # skip the initial nil key before the first chromosome block is seen
      gene_location[current_chr] = genes if !current_chr.nil?
      genes = []
      current_chr = chr
    elsif fields[2] == 'gene'
      attributes = {}
      fields[8].split(';').each do |pair|
        key, value = pair.split('=')
        attributes[key] = value
      end
      geneName = nil
      geneName = attributes['gene'] if !attributes['gene'].nil?
      geneSyns = []
      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
      description = attributes['description']
      # URI.unescape was removed in Ruby 3.0; percent-decode via the default parser instead
      description = URI::DEFAULT_PARSER.unescape(description) if !description.nil?
      attributes['Dbxref'] =~ /GeneID:(\d+)/
      gene_list[$1] = [geneName, geneSyns, description]
      genes << [$1, fields[3].to_i, fields[4].to_i]
    end
  end
  gene_location[current_chr] = genes
  return gene_list, gene_location
end

def parse_kegg_data(query_genes)
  kegg_data = {} # gene => attb
  while !query_genes.empty?
    gene_set = query_genes.shift(10)
    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
    uri = URI(url)
    response = Net::HTTP.get(uri)
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ ||
            line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        parsing_pathway_field = false
      elsif parsing_pathway_field
        pathways << line.strip.split(' ', 2)
      elsif line == '///'
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      end
    end
  end
  return kegg_data
end

def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
  kegg_data = {}
  gene2biosystems = load_biosystem2gene_dictionary(biosystems_gene_path)
  keggAttributes = loadBiosystemsInfo(biosystems_info_path, 'KEGG')
  keggAttributes.select!{|kegg_id, data| data.first =~ /^hsa/}
  gene2biosystems.each do |geneID, pathways|
    kegg_pathways = []
    pathways.each do |biosystem|
      kAttrib = keggAttributes[biosystem]
      kegg_pathways << kAttrib if !kAttrib.nil?
    end
    kegg_data[geneID] = kegg_pathways
  end
  return kegg_data
end

def loadBiosystemsInfo(biosystems_info_path, filterDB)
  bsid2attributes = {}
  infile = open(biosystems_info_path)
  gz = Zlib::GzipReader.new(infile)
  gz.each_line do |line|
    line.chomp!
    # STDERR.puts line.inspect
    fields = line.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '').split("\t")
    bsid = fields.shift
    bsid2attributes[bsid] = [fields[1], fields[2]] if filterDB == fields[0]
  end
  return bsid2attributes
end

def load_biosystem2gene_dictionary(biosystems_gene_path)
  gene2kegg = {}
  infile = open(biosystems_gene_path)
  gz = Zlib::GzipReader.new(infile)
  gz.each_line do |line|
    line.chomp!
    biosystem, gene, score = line.split("\t")
    query = gene2kegg[gene]
    if query.nil?
      gene2kegg[gene] = [biosystem]
    else
      query << biosystem
    end
  end
  return gene2kegg
end

def merge_genes_with_kegg_data(gene_list, kegg_data)
  merged_data = {}
  gene_list.each do |geneID, values|
    geneName, geneSyn, description = values
    kegg_entry = kegg_data[geneID]
    kegg_entry = [] if kegg_entry.nil?
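    # Merged record layout: [geneName, description, kegg_pathways, synonyms];
    # genes without KEGG annotation keep an empty pathway list.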
    merged_data[geneID] = [geneName, description, kegg_entry, geneSyn]
  end
  return merged_data
end

def write_compressed_plain_file(data, path)
  File.open(path, 'w') do |f|
    gz = Zlib::GzipWriter.new(f)
    gz.write data.to_json
    gz.close
  end
end

def read_compressed_json(path)
  infile = open(path)
  gz = Zlib::GzipReader.new(infile)
  object = JSON.parse(gz.read)
  return object
end

def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
  pathways_genes_in_predictions = {}
  genes_in_predictions = []
  genes_clusters.each do |cluster|
    cluster.each do |geneID, data|
      geneName, description, pathways, geneSyns = data
      pathways.each do |pathway|
        query = pathways_genes_in_predictions[pathway]
        if query.nil?
          pathways_genes_in_predictions[pathway] = [geneID]
        else
          query << geneID if !query.include?(geneID)
        end
      end
      genes_in_predictions << geneID if !genes_in_predictions.include?(geneID)
    end
  end
  genes_out_of_predictions = genes_with_kegg.keys - genes_in_predictions
  gene_number = genes_with_kegg.length
  stats = []
  pathways_genes_in_predictions.each do |pathway, pathway_predicted_genes|
    pathway_id, pathway_name = pathway
    no_pathway_predicted_genes = genes_in_predictions - pathway_predicted_genes
    pathway_no_predicted_genes_count = 0
    no_pathway_no_predicted_genes_count = 0
    genes_out_of_predictions.each do |geneID|
      query = genes_with_kegg[geneID]
      if query[2].map{|pathway_info| pathway_info.first}.include?(pathway_id)
        pathway_no_predicted_genes_count += 1
      else
        no_pathway_no_predicted_genes_count += 1
      end
    end
    # Fisher => http://www.biostathandbook.com/fishers.html
    no_pathway_predicted_genes_count = no_pathway_predicted_genes.length
    pathway_predicted_genes_count = pathway_predicted_genes.length
    accumulated_prob = 0
    pathway_no_predicted_genes_count.times do |n|
      no_pathway_predicted_genes_count_shifted = no_pathway_predicted_genes_count - n
      pathway_predicted_genes_count_shifted = pathway_predicted_genes_count - n
      if no_pathway_predicted_genes_count_shifted >= 0 && pathway_predicted_genes_count_shifted >= 0
        accumulated_prob += compute_hyper_prob(
          n,
          no_pathway_predicted_genes_count_shifted,
          pathway_predicted_genes_count_shifted,
          no_pathway_no_predicted_genes_count + n,
          gene_number
        )
      else
        break
      end
    end
    contingency = [pathway_no_predicted_genes_count, no_pathway_predicted_genes_count,
                   pathway_predicted_genes_count, no_pathway_no_predicted_genes_count]
    stats << [pathway, pathway_predicted_genes, contingency, accumulated_prob]
  end
  return stats
end

def compute_hyper_prob(a, b, c, d, n)
  binomA = binom(a + b, a)
  binomC = binom(c + d, c)
  divisor = binom(n, a + c)
  return (binomA * binomC).fdiv(divisor)
end

def binom(n, k)
  if k > 0 && k < n
    res = (1 + n - k..n).inject(:*) / (1..k).inject(:*)
  else
    res = 1
  end
  return res
end

def get_reference(genomic_ranges)
  # genomic_ranges = [patientID, mut_start, mut_stop]
  reference = []
  reference.concat(genomic_ranges.map{|gr| gr[1]}) # get start
  reference.concat(genomic_ranges.map{|gr| gr[2]}) # get stop
  reference.uniq!
  reference.sort!
  # Define overlap ranges
  final_reference = []
  reference.each_with_index do |coord, i|
    next_coord = reference[i + 1]
    final_reference << [coord, next_coord] if !next_coord.nil?
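    # Consecutive sorted breakpoints become candidate windows, e.g. the
    # coordinates [100, 150, 200] yield the windows [100, 150] and [150, 200].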
  end
  return final_reference
end

def overlap_patients(genomic_ranges, reference)
  overlaps = []
  reference.each do |start, stop|
    patients = []
    genomic_ranges.each do |pt_id, pt_start, pt_stop|
      if (start <= pt_start && stop >= pt_stop) ||
         (start > pt_start && stop < pt_stop) ||
         (stop > pt_start && stop <= pt_stop) ||
         (start >= pt_start && start < pt_stop)
        patients << pt_id
      end
    end
    overlaps << patients.uniq
  end
  return overlaps
end

def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
  patients_out_of_cluster = 0
  patients_by_cluster = {}
  sors = []
  patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
    reference = get_reference(genomic_ranges) # Get putative overlap regions
    overlapping_patients = overlap_patients(genomic_ranges, reference) # See which patients match each overlap region
    clust_number = 1
    reference.each_with_index do |ref, i|
      current_patients = overlapping_patients[i]
      if current_patients.length > pat_per_reg
        ref << chrm
        node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
        ref << node_identifier
        save_sor(current_patients, node_identifier, patients_by_cluster)
        sors << ref
        clust_number += 1
      end
    end
  end
  return patients_by_cluster, sors
end

def save_sor(current_patients, node_identifier, patients_by_cluster)
  current_patients.each do |patient|
    add_record(patients_by_cluster, patient, node_identifier)
  end
end

def add_record(hash, key, record)
  query = hash[key]
  if query.nil?
    hash[key] = [record]
  elsif !query.include?(record)
    query << record
  end
end

def load_patient_cohort(options)
  patient_data = {}
  count = 0
  fields2extract = get_fields2extract(options)
  field_numbers = fields2extract.values
  File.open(options[:input_file]).each do |line|
    line.chomp!
    if options[:header] && count == 0
      line.gsub!(/#\s*/, '') # correct comment-like headers
      field_names = line.split("\t")
      get_field_numbers2extract(field_names, fields2extract)
      field_numbers = fields2extract.values
    else
      fields = line.split("\t")
      pat_record = field_numbers.map{|n| fields[n]}
      if fields2extract[:pat_id_col].nil?
        pat_id = "pat_#{count}" # generate ids
      else
        original_id = pat_record.shift
        pat_id = original_id + "_i#{count}" # make sure that ids are unique
      end
      if !pat_record[0].nil?
        pat_record[0] = pat_record[0].split(options[:hpo_separator])
      else
        pat_record[0] = []
      end
      pat_record[2] = pat_record[2].to_i if !options[:start_col].nil?
      pat_record[3] = pat_record[3].to_i if !options[:end_col].nil?
      patient_data[pat_id] = pat_record
    end
    count += 1
  end
  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
  return patient_data
end

def get_fields2extract(options)
  fields2extract = {}
  [:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
    col = options[field]
    if !col.nil?
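      # Without a header row the option value is a 0-based column index; with a
      # header it is a column name that get_field_numbers2extract resolves to
      # an index once the header line has been read.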
      col = col.to_i if !options[:header]
      fields2extract[field] = col
    end
  end
  return fields2extract
end

def get_field_numbers2extract(field_names, fields2extract)
  fields2extract.each do |field, name|
    fields2extract[field] = field_names.index(name)
  end
end

def download(ftp_server, path, name)
  ftp = Net::FTP.new()
  ftp.connect(ftp_server)
  ftp.login
  ftp.getbinaryfile(path, name)
  ftp.close
end

def get_and_parse_external_data(all_paths)
  sources = [
    ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
    ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
    ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
  ]
  sources.each do |server, path, output|
    download(server, path, output) if !File.exist?(output)
  end
  genes_with_kegg = {}
  gene_location = {}
  if !File.exist?(all_paths[:gene_data_with_pathways]) || !File.exist?(all_paths[:gene_location])
    gene_list, gene_location = load_gene_data(all_paths[:gene_data])
    ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
    kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
    genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
    write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
    write_compressed_plain_file(gene_location, all_paths[:gene_location])
  else
    gene_location = read_compressed_json(all_paths[:gene_location])
    genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
  end
  return gene_location, genes_with_kegg
end

def get_detailed_similarity(profile, candidates, evidences, hpo)
  profile_length = profile.length
  matrix = []
  profile_length.times{ matrix << Array.new(candidates.length, 0) }
  cand_number = 0
  candidates.each do |candidate_id, similarity|
    local_sim = []
    candidate_evidence = evidences[candidate_id]
    profile.each do |profile_term|
      candidate_evidence.each do |candidate_term|
        term_sim = hpo.compare([candidate_term], [profile_term], sim_type: :lin, bidirectional: false)
        local_sim << [profile_term, candidate_term, term_sim]
      end
    end
    local_sim.sort!{|s1, s2| s2.last <=> s1.last}
    final_pairs = []
    processed_profile_terms = []
    processed_candidate_terms = []
    local_sim.each do |pr_term, cd_term, sim|
      if !processed_profile_terms.include?(pr_term) && !processed_candidate_terms.include?(cd_term)
        final_pairs << [pr_term, cd_term, sim]
        processed_profile_terms << pr_term
        processed_candidate_terms << cd_term
      end
      break if profile_length == processed_profile_terms.length
    end
    final_pairs.each do |pr_term, cd_term, similarity|
      matrix[profile.index(pr_term)][cand_number] = similarity
    end
    cand_number += 1
  end
  return matrix
end

def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit)
  candidates = similarities.to_a.sort{|s1, s2| s2.last <=> s1.last}.first(candidate_limit)
  candidates_ids = candidates.map{|c| c.first}
  candidate_similarity_matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
  candidate_similarity_matrix.each_with_index do |row, i|
    row.unshift(hpo.translate_id(reference_prof[i]))
  end
  # Sort rows by the sum of their similarity values (first element is the term name)
  candidate_similarity_matrix.sort!{|r1, r2| r2[1..r2.length].sum <=> r1[1..r1.length].sum}
  candidate_similarity_matrix = candidate_similarity_matrix.first(term_limit)
  return candidate_similarity_matrix, candidates, candidates_ids
end
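# A minimal usage sketch of the clustering helpers above. The coordinates and
# patient IDs are invented for illustration, and the 'A' mutation-type tag only
# mirrors the node-identifier format shown in the training file example
# (9.3.A.5); both are assumptions, not values taken from this codebase.
#
#   regions_by_chr = {
#     '9' => [['pat_1', 131371492, 131375954], ['pat_2', 131372000, 131376500]]
#   }
#   patients_by_cluster, sors = generate_cluster_regions(regions_by_chr, 'A')
#   # => sors contains entries like [131372000, 131375954, '9', '9.1.A.2'];
#   #    the node identifier encodes chr.cluster_number.mutation_type.patient_count
#   # => patients_by_cluster maps each patient to its node identifiers,
#   #    e.g. {'pat_1' => ['9.1.A.2'], 'pat_2' => ['9.1.A.2']}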