# I/O helper methods for the HPO patient-profile pipeline.
# Standard-library dependencies are required here; the Ontology and
# Genomic_Feature classes used below are assumed to be supplied elsewhere
# by the application's own libraries.
require 'csv'
require 'json'
require 'uri'
require 'zlib'
require 'net/http'
require 'net/ftp'

# Load the HPO either from an OBO file or from a precomputed JSON dump,
# optionally removing the terms listed in excluded_hpo_file.
def load_hpo_ontology(hpo_file, excluded_hpo_file)
  hpo = nil
  if !hpo_file.include?('.json')
    if !excluded_hpo_file.nil?
      hpo = Ontology.new(file: hpo_file, load_file: true, removable_terms: read_excluded_hpo_file(excluded_hpo_file))
    else
      hpo = Ontology.new(file: hpo_file, load_file: true)
    end
  else
    hpo = Ontology.new
    hpo.read(hpo_file)
    if !excluded_hpo_file.nil?
      hpo.add_removable_terms(read_excluded_hpo_file(excluded_hpo_file))
      hpo.remove_removable()
      hpo.build_index()
    end
  end
  return hpo
end

def read_excluded_hpo_file(file)
  excluded_hpo = []
  File.open(file).each do |line|
    excluded_hpo << line.chomp
  end
  return excluded_hpo
end

def write_hash(hash, file_path, header = [])
  File.open(file_path, 'w') do |handler|
    handler.puts header.join("\t") if !header.empty?
    hash.each do |key, array|
      handler.puts "#{key}\t#{array.join("\t")}"
    end
  end
end

def write_array(array, file_path)
  File.open(file_path, 'w') do |handler|
    array.each do |record|
      if record.is_a?(String)
        line = record
      else
        line = record.join("\t")
      end
      handler.puts line
    end
  end
end

def write_matrix_for_R(matrix, x_names, y_names, file)
  File.open(file, 'w') do |f|
    f.puts x_names.join("\t")
    matrix.each_with_index do |row, i|
      f.puts [y_names[i]].concat(row).join("\t")
    end
  end
end

def write_cluster_ic_data(all_ics, profile_lengths, cluster_ic_data_file, limit)
  File.open(cluster_ic_data_file, 'w') do |f|
    f.puts %w[cluster_id ic Plen].join("\t")
    all_ics.each_with_index do |cluster_ics, i|
      break if i == limit
      cluster_length = cluster_ics.length
      cluster_ics.each_with_index do |clust_ic, j|
        f.puts "#{cluster_length}_#{i}\t#{clust_ic}\t#{profile_lengths[i][j]}"
      end
    end
  end
end

def write_cluster_chromosome_data(cluster_data, cluster_chromosome_data_file, limit)
  File.open(cluster_chromosome_data_file, 'w') do |f|
    f.puts %w[cluster_id chr count].join("\t")
    index = 0
    last_id = cluster_data.first.first unless cluster_data.empty?
    cluster_data.each do |cluster_id, patient_number, chr, count|
      index += 1 if cluster_id != last_id
      break if index == limit
      f.puts ["#{patient_number}_#{index}", chr, count].join("\t")
      last_id = cluster_id
    end
  end
end

def write_coverage_data(coverage_to_plot, coverage_to_plot_file)
  File.open(coverage_to_plot_file, 'w') do |f|
    coverage_to_plot.each do |chr, position, freq|
      f.puts "#{chr}\t#{position}\t#{freq}"
    end
  end
end

# For each patient, write its current phenotypes next to the putative more
# specific (child) terms suggested for them.
def write_detailed_hpo_profile_evaluation(suggested_childs, detailed_profile_evaluation_file, summary_stats)
  CSV.open(detailed_profile_evaluation_file, "wb") do |csv|
    suggested_childs.each do |pat_id, suggestions|
      warning = nil
      warning = 'WARNING: Very few phenotypes' if suggestions.length < 4
      csv << ["PATIENT #{pat_id}", warning.to_s]
      csv << ["CURRENT PHENOTYPES", "PUTATIVE MORE SPECIFIC PHENOTYPES"]
      suggestions.each do |parent, childs|
        parent_code, parent_name = parent
        if childs.empty?
          csv << ["#{parent_name} (#{parent_code})", '-']
        else
          parent_written = false
          childs.each do |child_code, child_name|
            if !parent_written
              parent_field = "#{parent_name} (#{parent_code})"
              parent_written = true
            else
              parent_field = ""
            end
            csv << [parent_field, "#{child_name} (#{child_code})"]
          end
        end
      end
      csv << ["", ""]
    end
  end
end

def write_arrays4scatterplot(x_axis_value, y_axis_value, filename, x_axis_name, y_axis_name)
  File.open(filename, 'w') do |f|
    f.puts "#{x_axis_name}\t#{y_axis_name}"
    x_axis_value.each_with_index do |value, i|
      y_value = y_axis_value[i]
      raise("Position #{i} is not present in y_axis_value") if y_value.nil?
      f.puts [value, y_value].join("\t")
    end
  end
end
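# A minimal, runnable sketch of the tab-separated convention the writer
# helpers above share (an illustration with made-up values, not part of the
# pipeline). It only runs when this file is executed directly.
if __FILE__ == $PROGRAM_NAME
  require 'tempfile'
  tmp = Tempfile.new('write_hash_demo')
  begin
    write_hash({ 'HP:0000001' => [3, 0.5] }, tmp.path, %w[hpo count freq])
    print File.read(tmp.path) # => "hpo\tcount\tfreq\nHP:0000001\t3\t0.5\n"
  ensure
    tmp.close!
  end
end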
def write_similarity_matrix(similarity_matrix, similarity_matrix_file)
  File.open(similarity_matrix_file, 'w') do |f|
    similarity_matrix.each do |row|
      f.puts row.join("\t")
    end
  end
end

def write_profile_pairs(similarity_pairs, filename)
  File.open(filename, 'w') do |f|
    similarity_pairs.each do |pairsA, pairsB_and_values|
      pairsB_and_values.each do |pairsB, values|
        f.puts "#{pairsA}\t#{pairsB}\t#{values}"
      end
    end
  end
end

def write_patient_hpo_stat(average_hp_per_pat_distribution, output_file)
  File.open(output_file, 'w') do |f|
    f.puts "PatientsNumber\tHPOAverage"
    average_hp_per_pat_distribution.each do |patient_num, ave|
      f.puts "#{patient_num}\t#{ave}"
    end
  end
end

# Read a patient-to-cluster assignment file and group patient HPO profiles
# by cluster.
def parse_clusters_file(clusters_file, patient_data)
  clusters_info = {}
  clusters_table = []
  File.open(clusters_file).each do |line|
    line.chomp!
    patientID, clusterID = line.split("\t")
    patientHPOProfile = patient_data.get_profile(patientID)
    query = clusters_info[clusterID]
    if query.nil?
      clusters_info[clusterID] = {patientID => patientHPOProfile}
    else
      query[patientID] = patientHPOProfile
    end
  end
  clusters_info.each do |clusterID, patients_info|
    patients_per_cluster = patients_info.keys.length
    clusters_table << [clusterID, patients_per_cluster, patients_info.keys, patients_info.values]
  end
  return clusters_table, clusters_info
end

def load_profiles(file_path, hpo)
  profiles = {}
  File.open(file_path).each do |line|
    id, profile = line.chomp.split("\t")
    hpos = profile.split(',').map{|a| a.to_sym}
    hpos, rejected_hpos = hpo.check_ids(hpos)
    if !hpos.empty?
      hpos = hpo.clean_profile(hpos)
      profiles[id] = hpos if !hpos.empty?
    end
  end
  return profiles
end

def load_variants(variant_folder)
  variants = {}
  Dir.glob(File.join(variant_folder, '*.tab')).each do |path|
    profile_id = File.basename(path, '.tab')
    vars = []
    File.open(path).each do |line|
      fields = line.chomp.split("\t")
      chr = fields[0]
      start = fields[1].to_i
      vars << [chr, start, start]
    end
    variants[profile_id] = Genomic_Feature.new(vars)
  end
  return variants
end

def load_evidences(evidences_path, hpo)
  genomic_coordinates = {}
  coord_files = Dir.glob(File.join(evidences_path, '*.coords'))
  coord_files.each do |cd_f|
    entity = File.basename(cd_f, '.coords')
    coordinates = load_coordinates(cd_f)
    genomic_coordinates[entity] = coordinates
  end
  evidences = {}
  evidence_files = Dir.glob(File.join(evidences_path, '*_HP.txt'))
  evidence_files.each do |e_f|
    pair = File.basename(e_f, '.txt')
    profiles, id2label = load_evidence_profiles(e_f, hpo)
    evidences[pair] = {prof: profiles, id2lab: id2label}
  end
  return evidences, genomic_coordinates
end

def load_coordinates(file_path)
  coordinates = {}
  header = true
  File.open(file_path).each do |line|
    fields = line.chomp.split("\t")
    if header
      header = false
    else
      entity, chr, strand, start, stop = fields
      coordinates[entity] = [chr, start.to_i, stop.to_i, strand]
    end
  end
  return coordinates
end

def load_evidence_profiles(file_path, hpo)
  profiles = {}
  id2label = {}
  File.open(file_path).each do |line|
    id, label, profile = line.chomp.split("\t")
    hpos = profile.split(',').map{|a| a.to_sym}
    hpos, rejected_hpos = hpo.check_ids(hpos)
    if !hpos.empty?
      hpos = hpo.clean_profile(hpos)
      profiles[id] = hpos if !hpos.empty?
      id2label[id] = label
    end
  end
  return profiles, id2label
end

# Common methods for predictors
# Training file example: 9  131371492  131375954  HP:0010974  2.41161970596  9.3.A.5
# 1. Indexing by chr (region)
def coor_overlap?(ref_start, ref_stop, start, stop)
  overlap = false
  if (stop > ref_start && stop <= ref_stop) ||   # query ends inside the reference
     (start >= ref_start && start < ref_stop) || # query starts inside the reference
     (start <= ref_start && stop >= ref_stop) || # query contains the reference
     (start > ref_start && stop < ref_stop)      # query contained in the reference
    overlap = true
  end
  return overlap
end
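# Quick sanity checks for coor_overlap? (illustrative values only; runs when
# this file is executed directly). Note the asymmetric boundaries: a query
# ending exactly at ref_start does not overlap, one ending at ref_stop does.
if __FILE__ == $PROGRAM_NAME
  puts coor_overlap?(100, 200, 150, 250) # partial overlap on the right => true
  puts coor_overlap?(100, 200, 120, 180) # fully contained              => true
  puts coor_overlap?(100, 200, 50, 300)  # contains the reference       => true
  puts coor_overlap?(100, 200, 50, 100)  # only touches ref_start       => false
end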
def load_training_file4regions(training_file)
  training_set = {}
  posInfo = loadFile(training_file)
  posInfo.each do |info|
    chr = info.shift
    query = training_set[chr]
    if query.nil?
      training_set[chr] = [info]
    else
      query << info
    end
  end
  return training_set
end

# 2. Indexing by hpo (code)
# Prepare training file for analysis using phenotype2region prediction
def load_training_file4HPO(training_file, threshold=0)
  training_set = {}
  information = loadFile(training_file, threshold)
  information.each do |info|
    hpoCode = info.delete_at(4)
    query = training_set[hpoCode]
    if query.nil?
      training_set[hpoCode] = [info]
    else
      query << info
    end
  end
  return training_set
end

# 3. Load training info file:
# Chr;Start;Stop;HPO;Association;node
def loadFile(file, threshold=0)
  information = []
  File.open(file).each do |line|
    line.chomp!
    allInfo = line.split("\t")
    associationValue = allInfo[4].to_f
    if associationValue >= threshold
      chr = allInfo[0]
      startPos = allInfo[1].to_i
      stopPos = allInfo[2].to_i
      hpoCode = allInfo[3]
      nodeID = allInfo[5]
      information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
    end
  end
  return information
end

def load_hpo_ci_values(information_coefficient_file)
  hpos_ci_values = {}
  File.open(information_coefficient_file).each do |line|
    line.chomp!
    hpo_code, ci = line.split("\t")
    hpos_ci_values[hpo_code.to_sym] = ci.to_f
  end
  return hpos_ci_values
end

def load_clustered_patients(file)
  clusters = {}
  File.open(file).each do |line|
    line.chomp!
    pat_id, cluster_id = line.split("\t")
    query = clusters[cluster_id]
    if query.nil?
      clusters[cluster_id] = [pat_id]
    else
      query << pat_id
    end
  end
  return clusters
end

# Parse a gzipped GFF-like gene annotation file into gene attributes and
# per-chromosome gene locations.
def load_gene_data(gene_data_path)
  gene_list = {} # geneID => attr
  gene_location = {} # chr => gene
  infile = open(gene_data_path)
  gz = Zlib::GzipReader.new(infile)
  current_chr = nil
  genes = []
  gz.each_line do |line|
    line.chomp!
    next if line =~ /^#/
    fields = line.split("\t")
    if fields[8].include?('genome=chromosome')
      chr = fields[8].split(';')[1].split('=').last
      gene_location[current_chr] = genes unless current_chr.nil? # flush the previous chromosome
      genes = []
      current_chr = chr
    elsif fields[2] == 'gene'
      attributes = {}
      fields[8].split(';').each do |pair|
        key, value = pair.split('=')
        attributes[key] = value
      end
      geneName = nil
      geneName = attributes['gene'] if !attributes['gene'].nil?
      geneSyns = []
      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
      description = attributes['description']
      description = URI.unescape(description) if !description.nil? # NOTE: URI.unescape was removed in Ruby 3.0; a percent-decoding replacement is needed there
      attributes['Dbxref'] =~ /GeneID:(\d+)/
      gene_list[$1] = [geneName, geneSyns, description]
      genes << [$1, fields[3].to_i, fields[4].to_i]
    end
  end
  gene_location[current_chr] = genes
  return gene_list, gene_location
end

# Query the KEGG REST API in batches of ten genes and parse the flat-file
# response into gene names, definition and pathway annotations.
def parse_kegg_data(query_genes)
  kegg_data = {} # gene => attb
  while !query_genes.empty?
    gene_set = query_genes.shift(10)
    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
    uri = URI(url)
    response = Net::HTTP.get(uri)
    geneID = nil
    gene_names = []
    definition = nil
    pathways = []
    parsing_pathway_field = false
    response.squeeze(' ').each_line do |line|
      line.chomp!
      if line =~ /^ENTRY/
        geneID = line.split(' ')[1]
      elsif line =~ /^NAME/
        gene_names = line.split(' ', 2).last.split(', ')
      elsif line =~ /^DEFINITION/
        definition = line.split(' ', 2)[1]
      elsif line =~ /^PATHWAY/
        pathways << line.split(' ', 3)[1..2]
        parsing_pathway_field = true
      elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
        parsing_pathway_field = false
      elsif line == '///' # end-of-record marker; checked before the pathway catch-all so it is never swallowed as a pathway continuation line
        parsing_pathway_field = false
        kegg_data[geneID] = [gene_names, definition, pathways]
        pathways = []
        gene_names = []
      elsif parsing_pathway_field
        pathways << line.strip.split(' ', 2)
      end
    end
  end
  return kegg_data
end
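# A runnable sketch of the training-file loaders defined above
# (load_training_file4regions and load_training_file4HPO); the row mirrors the
# "Training file example" comment, and the block only runs when this file is
# executed directly.
if __FILE__ == $PROGRAM_NAME
  require 'tempfile'
  tmp = Tempfile.new('training_demo')
  begin
    tmp.puts %w[9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5].join("\t")
    tmp.flush
    p load_training_file4regions(tmp.path)
    # => {"9"=>[[131371492, 131375954, "9.3.A.5", "HP:0010974", 2.41161970596]]}
    p load_training_file4HPO(tmp.path)
    # => {"HP:0010974"=>[["9", 131371492, 131375954, "9.3.A.5", 2.41161970596]]}
  ensure
    tmp.close!
  end
end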
def write_compressed_plain_file(data, path)
  File.open(path, 'wb') do |f|
    gz = Zlib::GzipWriter.new(f)
    gz.write data.to_json
    gz.close
  end
end

def read_compressed_json(path)
  infile = open(path)
  gz = Zlib::GzipReader.new(infile)
  object = JSON.parse(gz.read)
  gz.close
  return object
end

def download(ftp_server, path, name)
  ftp = Net::FTP.new()
  ftp.connect(ftp_server)
  ftp.login
  ftp.getbinaryfile(path, name)
  ftp.close
end
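# Round-trip sketch for the gzip+JSON helpers above (illustrative payload;
# runs only when this file is executed directly).
if __FILE__ == $PROGRAM_NAME
  require 'tempfile'
  tmp = Tempfile.new(['demo', '.json.gz'])
  begin
    write_compressed_plain_file({'HP:0000001' => [1, 2]}, tmp.path)
    p read_compressed_json(tmp.path) # => {"HP:0000001"=>[1, 2]}
  ensure
    tmp.close!
  end
end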