lib/pets/generalMethods.rb in pets-0.2.3 vs lib/pets/generalMethods.rb in pets-0.2.4

- old
+ new

@@ -11,87 +11,22 @@ Benchmark.bm do |x| x.report{ system(cmd) } end end -def read_excluded_hpo_file(file) - excluded_hpo = [] - File.open(file).each do |line| - excluded_hpo << line.chomp +def add_record(hash, key, record, uniq=false) + query = hash[key] + if query.nil? + hash[key] = [record] + elsif !uniq # We not take care by repeated entries + query << record + elsif !query.include?(record) # We want uniq entries + query << record end - return excluded_hpo end -#Common methods for predictors -#Training file example = 9 131371492 131375954 HP:0010974 2.41161970596 9.3.A.5 -#1. Indexing by chr (region) -def coor_overlap?(ref_start, ref_stop, start, stop) - overlap = false - if (stop > ref_start && stop <= ref_stop) || - (start >= ref_start && start < ref_stop) || - (start <= ref_start && stop >= ref_stop) || - (start > ref_start && stop < ref_stop) - overlap = true - end - return overlap -end -def load_training_file4regions(training_file) - training_set = {} - posInfo = loadFile(training_file) - posInfo.each do |info| - chr = info.shift - query = training_set[chr] - if query.nil? - training_set[chr] = [info] - else - query << info - end - end - return training_set -end - -#2. Indexing by hpo (code) -#prepare training file for analysis using phenotype2region prediction -def load_training_file4HPO(training_file, thresold=0) - training_set = {} - information = loadFile(training_file, thresold) - information.each do |info| - hpoCode = info.delete_at(4) - query = training_set[hpoCode] - if query.nil? - training_set[hpoCode] = [info] - else - query << info - end - end - # STDERR.puts training_set.keys.inspect - return training_set -end - - -#3. Load training info file: -#Chr;Start;Stop;HPO;Association;node -def loadFile(file, thresold=0) - information = [] - File.open(file).each do |line| - line.chomp! - allInfo = line.split("\t") - associationValue = allInfo[4].to_f - if associationValue >= thresold - chr = allInfo[0] - startPos = allInfo[1].to_i - stopPos = allInfo[2].to_i - hpoCode = allInfo[3] - nodeID = allInfo[5] - information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue] - end - end - return information -end - - def compute_IC_values(patient_data, total_patients) patients_per_hpo = Hash.new(0) last_patient_ID = '' patient_data.each do |patient_ID, metadata| patient, count = patient_ID.split('_i') @@ -109,110 +44,11 @@ patients_per_hpo[hpo] = -Math.log10(patient_number.fdiv(total_patients)) end return patients_per_hpo end -def load_hpo_ci_values(information_coefficient_file) - hpos_ci_values = {} - File.open(information_coefficient_file).each do |line| - line.chomp! - hpo_code, ci = line.split("\t") - hpos_ci_values[hpo_code.to_sym] = ci.to_f - end - return hpos_ci_values -end -def load_clustered_patients(file) - clusters = {} - File.open(file).each do |line| - line.chomp! - pat_id, cluster_id = line.split("\t") - query = clusters[cluster_id] - if query.nil? - clusters[cluster_id] = [pat_id] - else - query << pat_id - end - end - return clusters -end - -def load_gene_data(gene_data_path) - gene_list = {} #geneID => attr - gene_location = {} # chr => gene - infile = open(gene_data_path) - gz = Zlib::GzipReader.new(infile) - current_chr = nil - genes = [] - gz.each_line do |line| - line.chomp! - next if line =~ /^#/ - fields = line.split("\t") - if fields[8].include?('genome=chromosome') - chr = fields[8].split(';')[1].split('=').last - gene_location[current_chr] = genes - genes = [] - current_chr = chr - elsif fields[2] == 'gene' - attributes = {} - fields[8].split(';').each do |pair| - key, value = pair.split('=') - attributes[key] = value - end - geneName = nil - geneName = attributes['gene'] if !attributes['gene'].nil? - geneSyns = [] - geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil? - description = attributes['description'] - description = URI.unescape(description) if !description.nil? - attributes['Dbxref'] =~ /GeneID:(\d+)/ - gene_list[$1] = [geneName, geneSyns, description] - genes << [$1, fields[3].to_i, fields[4].to_i] - end - end - gene_location[current_chr] = genes - return gene_list, gene_location -end - -def parse_kegg_data(query_genes) - kegg_data = {} #gene => attb - while !query_genes.empty? - gene_set = query_genes.shift(10) - url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}" - uri = URI(url) - response = Net::HTTP.get(uri) - geneID = nil - gene_names = [] - definition = nil - pathways = [] - parsing_pathway_field = false - response.squeeze(' ').each_line do |line| - line.chomp! - if line =~ /^ENTRY/ - geneID = line.split(' ')[1] - elsif line =~ /^NAME/ - gene_names = line.split(' ', 2).last.split(', ') - elsif line =~ /^DEFINITION/ - definition = line.split(' ', 2)[1] - elsif line =~ /^PATHWAY/ - pathways << line.split(' ', 3)[1..2] - parsing_pathway_field = true - elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/ - parsing_pathway_field = false - elsif parsing_pathway_field - pathways << line.strip.split(' ', 2) - elsif line == '///' - parsing_pathway_field = false - kegg_data[geneID] = [gene_names, definition, pathways] - pathways = [] - gene_names = [] - end - end - end - return kegg_data -end - def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path) kegg_data = {} gene2biosystems = load_biosystem2gene_dictionary(biosystems_gene_path) keggAttributes = loadBiosistemsInfo(biosystems_info_path, 'KEGG') keggAttributes.select!{|kegg_id, data| data.first =~ /^hsa/} @@ -268,25 +104,10 @@ merged_data[geneID] = [geneName, description, kegg_entry, geneSyn] end return merged_data end -def write_compressed_plain_file(data, path) - File.open(path, 'w') do |f| - gz = Zlib::GzipWriter.new(f) - gz.write data.to_json - gz.close - end -end - -def read_compressed_json(path) - infile = open(path) - gz = Zlib::GzipReader.new(infile) - object = JSON.parse(gz.read) - return object -end - def compute_pathway_enrichment(genes_clusters, genes_with_kegg) pathways_genes_in_predictions = {} genes_in_predictions = [] genes_clusters.each do |cluster| cluster.each do |geneID, data| @@ -356,141 +177,11 @@ else res = 1 end end -def get_reference(genomic_ranges) - #genomic_ranges = [patientID, mut_start, mut_stop] - reference = [] - reference.concat(genomic_ranges.map{|gr| gr[1]})# get start - reference.concat(genomic_ranges.map{|gr| gr[2]})# get stop - reference.uniq! - reference.sort! - #Define overlap range - final_reference = [] - reference.each_with_index do |coord,i| - next_coord = reference[i + 1] - final_reference << [coord, next_coord] if !next_coord.nil? - end - return final_reference -end -def overlap_patients(genomic_ranges, reference) - overlaps = [] - reference.each do |start, stop| - patients = [] - genomic_ranges.each do |pt_id, pt_start, pt_stop| - if (start <= pt_start && stop >= pt_stop) || - (start > pt_start && stop < pt_stop) || - (stop > pt_start && stop <= pt_stop) || - (start >= pt_start && start < pt_stop) - patients << pt_id - end - end - overlaps << patients.uniq - end - return overlaps -end -def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1) - patients_out_of_cluster = 0 - patients_by_cluster = {} - sors = [] - patients_genomic_region_by_chr.each do |chrm, genomic_ranges| - reference = get_reference(genomic_ranges) # Get putative overlap regions - overlapping_patients = overlap_patients(genomic_ranges, reference) # See what patient has match with a overlap region - clust_number = 1 - reference.each_with_index do |ref, i| - current_patients = overlapping_patients[i] - if current_patients.length > pat_per_reg - ref << chrm - node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}" - ref << node_identifier - save_sor(current_patients, node_identifier, patients_by_cluster) - sors << ref - clust_number += 1 - end - end - end - return patients_by_cluster, sors -end - -def save_sor(current_patients, node_identifier, patients_by_cluster) - current_patients.each do |patient| - add_record(patients_by_cluster, patient, node_identifier) - end -end - -def add_record(hash, key, record) - query = hash[key] - if query.nil? - hash[key] = [record] - elsif !query.include?(record) - query << record - end -end - -def load_patient_cohort(options) - patient_data = {} - count = 0 - fields2extract = get_fields2extract(options) - field_numbers = fields2extract.values - File.open(options[:input_file]).each do |line| - line.chomp! - if options[:header] && count == 0 - line.gsub!(/#\s*/,'') # correct comment like headers - field_names = line.split("\t") - get_field_numbers2extract(field_names, fields2extract) - field_numbers = fields2extract.values - else - fields = line.split("\t") - pat_record = field_numbers.map{|n| fields[n]} - if fields2extract[:pat_id_col].nil? - pat_id = "pat_#{count}" #generate ids - else - original_id = pat_record.shift - pat_id = original_id + "_i#{count}" # make sure that ids are uniq - end - if !pat_record[0].nil? - pat_record[0] = pat_record[0].split(options[:hpo_separator]) - else - pat_record[0] = [] - end - pat_record[2] = pat_record[2].to_i if !options[:start_col].nil? - pat_record[3] = pat_record[3].to_i if !options[:end_col].nil? - patient_data[pat_id] = pat_record - end - count +=1 - end - options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil? - return patient_data -end - -def get_fields2extract(options) - fields2extract = {} - [:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field| - col = options[field] - if !col.nil? - col = col.to_i if !options[:header] - fields2extract[field] = col - end - end - return fields2extract -end - -def get_field_numbers2extract(field_names, fields2extract) - fields2extract.each do |field, name| - fields2extract[field] = field_names.index(name) - end -end - -def download(ftp_server, path, name) - ftp = Net::FTP.new() - ftp.connect(ftp_server) - ftp.login - ftp.getbinaryfile(path, name) - ftp.close -end def get_and_parse_external_data(all_paths) sources = [ ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]], ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]], \ No newline at end of file