generalMethods.rb in pets-0.2.4

- old
+ new

@@ -11,87 +11,22 @@
 	Benchmark.bm do |x|
 		x.report{	system(cmd) }
 	end
 end
 
-def read_excluded_hpo_file(file)
-	excluded_hpo = []
-	File.open(file).each do |line|
-		excluded_hpo << line.chomp
+def add_record(hash, key, record, uniq=false)
+	query = hash[key]
+	if query.nil?
+		hash[key] = [record]
+	elsif !uniq # We not take care by repeated entries
+		query << record
+	elsif !query.include?(record) # We want uniq entries
+		query << record
 	end
-	return excluded_hpo
 end
 
-#Common methods for predictors
-#Training file example = 9  131371492   131375954   HP:0010974  2.41161970596	9.3.A.5
-#1. Indexing by chr (region)
-def coor_overlap?(ref_start, ref_stop, start, stop)
-  overlap = false
-  if (stop > ref_start && stop <= ref_stop) ||
-    (start >= ref_start && start < ref_stop) ||
-    (start <= ref_start && stop >= ref_stop) ||
-    (start > ref_start && stop < ref_stop)
-    overlap = true
-  end
-  return overlap
-end
 
-def load_training_file4regions(training_file)
-	training_set = {}
-	posInfo = loadFile(training_file)
-	posInfo.each do |info|
-		chr = info.shift
-		query = training_set[chr]
-		if query.nil?
-			training_set[chr] = [info]
-		else
-			query << info
-		end
-	end
-	return training_set
-end
-
-#2. Indexing by hpo (code)
-#prepare training file for analysis using phenotype2region prediction
-def load_training_file4HPO(training_file, thresold=0)
-	training_set = {}
-	information = loadFile(training_file, thresold)
-	information.each do |info|
-		hpoCode = info.delete_at(4)
-		query = training_set[hpoCode]
-		if query.nil?
-			training_set[hpoCode] = [info]  
-		else
-			query << info
-		end
-	end
-	# STDERR.puts training_set.keys.inspect
-	return training_set
-end
-
-
-#3. Load training info file:
-#Chr;Start;Stop;HPO;Association;node
-def loadFile(file, thresold=0)
-	information = []
-	File.open(file).each do |line|
-		line.chomp!
-		allInfo = line.split("\t")
-		associationValue = allInfo[4].to_f
-		if associationValue >= thresold
-			chr = allInfo[0]
-			startPos = allInfo[1].to_i
-			stopPos = allInfo[2].to_i
-			hpoCode = allInfo[3]
-			nodeID = allInfo[5]
-			information << [chr, startPos, stopPos, nodeID, hpoCode, associationValue]
-		end
-	end
-	return information
-end
-
-
 def compute_IC_values(patient_data, total_patients)
 	patients_per_hpo = Hash.new(0)
 	last_patient_ID = ''
 	patient_data.each do |patient_ID, metadata|
 		patient, count = patient_ID.split('_i')
@@ -109,110 +44,11 @@
 		patients_per_hpo[hpo] = -Math.log10(patient_number.fdiv(total_patients))
 	end
 	return patients_per_hpo
 end
 
-def load_hpo_ci_values(information_coefficient_file)
-	hpos_ci_values = {}
-	File.open(information_coefficient_file).each do |line|
-		line.chomp!
-		hpo_code, ci = line.split("\t")
-		hpos_ci_values[hpo_code.to_sym] = ci.to_f
-	end
-	return hpos_ci_values
-end
 
-def load_clustered_patients(file)
-	clusters = {}
-	File.open(file).each do |line|
-		line.chomp!
-		pat_id, cluster_id = line.split("\t")
-		query = clusters[cluster_id]
-		if query.nil?
-			clusters[cluster_id] = [pat_id]
-		else
-			query << pat_id
-		end
-	end
-	return clusters
-end
-
-def load_gene_data(gene_data_path)
-	gene_list = {} #geneID => attr
-	gene_location = {} # chr => gene
-	infile = open(gene_data_path)
-	gz = Zlib::GzipReader.new(infile)
-	current_chr = nil
-	genes = []
-	gz.each_line do |line|
-		line.chomp!
-		next if line =~ /^#/ 
-		fields = line.split("\t")
-		if fields[8].include?('genome=chromosome')
-			chr = fields[8].split(';')[1].split('=').last
-			gene_location[current_chr] = genes
-			genes = []
-			current_chr = chr
-		elsif fields[2] == 'gene'
-			attributes = {}
-			fields[8].split(';').each do |pair| 
-				key, value = pair.split('=')
-				attributes[key] = value
-			end
-			geneName = nil
-			geneName = attributes['gene'] if !attributes['gene'].nil?
-			geneSyns = []
-			geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
-			description = attributes['description']
-			description = URI.unescape(description) if !description.nil?
-			attributes['Dbxref'] =~ /GeneID:(\d+)/
-			gene_list[$1] = [geneName, geneSyns, description]
-			genes << [$1, fields[3].to_i, fields[4].to_i]
-		end
-	end
-	gene_location[current_chr] = genes
-	return gene_list, gene_location
-end
-
-def parse_kegg_data(query_genes)
-	kegg_data = {} #gene => attb
-    while !query_genes.empty?
-    	gene_set = query_genes.shift(10)
-	    url = "http://rest.kegg.jp/get/#{gene_set.map{|qg| "hsa:#{qg}"}.join('+')}"
-	    uri = URI(url)
-	    response = Net::HTTP.get(uri) 
-		geneID = nil
-		gene_names = []
-		definition = nil
-		pathways = []
-		parsing_pathway_field = false
-		response.squeeze(' ').each_line do |line|
-			line.chomp!
-			if line =~ /^ENTRY/
-				geneID = line.split(' ')[1]
-			elsif line =~ /^NAME/
-				gene_names = line.split(' ', 2).last.split(', ')
-			elsif line =~ /^DEFINITION/
-				definition = line.split(' ', 2)[1]
-			elsif line =~ /^PATHWAY/
-				pathways << line.split(' ', 3)[1..2]
-				parsing_pathway_field = true
-			elsif line =~ /^BRITE/ || line =~ /^POSITION/ || line =~ /^DISEASE/ || line =~ /^MODULE/ || line =~ /^DRUG_TARGET/ || line =~ /^NETWORK/
-				parsing_pathway_field = false
-			elsif parsing_pathway_field
-				pathways << line.strip.split(' ', 2)
-			elsif line == '///'
-				parsing_pathway_field = false
-				kegg_data[geneID] = [gene_names, definition, pathways]
-				pathways = []
-				gene_names = []
-			end
-		end
-	end
-	return kegg_data
-end
-
 def parse_kegg_from_biosystems(biosystems_gene_path, biosystems_info_path)
 	kegg_data = {}
 	gene2biosystems = load_biosystem2gene_dictionary(biosystems_gene_path)
 	keggAttributes = loadBiosistemsInfo(biosystems_info_path, 'KEGG')
 	keggAttributes.select!{|kegg_id, data| data.first =~ /^hsa/}
@@ -268,25 +104,10 @@
 		merged_data[geneID] = [geneName, description, kegg_entry, geneSyn]
 	end
 	return merged_data
 end
 
-def write_compressed_plain_file(data, path)
-	File.open(path, 'w') do |f|
-		gz = Zlib::GzipWriter.new(f)
-		gz.write data.to_json
-		gz.close
-	end
-end
-
-def read_compressed_json(path)
-	infile = open(path)
-	gz = Zlib::GzipReader.new(infile)
-	object = JSON.parse(gz.read)
-	return object
-end
-
 def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
 	pathways_genes_in_predictions = {}
 	genes_in_predictions = []
 	genes_clusters.each do |cluster|
 		cluster.each do |geneID, data|
@@ -356,141 +177,11 @@
 	else
 		res = 1
 	end
 end
 
-def get_reference(genomic_ranges)
-	#genomic_ranges = [patientID, mut_start, mut_stop]
-	reference = []
-	reference.concat(genomic_ranges.map{|gr| gr[1]})# get start
-	reference.concat(genomic_ranges.map{|gr| gr[2]})# get stop
-	reference.uniq!
-	reference.sort!
-	#Define overlap range
-	final_reference = []
-	reference.each_with_index do |coord,i|
-		next_coord = reference[i + 1]
-		final_reference << [coord, next_coord] if !next_coord.nil? 
-	end
-	return final_reference
-end
 
-def overlap_patients(genomic_ranges, reference)
-	overlaps = []
-	reference.each do |start, stop|
-		patients = []
-		genomic_ranges.each do |pt_id, pt_start, pt_stop|
-			if (start <= pt_start && stop >= pt_stop) ||
-				(start > pt_start && stop < pt_stop) ||
-				(stop > pt_start && stop <= pt_stop) ||
-				(start >= pt_start && start < pt_stop)
-				patients << pt_id
-			end
-		end
-		overlaps << patients.uniq
-	end
-	return overlaps
-end
 
-def generate_cluster_regions(patients_genomic_region_by_chr, mutation_type, pat_per_reg = 1)
-	patients_out_of_cluster = 0
-	patients_by_cluster = {}
-	sors = []
-	patients_genomic_region_by_chr.each do |chrm, genomic_ranges|
-		reference = get_reference(genomic_ranges) # Get putative overlap regions
-		overlapping_patients = overlap_patients(genomic_ranges, reference) # See what patient has match with a overlap region
-		clust_number = 1
-		reference.each_with_index do |ref, i|
-			current_patients = overlapping_patients[i]
-			if current_patients.length > pat_per_reg
-				ref << chrm
-				node_identifier = "#{chrm}.#{clust_number}.#{mutation_type}.#{current_patients.length}"
-				ref << node_identifier
-				save_sor(current_patients, node_identifier, patients_by_cluster)
-				sors << ref
-				clust_number += 1
-			end
-		end
-	end
-	return patients_by_cluster, sors
-end
-
-def save_sor(current_patients, node_identifier, patients_by_cluster)
-	current_patients.each do |patient|
-		add_record(patients_by_cluster, patient, node_identifier)
-	end
-end
-
-def add_record(hash, key, record)
-	query = hash[key]
-	if query.nil?
-		hash[key] = [record]
-	elsif !query.include?(record)
-		query << record
-	end
-end
-
-def load_patient_cohort(options)
-	patient_data = {}
-	count = 0
-	fields2extract = get_fields2extract(options)
-	field_numbers = fields2extract.values
-	  File.open(options[:input_file]).each do |line|
-	    line.chomp!
-	    if options[:header] && count == 0
-	      line.gsub!(/#\s*/,'') # correct comment like  headers
-	      field_names = line.split("\t")
-	      get_field_numbers2extract(field_names, fields2extract)
-	      field_numbers = fields2extract.values
-	    else
-	      fields = line.split("\t")
-	      pat_record = field_numbers.map{|n| fields[n]}
-	      if fields2extract[:pat_id_col].nil?
-	        pat_id = "pat_#{count}" #generate ids
-	      else
-	        original_id = pat_record.shift
-	        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
-	      end
-	      if !pat_record[0].nil? 
-	      	pat_record[0] = pat_record[0].split(options[:hpo_separator])
-	      else
-	      	pat_record[0] = []
-	      end
-	      pat_record[2] = pat_record[2].to_i if !options[:start_col].nil?
-	      pat_record[3] = pat_record[3].to_i if !options[:end_col].nil?
-	      patient_data[pat_id] = pat_record
-	    end
-	    count +=1
-	  end
-	  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
-	  return patient_data
-end 
-
-def get_fields2extract(options)
-	fields2extract = {}
-	[:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
-		col = options[field]
-		if !col.nil?
-			col = col.to_i if !options[:header]
-			fields2extract[field] = col
-		end
-	end
-	return fields2extract
-end
-
-def get_field_numbers2extract(field_names, fields2extract)
-  fields2extract.each do |field, name|
-    fields2extract[field] = field_names.index(name)
-  end
-end
-
-def download(ftp_server, path, name)
-  ftp = Net::FTP.new()
-  ftp.connect(ftp_server)
-  ftp.login
-  ftp.getbinaryfile(path, name)
-  ftp.close
-end
 
 def get_and_parse_external_data(all_paths)
 	sources = [
 	  ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
 	  ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
\ No newline at end of file