lib/pets/generalMethods.rb in pets-0.1.4 vs lib/pets/generalMethods.rb in pets-0.2.3

- old
+ new

@@ -1,9 +1,41 @@
 require 'uri'
+require 'net/ftp'
+require 'net/http'
+require 'zlib'
+require 'json'
+require 'benchmark'
+
+def system_call(code_folder, script, args_string)
+  cmd = File.join(code_folder, script) + ' ' + args_string
+  puts "==> #{cmd}"
+  Benchmark.bm do |x|
+    x.report{ system(cmd) }
+  end
+end
+
+def read_excluded_hpo_file(file)
+  excluded_hpo = []
+  File.open(file).each do |line|
+    excluded_hpo << line.chomp
+  end
+  return excluded_hpo
+end
+
 #Common methods for predictors
 #Training file example = 9  131371492  131375954  HP:0010974  2.41161970596  9.3.A.5
 #1. Indexing by chr (region)
+def coor_overlap?(ref_start, ref_stop, start, stop)
+  overlap = false
+  if (stop > ref_start && stop <= ref_stop) ||
+     (start >= ref_start && start < ref_stop) ||
+     (start <= ref_start && stop >= ref_stop) ||
+     (start > ref_start && stop < ref_stop)
+    overlap = true
+  end
+  return overlap
+end
 def load_training_file4regions(training_file)
   training_set = {}
   posInfo = loadFile(training_file)
   posInfo.each do |info|
@@ -55,95 +87,11 @@
     end
   end
   return information
 end
-def add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
-  if !hpo_black_list.include?(id)
-    attributes = [id, name, is_a - hpo_black_list, syn]
-    hpo_storage[id] = attributes
-    alt_ids.each do |altid|
-      hpo_storage[altid] = attributes
-    end
-  end
-end
-def load_hpo_file(hpo_file, hpo_black_list=[])
-  hpo_storage = {}
-  id = nil
-  name = nil
-  alt_ids = []
-  syn = []
-  is_a = []
-  File.open(hpo_file).each do |line|
-    line.chomp!
-    tag, info = line.split(': ')
-    if tag == 'id' || tag == 'name' || tag == 'is_a' || tag == 'synonym' || tag == 'alt_id'
-      if tag == 'id'
-        add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list) if !name.nil?
-        id = info
-        name = nil
-        alt_id = []
-        syn = []
-        is_a = []
-      end
-      if tag == 'alt_id'
-        alt_ids << info
-      elsif tag == 'is_a'
-        is_a << info.split(' ! ')[0]
-      elsif tag == 'synonym'
-        syn << info.split('"')[1] #to keep only the name of the synonym
-      else
-        name = info
-      end
-    end
-  end
-  add_record2storage(hpo_storage, id, name, is_a, syn, alt_ids, hpo_black_list)
-  # STDERR.puts hpo_storage.inspect
-  # Process.exit
-  return hpo_storage
-end
-
-def load_hpo_black_list(excluded_hpo_file)
-  excluded_hpos = []
-  File.open(excluded_hpo_file).each do |line|
-    line.chomp!
-    excluded_hpos << line
-  end
-  return excluded_hpos
-end
-
-def create_hpo_dictionary(hpo_storage)
-  hpo_dictionary = {}
-  hpo_storage.each do |hpo, metadata|
-    hpo_code, hpo_name, hpo_parents, hpo_synonyms = metadata
-    hpo_dictionary[hpo_name] = hpo_code
-    hpo_synonyms.each do |syn|
-      hpo_dictionary[syn] = hpo_code
-    end
-  end
-  return hpo_dictionary
-end
-
-def get_child_parent_relations(hpo_storage)
-  # for getting hpo childs
-  storage_child = {}
-  hpo_storage.each do |hpo_code, hpo_data|
-    id, name, is_a, syn = hpo_data
-    hpo_child = [id, name]
-    is_a.each do |par_hpo_code|
-      query = storage_child[par_hpo_code]
-      if query.nil?
-        storage_child[par_hpo_code] = [hpo_child]
-      else
-        query << hpo_child
-      end
-    end
-  end
-  return storage_child
-end
-
 def compute_IC_values(patient_data, total_patients)
   patients_per_hpo = Hash.new(0)
   last_patient_ID = ''
   patient_data.each do |patient_ID, metadata|
     patient, count = patient_ID.split('_i')
@@ -161,38 +109,16 @@
     patients_per_hpo[hpo] = -Math.log10(patient_number.fdiv(total_patients))
   end
   return patients_per_hpo
 end
-# def get_child_parent_relations(hpo_storage)
-#   # for getting hpo childs
-#   storage_child = {}
-#   hpo_storage.each do |hpo_code, hpo_data|
-#     STDERR.puts hpo_data[3].inspect
-#     Process.exit
-#     main_code, hpo_name, synonyms, parents = hpo_data
-#     parents.each do |par_hpo_code, par_hpo_name|
-#       query = storage_child[par_hpo_code]
-#       hpo_child = [main_code, hpo_name]
-#       if query.nil?
-#         storage_child[par_hpo_code] = [par_hpo_name, [hpo_child]]
-#       else
-#         query.last << hpo_child
-#       end
-#     end
-#   end
-
-#   return storage_child
-# end
-
-
 def load_hpo_ci_values(information_coefficient_file)
   hpos_ci_values = {}
   File.open(information_coefficient_file).each do |line|
     line.chomp!
     hpo_code, ci = line.split("\t")
-    hpos_ci_values[hpo_code] = ci.to_f
+    hpos_ci_values[hpo_code.to_sym] = ci.to_f
   end
   return hpos_ci_values
 end
 
 def load_clustered_patients(file)
@@ -230,17 +156,18 @@
       attributes = {}
       fields[8].split(';').each do |pair|
        key, value = pair.split('=')
        attributes[key] = value
       end
-      geneNames = []
-      geneNames << attributes['gene'] if !attributes['gene'].nil?
-      geneNames.concat(attributes['gene_synonym'].split(',')) if !attributes['gene_synonym'].nil?
+      geneName = nil
+      geneName = attributes['gene'] if !attributes['gene'].nil?
+      geneSyns = []
+      geneSyns = attributes['gene_synonym'].split(',') if !attributes['gene_synonym'].nil?
       description = attributes['description']
       description = URI.unescape(description) if !description.nil?
       attributes['Dbxref'] =~ /GeneID:(\d+)/
-      gene_list[$1] = [geneNames, description]
+      gene_list[$1] = [geneName, geneSyns, description]
       genes << [$1, fields[3].to_i, fields[4].to_i]
     end
   end
   gene_location[current_chr] = genes
   return gene_list, gene_location
@@ -332,18 +259,15 @@
   return gene2kegg
 end
 
 def merge_genes_with_kegg_data(gene_list, kegg_data)
   merged_data = {}
-  gene_list.each do |geneID, attributes|
-    query = kegg_data[geneID]
-    if query.nil?
-      attributes << []
-    else
-      attributes << query
-    end
-    merged_data[geneID] = attributes
+  gene_list.each do |geneID, values|
+    geneName, geneSyn, description = values
+    kegg_entry = kegg_data[geneID]
+    kegg_entry = [] if kegg_entry.nil?
+    merged_data[geneID] = [geneName, description, kegg_entry, geneSyn]
   end
   return merged_data
 end
 
 def write_compressed_plain_file(data, path)
@@ -364,11 +288,11 @@
 def compute_pathway_enrichment(genes_clusters, genes_with_kegg)
   pathways_genes_in_predictions = {}
   genes_in_predictions = []
   genes_clusters.each do |cluster|
     cluster.each do |geneID, data|
-      geneNames, description, pathways = data
+      geneName, description, pathways, geneSyns = data
       pathways.each do |pathway|
         query = pathways_genes_in_predictions[pathway]
         if query.nil?
           pathways_genes_in_predictions[pathway] = [geneID]
         else
@@ -508,35 +432,39 @@
 def load_patient_cohort(options)
   patient_data = {}
   count = 0
   fields2extract = get_fields2extract(options)
   field_numbers = fields2extract.values
-  original_ids = []
-  File.open(options[:input_file]).each do |line|
-    line.chomp!
-    if options[:header] && count == 0
-      line.gsub!(/#\s*/,'') # correct comment like headers
-      field_names = line.split("\t")
-      get_field_numbers2extract(field_names, fields2extract)
-      field_numbers = fields2extract.values
-    else
-      fields = line.split("\t")
-      pat_record = field_numbers.map{|n| fields[n]}
-      if fields2extract[:pat_id_col].nil?
-        pat_id = "pat_#{count}" #generate ids
-      else
-        original_id = pat_record.shift
-        original_ids << original_id
-        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
-      end
-      patient_data[pat_id] = pat_record
-    end
-    count +=1
-  end
-  fields2extract[:pat_id_col].nil? ? patient_number = count : patient_number = original_ids.uniq.length
-  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
-  return patient_data, patient_number
+  File.open(options[:input_file]).each do |line|
+    line.chomp!
+    if options[:header] && count == 0
+      line.gsub!(/#\s*/,'') # correct comment like headers
+      field_names = line.split("\t")
+      get_field_numbers2extract(field_names, fields2extract)
+      field_numbers = fields2extract.values
+    else
+      fields = line.split("\t")
+      pat_record = field_numbers.map{|n| fields[n]}
+      if fields2extract[:pat_id_col].nil?
+        pat_id = "pat_#{count}" #generate ids
+      else
+        original_id = pat_record.shift
+        pat_id = original_id + "_i#{count}" # make sure that ids are uniq
+      end
+      if !pat_record[0].nil?
+        pat_record[0] = pat_record[0].split(options[:hpo_separator])
+      else
+        pat_record[0] = []
+      end
+      pat_record[2] = pat_record[2].to_i if !options[:start_col].nil?
+      pat_record[3] = pat_record[3].to_i if !options[:end_col].nil?
+      patient_data[pat_id] = pat_record
+    end
+    count +=1
+  end
+  options[:pat_id_col] = 'generated' if fields2extract[:pat_id_col].nil?
+  return patient_data
 end
 
 def get_fields2extract(options)
   fields2extract = {}
   [:pat_id_col, :hpo_col, :chromosome_col, :start_col, :end_col].each do |field|
@@ -551,6 +479,87 @@
 def get_field_numbers2extract(field_names, fields2extract)
   fields2extract.each do |field, name|
    fields2extract[field] = field_names.index(name)
   end
-end
\ No newline at end of file
+end
+
+def download(ftp_server, path, name)
+  ftp = Net::FTP.new()
+  ftp.connect(ftp_server)
+  ftp.login
+  ftp.getbinaryfile(path, name)
+  ftp.close
+end
+
+def get_and_parse_external_data(all_paths)
+  sources = [
+    ['ftp.ncbi.nlm.nih.gov', 'genomes/H_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz', all_paths[:gene_data]],
+    ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/biosystems_gene.gz', all_paths[:biosystems_gene]],
+    ['ftp.ncbi.nlm.nih.gov', 'pub/biosystems/CURRENT/bsid2info.gz', all_paths[:biosystems_info]]
+  ]
+  sources.each do |server, path, output|
+    download(server, path, output) if !File.exists?(output)
+  end
+
+  genes_with_kegg = {}
+  gene_location = {}
+  if !File.exists?(all_paths[:gene_data_with_pathways]) || !File.exists?(all_paths[:gene_location])
+    gene_list, gene_location = load_gene_data(all_paths[:gene_data])
+    ### kegg_data = parse_kegg_data(genes_found_attributes.keys)
+    kegg_data = parse_kegg_from_biosystems(all_paths[:biosystems_gene], all_paths[:biosystems_info])
+    genes_with_kegg = merge_genes_with_kegg_data(gene_list, kegg_data)
+    write_compressed_plain_file(genes_with_kegg, all_paths[:gene_data_with_pathways])
+    write_compressed_plain_file(gene_location, all_paths[:gene_location])
+  else
+    gene_location = read_compressed_json(all_paths[:gene_location])
+    genes_with_kegg = read_compressed_json(all_paths[:gene_data_with_pathways])
+  end
+  return gene_location, genes_with_kegg
+end
+
+def get_detailed_similarity(profile, candidates, evidences, hpo)
+  profile_length = profile.length
+  matrix = []
+  profile_length.times{ matrix << Array.new(candidates.length, 0)}
+  cand_number = 0
+  candidates.each do |candidate_id, similarity|
+    local_sim = []
+    candidate_evidence = evidences[candidate_id]
+    profile.each do |profile_term|
+      candidate_evidence.each do |candidate_term|
+        term_sim = hpo.compare([candidate_term], [profile_term], sim_type: :lin, bidirectional: false)
+        local_sim << [profile_term, candidate_term, term_sim]
+      end
+    end
+    local_sim.sort!{|s1, s2| s2.last <=> s1.last}
+
+    final_pairs = []
+    processed_profile_terms = []
+    processed_candidate_terms = []
+    local_sim.each do |pr_term, cd_term, sim|
+      if !processed_profile_terms.include?(pr_term) && !processed_candidate_terms.include?(cd_term)
+        final_pairs << [pr_term, cd_term, sim]
+        processed_profile_terms << pr_term
+        processed_candidate_terms << cd_term
+      end
+      break if profile_length == processed_profile_terms.length
+    end
+    final_pairs.each do |pr_term, cd_term, similarity|
+      matrix[profile.index(pr_term)][cand_number] = similarity
+    end
+    cand_number += 1
+  end
+  return matrix
+end
+
+def get_similarity_matrix(reference_prof, similarities, evidence_profiles, hpo, term_limit, candidate_limit)
+  candidates = similarities.to_a.sort{|s1, s2| s2.last <=> s1.last}.first(candidate_limit)
+  candidates_ids = candidates.map{|c| c.first}
+  candidate_similarity_matrix = get_detailed_similarity(reference_prof, candidates, evidence_profiles, hpo)
+  candidate_similarity_matrix.each_with_index do |row, i|
+    row.unshift(hpo.translate_id(reference_prof[i]))
+  end
+  candidate_similarity_matrix.sort!{|r1,r2| r2[1..r2.length].inject(0){|sum,n| sum +n} <=> r1[1..r1.length].inject(0){|sum,n| sum +n}}
+  candidate_similarity_matrix = candidate_similarity_matrix.first(term_limit)
+  return candidate_similarity_matrix, candidates, candidates_ids
+end
\ No newline at end of file
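
The new coor_overlap? predicate returns true only when the two ranges share a segment of positive length: taken together, its four clauses are equivalent to start < ref_stop && stop > ref_start (the fourth clause is already implied by the first two), so ranges that merely touch at an endpoint do not count as overlapping. A minimal sketch of the expected behaviour, with made-up coordinates:

    coor_overlap?(100, 200, 150, 250) #=> true   (partial overlap)
    coor_overlap?(100, 200, 120, 180) #=> true   (query inside reference)
    coor_overlap?(100, 200,  50, 300) #=> true   (query contains reference)
    coor_overlap?(100, 200, 200, 300) #=> false  (single-point contact only)
    coor_overlap?(100, 200, 250, 300) #=> false  (disjoint)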
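load_gene_data now keeps the primary gene name and its synonyms separate, and merge_genes_with_kegg_data reorders the merged tuple to [geneName, description, kegg_entry, geneSyn], which is exactly the order compute_pathway_enrichment destructures (geneName, description, pathways, geneSyns = data). For a hypothetical GeneID the two shapes look like this (the values are illustrative, not taken from the real annotation files):

    gene_list['1956']   #=> ['EGFR', ['ERBB', 'HER1'], 'epidermal growth factor receptor']
    merged_data['1956'] #=> ['EGFR', 'epidermal growth factor receptor', kegg_pathways, ['ERBB', 'HER1']]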
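load_patient_cohort now normalises each record while reading the cohort file: the HPO column is split into an array on options[:hpo_separator], the start/end coordinates are cast to integers when those columns are configured, and the second return value (the patient count) of the old version is gone. Assuming a tab-separated cohort file, and with hypothetical column names and separator chosen only for illustration, each stored record is shaped as [hpo_terms, chromosome, start, stop]:

    # Hypothetical option values, shown only to illustrate the record layout
    options = {
      input_file: 'cohort.txt', header: true,
      pat_id_col: 'patient', hpo_col: 'phenotypes', chromosome_col: 'chr',
      start_col: 'start', end_col: 'stop', hpo_separator: '|'
    }
    patient_data = load_patient_cohort(options)
    # patient_data['P001_i1'] #=> [['HP:0010974', 'HP:0001250'], '9', 131371492, 131375954]
    #                              HPO terms (split)              chr  start.to_i  stop.to_i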
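get_detailed_similarity scores every profile term against every candidate evidence term (Lin similarity computed through the hpo ontology object passed in, treated here as an opaque dependency) and then greedily keeps the best-scoring one-to-one pairs. The pairing step can be separated from the ontology dependency; a self-contained sketch of the same greedy selection over precomputed scores (the helper name and the data are invented for illustration):

    # Greedy one-to-one pairing over (profile_term, candidate_term, score) triples,
    # mirroring the selection loop inside get_detailed_similarity.
    def greedy_best_pairs(scored_pairs, profile_length)
      final_pairs = []
      used_profile = []
      used_candidate = []
      scored_pairs.sort_by{|triple| -triple.last}.each do |pr_term, cd_term, sim|
        next if used_profile.include?(pr_term) || used_candidate.include?(cd_term)
        final_pairs << [pr_term, cd_term, sim]
        used_profile << pr_term
        used_candidate << cd_term
        break if used_profile.length == profile_length
      end
      return final_pairs
    end

    pairs = [['HP:A', 'HP:X', 0.9], ['HP:A', 'HP:Y', 0.8], ['HP:B', 'HP:X', 0.7], ['HP:B', 'HP:Y', 0.2]]
    greedy_best_pairs(pairs, 2) #=> [['HP:A', 'HP:X', 0.9], ['HP:B', 'HP:Y', 0.2]]

Greedy matching is fast but not globally optimal: in the example above, pairing HP:A with HP:Y and HP:B with HP:X would give a higher total (1.5 vs 1.1), yet the greedy pass commits to the single best pair first.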