snp-search.rb in snp-search-2.3.0

- old
+ new
@@ -1,301 +1,39 @@
 require 'rubygems'
-gem "bio", "~> 1.4.2"
 require 'bio'
 require  'snp_db_models'
 require 'activerecord-import'
 require 'diff/lcs'
+require 'create_methods'
+require 'filter_ignore_snps_methods'
+require 'output_information_methods'
 
-#This method guesses the reference sequence file format
-def guess_sequence_format(reference_genome)
-  file_extension = File.extname(reference_genome).downcase
-  file_format = nil
-  case file_extension
-  when ".gbk", ".genbank", ".gb"
-    file_format = :genbank
-  when ".embl", ".emb"
-    file_format = :embl
-  end
-  return file_format
-end
+def find_unqiue_snps(strain_names, out, cuttoff_genotype, cuttoff_snp)
 
-# A method to populate the database with the features (genes etc) and the annotations from the gbk/embl file.  
-# We include all features that are not 'source' or 'gene' as they are repetitive info.  'CDS' is the gene.
-# The annotation table includes also the start and end coordinates of the CDS.  The strand is also included.  the 'locations' method is defined in bioruby under genbank.  It must be required at the top (bio).
-#Also, the qualifier and value are extracted from the gbk/embl file and added to the database.
-def populate_features_and_annotations(sequence_file)
-	puts "Adding features and their annotations...."
-	 ActiveRecord::Base.transaction do
-	 	counter = 0
-		sequence_file.features.each do |feature|
-			counter += 1
-			puts "Total number of features and annotations added: #{counter}" if counter % 100 == 0
-			unless feature.feature == "source" || feature.feature == "gene"
-				db_feature = Feature.new
-				db_feature.start = feature.locations.first.from
-				db_feature.end = feature.locations.first.to
-				db_feature.strand = feature.locations.first.strand
-				db_feature.name = feature.feature
-				db_feature.save
-				# Populate the Annotation table with qualifier information from the genbank file
-			feature.qualifiers.each do |qualifier|
-				a = Annotation.new
-				a.qualifier = qualifier.qualifier
-				a.value = qualifier.value
-				a.save
-				db_feature.annotations << a
-			end
-			end
-		end
-	end
-end
+  *strain_names = strain_names
 
-#This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
-def populate_snps_alleles_genotypes(vcf_file, cuttoff_snp, cuttoff_genotype)
+  where_statement = strain_names.collect{|strain_name| "strains.name = '#{strain_name}' OR "}.join("").sub(/ OR $/, "")
 
-puts "Adding SNPs........"
-# open vcf file and parse each line
-	File.open(vcf_file) do |f|
-	  # header names
-		while line = f.gets
-	  		if  line =~ /CHROM/
-	  			line.chomp!
-				column_headings = line.split("\t")
-				strain_names = column_headings[9..-1]
-				strain_names.map!{|name| name.sub(/\..*/, '')}
-					
-				strain_names.each do |str|
-					ss = Strain.new
-					ss.name = str
-					ss.save
-				end
-				
-				strains = Array.new
-			 	strain_names.each do |strain_name|
-			   		strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
-			   		strains << strain
-				 end
+  outfile = File.open(out, "w")
 
-		 		good_snps = 0
-		  		# start parsing snps
-				while line = f.gets
-					#puts line
-				    details = line.split("\t")
-				    ref = details[0]
-				    ref_pos = details[1]
-				    ref_base = details[3]
-				    snp_base = details[4]
-				    snp_qual = details [5]
-				    format = details[8].split(":")
-				    gt = format.index("GT")
-				    gq = format.index("GQ")
-				    # dp = format.index("DP")
-				    samples = details[9..-1]
+   snps = Snp.find_by_sql("SELECT snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND genotypes.geno_qual >= #{cuttoff_genotype} AND snps.qual >= #{cuttoff_snp} AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
 
-			     	next if ref_base.size != 1 || snp_base.size != 1 # exclude indels (e.g. G,A in REF)
-				    genotypes = samples.map do |s| 
-				      format_values = s.chomp.split(":") # output (e.g.): 0/0 \n 0,255,209 \n 99
-				      format_values[gt] # e.g. 0/0  
-				    end
+   puts "The number of unique snps are #{snps.size}"
 
-				    genotypes_qualities = samples.map do |s| 
-				      format_values = s.chomp.split(":")
-				      format_values[gq] # e.g. 99
-				    end
+   output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, false)
+end
 
-				    geno_quality_array = Array.new
 
-				    high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30. Reminder: 0/0 is no SNP, 1/1 is SNP.
-				    variant_genotypes = Array.new
-				    genotypes.each_with_index do |gt, index| # indexes each 'genotypes'.
-				    	if gt == "1/1"
-					        variant_genotypes << index # variant_genotypes is the position of genome positions that have a correct SNP with 1/1.  if you want the total number of strains thats have 1/1 for that row (genome position) then puts variant_genotypes.size
-					        if genotypes_qualities[index].to_i >= cuttoff_genotype.to_i
-					           high_quality_variant_genotypes << index        
-					    	end
-						end
-					end
+def information(out, cuttoff_genotype, cuttoff_snp)
 
-					genotypes_qualities.each do |gq|
-						if gq.to_i >= cuttoff_genotype.to_i
-							geno_quality_array << gq
-						end
-					end
+  puts "outputting SNP info....."
+  
+  strains = Strain.all
 
-					# 	# high_quality_variant_genotypes is the position of 1/1 and genotype quality above cuttoff_genotype. high_quality_variant_genotypes.size will give you the number of 1/1 in a row (genome position) that is above the genotype quality cuttoff.
-					# puts "yay" if geno_quality_array.keep_if {|z| z <= cuttoff_genotype.to_i}
+  snps = Snp.find_by_sql("SELECT distinct snps.* from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id")
 
-					# next if geno_quality_array.each {|z| z.to_i < cuttoff_genotype.to_i}
-					next if samples.include?("./.")
-					 next if geno_quality_array.size != strains.size
-						if snp_qual.to_i >= cuttoff_snp.to_i && genotypes.include?("1/1") &&  ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size
-					   # first condition checks the overall quality of the SNP 	is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant 	type with quality >= 30 and informative SNP
-						
-				    	 if  genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
-				    	 
-					        good_snps +=1
-					        # puts good_snps
-					        #create snp
-					        s = Snp.new
-					        s.ref_pos = ref_pos
-					        s.qual = snp_qual
-					        s.save
-					        					 
-					   #  create ref allele
-					        ref_allele = Allele.new
-					        ref_allele.base = ref_base
-					        ref_allele.snp = s
-					        ref_allele.save
+  outfile = File.open(out, "w")
 
-					        s.reference_allele = ref_allele
-					        s.save
+  output_information_methods(snps, outfile, cuttoff_genotype, cuttoff_snp, true)
 
-					        # create snp allele
-					        snp_allele = Allele.new
-					        snp_allele.base = snp_base
-					        snp_allele.snp = s
-					        snp_allele.save
-					        
-					     	ActiveRecord::Base.transaction do
-						    	genos = []
-							    genotypes.each_with_index do |gt, index|
-							         genotype = Genotype.new
-							         genotype.strain = strains[index]
-							         genotype.geno_qual = genotypes_qualities[index].to_i
-							    	 puts index if strains[index].nil?
-							          if gt == "0/0" # wild type
-							             genotype.allele = ref_allele
-							          elsif gt == "1/1" # snp type
-							             genotype.allele = snp_allele
-							           else
-							             puts "Strange SNP #{gt}"
-							          end
-							          genos << genotype
-								end
-								# Using activerecord-import to speed up importing
-						        Genotype.import genos, :validate => false 
-								 puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0
-							end
-				      	end
-				     
-			    	end
-			    end
-			end
-		end
-	end
-	#Here we link the features to snps.
-	Snp.all.each do |snp|
-		x = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos).first
-		snp.feature = x
-		snp.save
-	end
 end
 
-def find_shared_snps(strain_names)
-    *strain_names = strain_names
-     
-   where_statement = strain_names.collect{|strain_name| "strains.name = '#{strain_name}' OR "}.join("").sub(/ OR $/, "")
-
-   Snp.find_by_sql("SELECT * from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
-end
-
-def synonymous(sequence_file)
-
-	#Reference Sequence
-	genome_sequence = Bio::FlatFile.open(Bio::GenBank, sequence_file).next_entry
-
-	#Extract all nucleotide sequence from ORIGIN
-	all_seqs_original = genome_sequence.seq
-	ref_bases =[]
-
-	 strains = Strain.all
-
-	  strains_hash = Hash.new
-      # create a sequence hash
-      # hash key is strain_id, loop through strain_id
-      # create an empty array
-      strains.each do |strain|
-        strains_hash[strain.id] = Array.new
-      end
-
-	variants = Feature.find_by_sql("select distinct features.* from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id inner join genotypes on alleles.id = genotypes.allele_id inner join strains on strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'")
-
-	puts "start_cds_in_ref\tend_cds_in_ref\tpos_of_SNP_in_ref\tref_base\tSNP_base\tsynonymous or non-synonymous\tamino_acid_original\tamino_acid_change\tpossible_pseudogene?\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?"
-
-
-	variants.each do |variant|
-		variant.snps.each do |snp|
-			snp.alleles.each do |allele|
-				if allele.id != snp.reference_allele_id
-					all_seqs_mutated = genome_sequence.seq
-					mutated_seq_translated = []
-					original_seq_translated = []
-					all_seqs_mutated[snp.ref_pos.to_i-1] = allele.base
-
-					mutated_seq = Bio::Sequence.auto(all_seqs_mutated[variant.start-1..variant.end-1])
-					original_seq =  Bio::Sequence.auto(all_seqs_original[variant.start-1..variant.end-1])
-
-					if variant.strand == -1
-						mutated_seq_translated << mutated_seq.reverse_complement.translate
-						original_seq_translated << original_seq.reverse_complement.translate
-					    
-					else
-						mutated_seq_translated << mutated_seq.translate
-						original_seq_translated << original_seq.translate
-						
-				 	end
-
-					 	mutated_seq_translated.zip(original_seq_translated).each do |mut, org|
-					 		mutated_seq_translated_clean = mut.gsub(/\*$/,"")
-					 		original_seq_translated_clean = org.gsub(/\*$/,"")
-
-							hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"]
-							non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"]
-
-							polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"]
-							non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"]
-
-							small = ["V","C","A","G","D","N","S","T","P"]
-							non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]
-
-							if original_seq_translated_clean == mutated_seq_translated_clean
-							# if original_seq_translated == mutated_seq_translated
-								if mutated_seq_translated_clean =~ /\*/
-									puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tsynonymous\t\t\tYes"
-								else
-									puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tsynonymous"
-								end
-							else
-								
-								diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean)
-								
-								if mutated_seq_translated_clean =~ /\*/
-									puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tnon-synonymous\t#{diffs[0][0].element}\t#{diffs[0][1].element}\tYes\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}"
-								else
-									puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tnon-synonymous\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}"
-								end
-							end
-						end
-					
-				end
-			end
-		end
-	end
-
-	#Take all SNP positions in ref genome
-	# snp_positions = Feature.find_by_sql("select snps.ref_pos from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|snp| snp.ref_pos}
-
-	# # Take all SNP nucleotide
-	# snps = Feature.find_by_sql("select alleles.base from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|allele| allele.base}
-
-	# # Mutate (substitute) the original sequence with the SNPs
-	
-	# # Here all_seqs_original are all the nucelotide sequences but with the snps subsituted in them
-
-	# #Get start position of CDS with SNP
-	# coordinates_start = Feature.find_by_sql("select start from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.start}
-
-	# #Get end position of CDS with SNP
-	# coordinates_end = Feature.find_by_sql("select end from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.end}	
-
-	
-end
\ No newline at end of file