snp-search.rb in snp-search-2.0.0

- old
+ new

@@ -1,10 +1,11 @@
 require 'rubygems'
 gem "bio", "~> 1.4.2"
 require 'bio'
 require  'snp_db_models'
 require 'activerecord-import'
+require 'diff/lcs'
 
 #This method guesses the reference sequence file format
 def guess_sequence_format(reference_genome)
   file_extension = File.extname(reference_genome).downcase
   file_format = nil
@@ -48,10 +49,11 @@
 	end
 end
 
 #This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
 def populate_snps_alleles_genotypes(vcf_file, cuttoff_snp, cuttoff_genotype)
+
 puts "Adding SNPs........"
 # open vcf file and parse each line
 	File.open(vcf_file) do |f|
 	  # header names
 		while line = f.gets
@@ -84,41 +86,60 @@
 				    snp_base = details[4]
 				    snp_qual = details [5]
 				    format = details[8].split(":")
 				    gt = format.index("GT")
 				    gq = format.index("GQ")
+				    # dp = format.index("DP")
 				    samples = details[9..-1]
-			     	
-			     	next if ref_base.size != 1 || snp_base.size != 1 # exclude indels
+
+			     	next if ref_base.size != 1 || snp_base.size != 1 # exclude indels (e.g. G,A in REF)
 				    genotypes = samples.map do |s| 
-				      format_values = s.chomp.split(":")
-				      format_values[gt]
+				      format_values = s.chomp.split(":") # output (e.g.): 0/0 \n 0,255,209 \n 99
+				      format_values[gt] # e.g. 0/0  
 				    end
 
 				    genotypes_qualities = samples.map do |s| 
 				      format_values = s.chomp.split(":")
-				      format_values[gq]
+				      format_values[gq] # e.g. 99
 				    end
 
-				    high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30
+				    geno_quality_array = Array.new
+
+				    high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30. Reminder: 0/0 is no SNP, 1/1 is SNP.
 				    variant_genotypes = Array.new
-				    genotypes.each_with_index do |gt, index|
+				    genotypes.each_with_index do |gt, index| # indexes each 'genotypes'.
 				    	if gt == "1/1"
-					        variant_genotypes << index
-					        if genotypes_qualities[index].to_i >= cuttoff_genotype
-					          high_quality_variant_genotypes << index
-					        end
-				    	end
+					        variant_genotypes << index # variant_genotypes is the position of genome positions that have a correct SNP with 1/1.  if you want the total number of strains thats have 1/1 for that row (genome position) then puts variant_genotypes.size
+					        if genotypes_qualities[index].to_i >= cuttoff_genotype.to_i
+					           high_quality_variant_genotypes << index        
+					    	end
+						end
 					end
-			    
-					if snp_qual.to_i >= cuttoff_snp && genotypes.include?("1/1") &&  ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP
-				    	if  genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
+
+					genotypes_qualities.each do |gq|
+						if gq.to_i >= cuttoff_genotype.to_i
+							geno_quality_array << gq
+						end
+					end
+
+					# 	# high_quality_variant_genotypes is the position of 1/1 and genotype quality above cuttoff_genotype. high_quality_variant_genotypes.size will give you the number of 1/1 in a row (genome position) that is above the genotype quality cuttoff.
+					# puts "yay" if geno_quality_array.keep_if {|z| z <= cuttoff_genotype.to_i}
+
+					# next if geno_quality_array.each {|z| z.to_i < cuttoff_genotype.to_i}
+					next if samples.include?("./.")
+					 next if geno_quality_array.size != strains.size
+						if snp_qual.to_i >= cuttoff_snp.to_i && genotypes.include?("1/1") &&  ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size
+					   # first condition checks the overall quality of the SNP 	is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant 	type with quality >= 30 and informative SNP
+						
+				    	 if  genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
+				    	 
 					        good_snps +=1
 					        # puts good_snps
 					        #create snp
 					        s = Snp.new
 					        s.ref_pos = ref_pos
+					        s.qual = snp_qual
 					        s.save
 					        					 
 					   #  create ref allele
 					        ref_allele = Allele.new
 					        ref_allele.base = ref_base
@@ -137,25 +158,27 @@
 					     	ActiveRecord::Base.transaction do
 						    	genos = []
 							    genotypes.each_with_index do |gt, index|
 							         genotype = Genotype.new
 							         genotype.strain = strains[index]
+							         genotype.geno_qual = genotypes_qualities[index].to_i
 							    	 puts index if strains[index].nil?
 							          if gt == "0/0" # wild type
 							             genotype.allele = ref_allele
 							          elsif gt == "1/1" # snp type
 							             genotype.allele = snp_allele
-							          else
-							            puts "Strange SNP #{gt}"
+							           else
+							             puts "Strange SNP #{gt}"
 							          end
 							          genos << genotype
 								end
 								# Using activerecord-import to speed up importing
 						        Genotype.import genos, :validate => false 
 								 puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0
 							end
 				      	end
+				     
 			    	end
 			    end
 			end
 		end
 	end
@@ -170,10 +193,109 @@
 def find_shared_snps(strain_names)
     *strain_names = strain_names
      
    where_statement = strain_names.collect{|strain_name| "strains.name = '#{strain_name}' OR "}.join("").sub(/ OR $/, "")
 
-   puts "Snp.find_by_sql(\"SELECT * from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}\")"
+   Snp.find_by_sql("SELECT * from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
 end
 
+def synonymous(sequence_file)
 
+	#Reference Sequence
+	genome_sequence = Bio::FlatFile.open(Bio::GenBank, sequence_file).next_entry
 
+	#Extract all nucleotide sequence from ORIGIN
+	all_seqs_original = genome_sequence.seq
+	ref_bases =[]
+
+	 strains = Strain.all
+
+	  strains_hash = Hash.new
+      # create a sequence hash
+      # hash key is strain_id, loop through strain_id
+      # create an empty array
+      strains.each do |strain|
+        strains_hash[strain.id] = Array.new
+      end
+
+	variants = Feature.find_by_sql("select distinct features.* from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id inner join genotypes on alleles.id = genotypes.allele_id inner join strains on strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'")
+
+	puts "start_cds_in_ref\tend_cds_in_ref\tpos_of_SNP_in_ref\tref_base\tSNP_base\tsynonymous or non-synonymous\tamino_acid_original\tamino_acid_change\tpossible_pseudogene?\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?"
+
+
+	variants.each do |variant|
+		variant.snps.each do |snp|
+			snp.alleles.each do |allele|
+				if allele.id != snp.reference_allele_id
+					all_seqs_mutated = genome_sequence.seq
+					mutated_seq_translated = []
+					original_seq_translated = []
+					all_seqs_mutated[snp.ref_pos.to_i-1] = allele.base
+
+					mutated_seq = Bio::Sequence.auto(all_seqs_mutated[variant.start-1..variant.end-1])
+					original_seq =  Bio::Sequence.auto(all_seqs_original[variant.start-1..variant.end-1])
+
+					if variant.strand == -1
+						mutated_seq_translated << mutated_seq.reverse_complement.translate
+						original_seq_translated << original_seq.reverse_complement.translate
+					    
+					else
+						mutated_seq_translated << mutated_seq.translate
+						original_seq_translated << original_seq.translate
+						
+				 	end
+
+					 	mutated_seq_translated.zip(original_seq_translated).each do |mut, org|
+					 		mutated_seq_translated_clean = mut.gsub(/\*$/,"")
+					 		original_seq_translated_clean = org.gsub(/\*$/,"")
+
+							hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"]
+							non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"]
+
+							polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"]
+							non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"]
+
+							small = ["V","C","A","G","D","N","S","T","P"]
+							non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]
+
+							if original_seq_translated_clean == mutated_seq_translated_clean
+							# if original_seq_translated == mutated_seq_translated
+								if mutated_seq_translated_clean =~ /\*/
+									puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tsynonymous\t\t\tYes"
+								else
+									puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tsynonymous"
+								end
+							else
+								
+								diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean)
+								
+								if mutated_seq_translated_clean =~ /\*/
+									puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tnon-synonymous\t#{diffs[0][0].element}\t#{diffs[0][1].element}\tYes\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}"
+								else
+									puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tnon-synonymous\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}"
+								end
+							end
+						end
+					
+				end
+			end
+		end
+	end
+
+	#Take all SNP positions in ref genome
+	# snp_positions = Feature.find_by_sql("select snps.ref_pos from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|snp| snp.ref_pos}
+
+	# # Take all SNP nucleotide
+	# snps = Feature.find_by_sql("select alleles.base from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|allele| allele.base}
+
+	# # Mutate (substitute) the original sequence with the SNPs
+	
+	# # Here all_seqs_original are all the nucelotide sequences but with the snps subsituted in them
+
+	# #Get start position of CDS with SNP
+	# coordinates_start = Feature.find_by_sql("select start from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.start}
+
+	# #Get end position of CDS with SNP
+	# coordinates_end = Feature.find_by_sql("select end from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.end}	
+
+	
+end
\ No newline at end of file