lib/snp-search.rb in snp-search-1.0.0 vs lib/snp-search.rb in snp-search-2.0.0
- old
+ new
@@ -1,10 +1,11 @@
require 'rubygems'
gem "bio", "~> 1.4.2"
require 'bio'
require 'snp_db_models'
require 'activerecord-import'
+require 'diff/lcs'
#This method guesses the reference sequence file format
def guess_sequence_format(reference_genome)
file_extension = File.extname(reference_genome).downcase
file_format = nil
@@ -48,10 +49,11 @@
end
end
#This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
def populate_snps_alleles_genotypes(vcf_file, cuttoff_snp, cuttoff_genotype)
+
puts "Adding SNPs........"
# open vcf file and parse each line
File.open(vcf_file) do |f|
# header names
while line = f.gets
@@ -84,41 +86,60 @@
snp_base = details[4]
snp_qual = details [5]
format = details[8].split(":")
gt = format.index("GT")
gq = format.index("GQ")
+ # dp = format.index("DP")
samples = details[9..-1]
-
- next if ref_base.size != 1 || snp_base.size != 1 # exclude indels
+
+ next if ref_base.size != 1 || snp_base.size != 1 # exclude indels (e.g. G,A in REF)
genotypes = samples.map do |s|
- format_values = s.chomp.split(":")
- format_values[gt]
+ format_values = s.chomp.split(":") # output (e.g.): 0/0 \n 0,255,209 \n 99
+ format_values[gt] # e.g. 0/0
end
genotypes_qualities = samples.map do |s|
format_values = s.chomp.split(":")
- format_values[gq]
+ format_values[gq] # e.g. 99
end
- high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30
+ geno_quality_array = Array.new
+
+ high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30. Reminder: 0/0 is no SNP, 1/1 is SNP.
variant_genotypes = Array.new
- genotypes.each_with_index do |gt, index|
+ genotypes.each_with_index do |gt, index| # indexes each 'genotypes'.
if gt == "1/1"
- variant_genotypes << index
- if genotypes_qualities[index].to_i >= cuttoff_genotype
- high_quality_variant_genotypes << index
- end
- end
+ variant_genotypes << index # variant_genotypes is the position of genome positions that have a correct SNP with 1/1. if you want the total number of strains thats have 1/1 for that row (genome position) then puts variant_genotypes.size
+ if genotypes_qualities[index].to_i >= cuttoff_genotype.to_i
+ high_quality_variant_genotypes << index
+ end
+ end
end
-
- if snp_qual.to_i >= cuttoff_snp && genotypes.include?("1/1") && ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP
- if genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
+
+ genotypes_qualities.each do |gq|
+ if gq.to_i >= cuttoff_genotype.to_i
+ geno_quality_array << gq
+ end
+ end
+
+ # # high_quality_variant_genotypes is the position of 1/1 and genotype quality above cuttoff_genotype. high_quality_variant_genotypes.size will give you the number of 1/1 in a row (genome position) that is above the genotype quality cuttoff.
+ # puts "yay" if geno_quality_array.keep_if {|z| z <= cuttoff_genotype.to_i}
+
+ # next if geno_quality_array.each {|z| z.to_i < cuttoff_genotype.to_i}
+ next if samples.include?("./.")
+ next if geno_quality_array.size != strains.size
+ if snp_qual.to_i >= cuttoff_snp.to_i && genotypes.include?("1/1") && ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size
+ # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP
+
+ if genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
+
good_snps +=1
# puts good_snps
#create snp
s = Snp.new
s.ref_pos = ref_pos
+ s.qual = snp_qual
s.save
# create ref allele
ref_allele = Allele.new
ref_allele.base = ref_base
@@ -137,25 +158,27 @@
ActiveRecord::Base.transaction do
genos = []
genotypes.each_with_index do |gt, index|
genotype = Genotype.new
genotype.strain = strains[index]
+ genotype.geno_qual = genotypes_qualities[index].to_i
puts index if strains[index].nil?
if gt == "0/0" # wild type
genotype.allele = ref_allele
elsif gt == "1/1" # snp type
genotype.allele = snp_allele
- else
- puts "Strange SNP #{gt}"
+ else
+ puts "Strange SNP #{gt}"
end
genos << genotype
end
# Using activerecord-import to speed up importing
Genotype.import genos, :validate => false
puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0
end
end
+
end
end
end
end
end
@@ -170,10 +193,109 @@
def find_shared_snps(strain_names)
*strain_names = strain_names
where_statement = strain_names.collect{|strain_name| "strains.name = '#{strain_name}' OR "}.join("").sub(/ OR $/, "")
- puts "Snp.find_by_sql(\"SELECT * from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}\")"
+ Snp.find_by_sql("SELECT * from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}")
end
+def synonymous(sequence_file)
+ #Reference Sequence
+ genome_sequence = Bio::FlatFile.open(Bio::GenBank, sequence_file).next_entry
+ #Extract all nucleotide sequence from ORIGIN
+ all_seqs_original = genome_sequence.seq
+ ref_bases =[]
+
+ strains = Strain.all
+
+ strains_hash = Hash.new
+ # create a sequence hash
+ # hash key is strain_id, loop through strain_id
+ # create an empty array
+ strains.each do |strain|
+ strains_hash[strain.id] = Array.new
+ end
+
+ variants = Feature.find_by_sql("select distinct features.* from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id inner join genotypes on alleles.id = genotypes.allele_id inner join strains on strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'")
+
+ puts "start_cds_in_ref\tend_cds_in_ref\tpos_of_SNP_in_ref\tref_base\tSNP_base\tsynonymous or non-synonymous\tamino_acid_original\tamino_acid_change\tpossible_pseudogene?\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?"
+
+
+ variants.each do |variant|
+ variant.snps.each do |snp|
+ snp.alleles.each do |allele|
+ if allele.id != snp.reference_allele_id
+ all_seqs_mutated = genome_sequence.seq
+ mutated_seq_translated = []
+ original_seq_translated = []
+ all_seqs_mutated[snp.ref_pos.to_i-1] = allele.base
+
+ mutated_seq = Bio::Sequence.auto(all_seqs_mutated[variant.start-1..variant.end-1])
+ original_seq = Bio::Sequence.auto(all_seqs_original[variant.start-1..variant.end-1])
+
+ if variant.strand == -1
+ mutated_seq_translated << mutated_seq.reverse_complement.translate
+ original_seq_translated << original_seq.reverse_complement.translate
+
+ else
+ mutated_seq_translated << mutated_seq.translate
+ original_seq_translated << original_seq.translate
+
+ end
+
+ mutated_seq_translated.zip(original_seq_translated).each do |mut, org|
+ mutated_seq_translated_clean = mut.gsub(/\*$/,"")
+ original_seq_translated_clean = org.gsub(/\*$/,"")
+
+ hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"]
+ non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"]
+
+ polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"]
+ non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"]
+
+ small = ["V","C","A","G","D","N","S","T","P"]
+ non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"]
+
+ if original_seq_translated_clean == mutated_seq_translated_clean
+ # if original_seq_translated == mutated_seq_translated
+ if mutated_seq_translated_clean =~ /\*/
+ puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tsynonymous\t\t\tYes"
+ else
+ puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tsynonymous"
+ end
+ else
+
+ diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean)
+
+ if mutated_seq_translated_clean =~ /\*/
+ puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tnon-synonymous\t#{diffs[0][0].element}\t#{diffs[0][1].element}\tYes\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}"
+ else
+ puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tnon-synonymous\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}"
+ end
+ end
+ end
+
+ end
+ end
+ end
+ end
+
+ #Take all SNP positions in ref genome
+ # snp_positions = Feature.find_by_sql("select snps.ref_pos from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|snp| snp.ref_pos}
+
+ # # Take all SNP nucleotide
+ # snps = Feature.find_by_sql("select alleles.base from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|allele| allele.base}
+
+ # # Mutate (substitute) the original sequence with the SNPs
+
+ # # Here all_seqs_original are all the nucelotide sequences but with the snps subsituted in them
+
+ # #Get start position of CDS with SNP
+ # coordinates_start = Feature.find_by_sql("select start from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.start}
+
+ # #Get end position of CDS with SNP
+ # coordinates_end = Feature.find_by_sql("select end from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.end}
+
+
+end
\ No newline at end of file