lib/snp-search.rb in snp-search-1.0.0 vs lib/snp-search.rb in snp-search-2.0.0

- old
+ new

@@ -1,10 +1,11 @@ require 'rubygems' gem "bio", "~> 1.4.2" require 'bio' require 'snp_db_models' require 'activerecord-import' +require 'diff/lcs' #This method guesses the reference sequence file format def guess_sequence_format(reference_genome) file_extension = File.extname(reference_genome).downcase file_format = nil @@ -48,10 +49,11 @@ end end #This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes. def populate_snps_alleles_genotypes(vcf_file, cuttoff_snp, cuttoff_genotype) + puts "Adding SNPs........" # open vcf file and parse each line File.open(vcf_file) do |f| # header names while line = f.gets @@ -84,41 +86,60 @@ snp_base = details[4] snp_qual = details [5] format = details[8].split(":") gt = format.index("GT") gq = format.index("GQ") + # dp = format.index("DP") samples = details[9..-1] - - next if ref_base.size != 1 || snp_base.size != 1 # exclude indels + + next if ref_base.size != 1 || snp_base.size != 1 # exclude indels (e.g. G,A in REF) genotypes = samples.map do |s| - format_values = s.chomp.split(":") - format_values[gt] + format_values = s.chomp.split(":") # output (e.g.): 0/0 \n 0,255,209 \n 99 + format_values[gt] # e.g. 0/0 end genotypes_qualities = samples.map do |s| format_values = s.chomp.split(":") - format_values[gq] + format_values[gq] # e.g. 99 end - high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30 + geno_quality_array = Array.new + + high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30. Reminder: 0/0 is no SNP, 1/1 is SNP. variant_genotypes = Array.new - genotypes.each_with_index do |gt, index| + genotypes.each_with_index do |gt, index| # indexes each 'genotypes'. if gt == "1/1" - variant_genotypes << index - if genotypes_qualities[index].to_i >= cuttoff_genotype - high_quality_variant_genotypes << index - end - end + variant_genotypes << index # variant_genotypes is the position of genome positions that have a correct SNP with 1/1. if you want the total number of strains thats have 1/1 for that row (genome position) then puts variant_genotypes.size + if genotypes_qualities[index].to_i >= cuttoff_genotype.to_i + high_quality_variant_genotypes << index + end + end end - - if snp_qual.to_i >= cuttoff_snp && genotypes.include?("1/1") && ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP - if genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s + + genotypes_qualities.each do |gq| + if gq.to_i >= cuttoff_genotype.to_i + geno_quality_array << gq + end + end + + # # high_quality_variant_genotypes is the position of 1/1 and genotype quality above cuttoff_genotype. high_quality_variant_genotypes.size will give you the number of 1/1 in a row (genome position) that is above the genotype quality cuttoff. + # puts "yay" if geno_quality_array.keep_if {|z| z <= cuttoff_genotype.to_i} + + # next if geno_quality_array.each {|z| z.to_i < cuttoff_genotype.to_i} + next if samples.include?("./.") + next if geno_quality_array.size != strains.size + if snp_qual.to_i >= cuttoff_snp.to_i && genotypes.include?("1/1") && ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size + # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP + + if genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s + good_snps +=1 # puts good_snps #create snp s = Snp.new s.ref_pos = ref_pos + s.qual = snp_qual s.save # create ref allele ref_allele = Allele.new ref_allele.base = ref_base @@ -137,25 +158,27 @@ ActiveRecord::Base.transaction do genos = [] genotypes.each_with_index do |gt, index| genotype = Genotype.new genotype.strain = strains[index] + genotype.geno_qual = genotypes_qualities[index].to_i puts index if strains[index].nil? if gt == "0/0" # wild type genotype.allele = ref_allele elsif gt == "1/1" # snp type genotype.allele = snp_allele - else - puts "Strange SNP #{gt}" + else + puts "Strange SNP #{gt}" end genos << genotype end # Using activerecord-import to speed up importing Genotype.import genos, :validate => false puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0 end end + end end end end end @@ -170,10 +193,109 @@ def find_shared_snps(strain_names) *strain_names = strain_names where_statement = strain_names.collect{|strain_name| "strains.name = '#{strain_name}' OR "}.join("").sub(/ OR $/, "") - puts "Snp.find_by_sql(\"SELECT * from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}\")" + Snp.find_by_sql("SELECT * from snps INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id INNER JOIN strains ON strains.id = genotypes.strain_id WHERE (#{where_statement}) AND alleles.id <> snps.reference_allele_id AND (SELECT COUNT(*) from snps AS s INNER JOIN alleles ON alleles.snp_id = snps.id INNER JOIN genotypes ON alleles.id = genotypes.allele_id WHERE alleles.id <> snps.reference_allele_id and s.id = snps.id) = #{strain_names.size} GROUP BY snps.id HAVING COUNT(*) = #{strain_names.size}") end +def synonymous(sequence_file) + #Reference Sequence + genome_sequence = Bio::FlatFile.open(Bio::GenBank, sequence_file).next_entry + #Extract all nucleotide sequence from ORIGIN + all_seqs_original = genome_sequence.seq + ref_bases =[] + + strains = Strain.all + + strains_hash = Hash.new + # create a sequence hash + # hash key is strain_id, loop through strain_id + # create an empty array + strains.each do |strain| + strains_hash[strain.id] = Array.new + end + + variants = Feature.find_by_sql("select distinct features.* from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id inner join genotypes on alleles.id = genotypes.allele_id inner join strains on strains.id = genotypes.strain_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'") + + puts "start_cds_in_ref\tend_cds_in_ref\tpos_of_SNP_in_ref\tref_base\tSNP_base\tsynonymous or non-synonymous\tamino_acid_original\tamino_acid_change\tpossible_pseudogene?\tchange_in_hydrophobicity_of_AA?\tchange_in_polarisation_of_AA?\tchange_in_size_of_AA?" + + + variants.each do |variant| + variant.snps.each do |snp| + snp.alleles.each do |allele| + if allele.id != snp.reference_allele_id + all_seqs_mutated = genome_sequence.seq + mutated_seq_translated = [] + original_seq_translated = [] + all_seqs_mutated[snp.ref_pos.to_i-1] = allele.base + + mutated_seq = Bio::Sequence.auto(all_seqs_mutated[variant.start-1..variant.end-1]) + original_seq = Bio::Sequence.auto(all_seqs_original[variant.start-1..variant.end-1]) + + if variant.strand == -1 + mutated_seq_translated << mutated_seq.reverse_complement.translate + original_seq_translated << original_seq.reverse_complement.translate + + else + mutated_seq_translated << mutated_seq.translate + original_seq_translated << original_seq.translate + + end + + mutated_seq_translated.zip(original_seq_translated).each do |mut, org| + mutated_seq_translated_clean = mut.gsub(/\*$/,"") + original_seq_translated_clean = org.gsub(/\*$/,"") + + hydrophobic = ["I", "L", "V", "C", "A", "G", "M", "F", "Y", "W", "H", "T"] + non_hydrophobic = ["K", "E", "Q", "D", "N", "S", "P", "B"] + + polar = ["Y", "W", "H", "K", "R", "E", "Q", "D", "N", "S", "P", "B"] + non_polar = ["I", "L", "V", "C", "A", "G", "M", "F", "T"] + + small = ["V","C","A","G","D","N","S","T","P"] + non_small = ["I","L","M","F","Y","W","H","K","R","E","Q"] + + if original_seq_translated_clean == mutated_seq_translated_clean + # if original_seq_translated == mutated_seq_translated + if mutated_seq_translated_clean =~ /\*/ + puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tsynonymous\t\t\tYes" + else + puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tsynonymous" + end + else + + diffs = Diff::LCS.diff(original_seq_translated_clean, mutated_seq_translated_clean) + + if mutated_seq_translated_clean =~ /\*/ + puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tnon-synonymous\t#{diffs[0][0].element}\t#{diffs[0][1].element}\tYes\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}" + else + puts "#{variant.start}\t#{variant.end}\t#{snp.ref_pos}\t#{all_seqs_original[snp.ref_pos.to_i-1].upcase}\t#{(allele.base).upcase}\tnon-synonymous\t#{diffs[0][0].element}\t#{diffs[0][1].element}\t\t#{'Yes' if (hydrophobic.include? diffs[0][0].element) == (non_hydrophobic.include? diffs[0][1].element)}\t#{'Yes' if (polar.include? diffs[0][0].element) == (non_polar.include? diffs[0][1].element)}\t#{'Yes' if (small.include? diffs[0][0].element) == (non_small.include? diffs[0][1].element)}" + end + end + end + + end + end + end + end + + #Take all SNP positions in ref genome + # snp_positions = Feature.find_by_sql("select snps.ref_pos from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|snp| snp.ref_pos} + + # # Take all SNP nucleotide + # snps = Feature.find_by_sql("select alleles.base from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where alleles.id <> snps.reference_allele_id and features.name = 'CDS'").map{|allele| allele.base} + + # # Mutate (substitute) the original sequence with the SNPs + + # # Here all_seqs_original are all the nucelotide sequences but with the snps subsituted in them + + # #Get start position of CDS with SNP + # coordinates_start = Feature.find_by_sql("select start from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.start} + + # #Get end position of CDS with SNP + # coordinates_end = Feature.find_by_sql("select end from features inner join snps on features.id = snps.feature_id inner join alleles on snps.id = alleles.snp_id where features.name = 'CDS' and alleles.id <> snps.reference_allele_id").map{|feature| feature.end} + + +end \ No newline at end of file