require 'rubygems' gem "bio", "~> 1.4.2" require 'bio' require 'snp_db_models' require 'activerecord-import' #This method guesses the reference sequence file format def guess_sequence_format(reference_genome) file_extension = File.extname(reference_genome).downcase file_format = nil case file_extension when ".gbk", ".genbank", ".gb" file_format = :genbank when ".embl", ".emb" file_format = :embl end return file_format end # A method to populate the database with the features (genes etc) and the annotations from the gbk/embl file. # We include all features that are not 'source' or 'gene' as they are repetitive info. 'CDS' is the gene. # The annotation table includes also the start and end coordinates of the CDS. The strand is also included. the 'locations' method is defined in bioruby under genbank. It must be required at the top (bio). #Also, the qualifier and value are extracted from the gbk/embl file and added to the database. def populate_features_and_annotations(sequence_file) puts "Adding features and their annotations...." ActiveRecord::Base.transaction do counter = 0 sequence_file.features.each do |feature| counter += 1 puts "Total number of features and annotations added: #{counter}" if counter % 100 == 0 unless feature.feature == "source" || feature.feature == "gene" db_feature = Feature.new db_feature.start = feature.locations.first.from db_feature.end = feature.locations.first.to db_feature.strand = feature.locations.first.strand db_feature.name = feature.feature db_feature.save # Populate the Annotation table with qualifier information from the genbank file feature.qualifiers.each do |qualifier| a = Annotation.new a.qualifier = qualifier.qualifier a.value = qualifier.value a.save db_feature.annotations << a end end end end end #This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes. def populate_snps_alleles_genotypes(vcf_file, cuttoff_snp, cuttoff_genotype) puts "Adding SNPs........" # open vcf file and parse each line File.open(vcf_file) do |f| # header names while line = f.gets if line =~ /CHROM/ line.chomp! column_headings = line.split("\t") strain_names = column_headings[9..-1] strain_names.map!{|name| name.sub(/\..*/, '')} strain_names.each do |str| ss = Strain.new ss.name = str ss.save end strains = Array.new strain_names.each do |strain_name| strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first strains << strain end good_snps = 0 # start parsing snps while line = f.gets #puts line details = line.split("\t") ref = details[0] ref_pos = details[1] ref_base = details[3] snp_base = details[4] snp_qual = details [5] samples = details[9..-1] next if ref_base.size != 1 || snp_base.size != 1 # exclude indels genotypes = samples.map do |s| pl, gt, gq = s.chomp.split(":") gt end genotypes_qualities = samples.map do |s| pl, gt, gq = s.chomp.split(":") gq end high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30 variant_genotypes = Array.new genotypes.each_with_index do |gt, index| if gt == "1/1" variant_genotypes << index if genotypes_qualities[index].to_i >= cuttoff_genotype high_quality_variant_genotypes << index end end end if snp_qual.to_i >= cuttoff_snp && genotypes.include?("1/1") && ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP if genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s good_snps +=1 # puts good_snps #create snp s = Snp.new s.ref_pos = ref_pos s.save # create ref allele ref_allele = Allele.new ref_allele.base = ref_base ref_allele.snp = s ref_allele.save s.reference_allele = ref_allele s.save # create snp allele snp_allele = Allele.new snp_allele.base = snp_base snp_allele.snp = s snp_allele.save ActiveRecord::Base.transaction do genos = [] genotypes.each_with_index do |gt, index| genotype = Genotype.new genotype.strain = strains[index] puts index if strains[index].nil? if gt == "0/0" # wild type genotype.allele = ref_allele elsif gt == "1/1" # snp type genotype.allele = snp_allele else puts "Strange SNP #{gt}" end genos << genotype end # Using activerecord-import to speed up importing Genotype.import genos, :validate => false puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0 end end end end end end end #Here we link the features to snps. Snp.all.each do |snp| x = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos).first snp.feature = x snp.save end end