require 'rubygems'
gem "bio", "~> 1.4.2"
require 'bio'
require  'snp_db_models'
require 'activerecord-import'

#This method guesses the reference sequence file format
def guess_sequence_format(reference_genome)
  file_extension = File.extname(reference_genome).downcase
  file_format = nil
  case file_extension
  when ".gbk", ".genbank", ".gb"
    file_format = :genbank
  when ".embl", ".emb"
    file_format = :embl
  end
  return file_format
end

# A method to populate the database with the features (genes etc) and the annotations from the gbk/embl file.  
# We include all features that are not 'source' or 'gene' as they are repetitive info.  'CDS' is the gene.
# The annotation table includes also the start and end coordinates of the CDS.  The strand is also included.  the 'locations' method is defined in bioruby under genbank.  It must be required at the top (bio).
#Also, the qualifier and value are extracted from the gbk/embl file and added to the database.
def populate_features_and_annotations(sequence_file)
	puts "Adding features and their annotations...."
	 ActiveRecord::Base.transaction do
	 	counter = 0
		sequence_file.features.each do |feature|
			counter += 1
			puts "Total number of features and annotations added: #{counter}" if counter % 100 == 0
			unless feature.feature == "source" || feature.feature == "gene"
				db_feature = Feature.new
				db_feature.start = feature.locations.first.from
				db_feature.end = feature.locations.first.to
				db_feature.strand = feature.locations.first.strand
				db_feature.name = feature.feature
				db_feature.save
				# Populate the Annotation table with qualifier information from the genbank file
			feature.qualifiers.each do |qualifier|
				a = Annotation.new
				a.qualifier = qualifier.qualifier
				a.value = qualifier.value
				a.save
				db_feature.annotations << a
			end
			end
		end
	end
end

#This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
def populate_snps_alleles_genotypes(vcf_file, cuttoff_snp, cuttoff_genotype)
puts "Adding SNPs........"
# open vcf file and parse each line
	File.open(vcf_file) do |f|
	  # header names
		while line = f.gets.chomp!
	  		if  line =~ /CHROM/
				column_headings = line.split("\t")
				strain_names = column_headings[9..-1]
				strain_names.map!{|name| name.sub(/\..*/, '')}
					
				strain_names.each do |str|
					ss = Strain.new
					ss.name = str
					ss.save
				end
				
				strains = Array.new
			 	strain_names.each do |strain_name|
			   		strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
			   		strains << strain
				 end

		 		good_snps = 0
		  		# start parsing snps
				while line = f.gets
					#puts line
				    details = line.split("\t")
				    ref = details[0]
				    ref_pos = details[1]
				    ref_base = details[3]
				    snp_base = details[4]
				    snp_qual = details [5]
				    samples = details[9..-1]
			     	
			     	next if ref_base.size != 1 || snp_base.size != 1 # exclude indels
				    genotypes = samples.map do |s| 
				      pl, gt, gq = s.chomp.split(":")
				      gt
				    end

				    genotypes_qualities = samples.map do |s| 
				      pl, gt, gq = s.chomp.split(":")
				      gq
				    end

				    high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30
				    variant_genotypes = Array.new
				    genotypes.each_with_index do |gt, index|
				    	if gt == "1/1"
					        variant_genotypes << index
					        if genotypes_qualities[index].to_i >= cuttoff_genotype
					          high_quality_variant_genotypes << index
					        end
				    	end
					end
			    
					if snp_qual.to_i >= cuttoff_snp && genotypes.include?("1/1") &&  ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP
				    	if  genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
					        good_snps +=1
					        # puts good_snps
					        #create snp
					        s = Snp.new
					        s.ref_pos = ref_pos
					        s.save
					        					 
					   #  create ref allele
					        ref_allele = Allele.new
					        ref_allele.base = ref_base
					        ref_allele.snp = s
					        ref_allele.save

					        s.reference_allele = ref_allele
					        s.save

					        # create snp allele
					        snp_allele = Allele.new
					        snp_allele.base = snp_base
					        snp_allele.snp = s
					        snp_allele.save

					     	ActiveRecord::Base.transaction do
							    genotypes.each_with_index do |gt, index|
							        genotype = Genotype.new
							        genotype.strain = strains[index]
							        puts index if strains[index].nil?
							          if gt == "0/0" # wild type
							            genotype.allele = ref_allele
							          elsif gt == "1/1" # snp type
							            genotype.allele = snp_allele
							          else
							            puts "Strange SNP #{gt}"
							          end
							        genotype.save
								end
								puts "Total SNPs added so far: #{good_snps}" if good_snps % 100 == 0
							end
				      	end
			    	end
			    end
			end
		end
	end
	#Here we link the features to snps.
	Snp.all.each do |snp|
		x = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos).first
		snp.feature = x
		snp.save
	end
end