require 'rubygems'
require 'bio'
require  'snp_db_models'
#establish_connection

def guess_sequence_format(reference_genome)
  file_extension = File.extname(reference_genome).downcase
  file_format = nil
  case file_extension
  when ".gbk", ".genbank", ".gb"
    file_format = :genbank
  when ".embl", ".emb"
    file_format = :embl
  end
  return file_format
end

# A method to populate the strain names in the Strain table.  strain_names is an array of strain names.
def populate_strains(strain_names)
	strain_names.each do |strain|
	s = Strain.new
	s.name = strain
	s.save
	end
end

# A method to populate the database with the features (genes etc) and the annotations from the embl file.  
# We include all features that are not 'source' or 'gene' as they are repetitive info.  'CDS' is the gene.
# The annotation table includes also the start and end coordinates of the CDS.  The strand is also included.  the 'locations' method is defined in bioruby under genbank.  It must be required at the top (bio).
# Also, the qualifier and value are extracted from the embl file and added to the database.
def populate_features_and_annotations(sequence_file)
	sequence_file.features.each do |feature|
		unless feature.feature == "source" || feature.feature == "gene"
			db_feature = Feature.new
			db_feature.start = feature.locations.first.from
			db_feature.end = feature.locations.first.to
			db_feature.strand = feature.locations.first.strand
			db_feature.name = feature.feature
			db_feature.save
			puts "populated #{db_feature.name}, start: #{db_feature.start}, end: #{db_feature.end}, strand: #{db_feature.strand} for feature: #{db_feature.id}"
			# Populate the Annotation table with qualifier information from the genbank file
			feature.qualifiers.each do |qualifier|
				a = Annotation.new
				a.qualifier = qualifier.qualifier
				a.value = qualifier.value
				a.save
				db_feature.annotations << a
				puts "populated #{a.qualifier} for feature: #{db_feature.id}"
			end
		end
	end
end


#This method populates the rest of the information, i.e. SNP information, Alleles and Genotypes.
# It requires the strain_names as array and the output (vcf file) from mpileup-snp identification algorithm.

def populate_snps_alleles_genotypes(strain_names, vcf_file, cuttoff_snp, cuttoff_genotype)
	strains = Array.new
 	strain_names.each do |strain_name|
  		strain = Strain.find_by_name(strain_name) # equivalent to Strain.find.where("strains.name=?", strain_name).first
  		strains << strain
	end

# open vcf file and parse each line
	File.open(vcf_file) do |f|
	  # header names
	  header = f.gets
	  header2 = f.gets.chomp
	  column_headings = header2.split("\t")
	  sample_names = column_headings[9..-1]
	 
	  good_snps = 0
	  # start parsing snps
		while line = f.gets
		    details = line.split("\t")
		    ref = details[0]
		    ref_pos = details[1]
		    ref_base = details[3]
		    snp_base = details[4]
		    snp_qual = details [5]
		    samples = details[9..-1]
		     
		    genotypes = samples.map do |s| 
		      pl, gt, gq = s.chomp.split(":")
		      gt
		    end

		    genotypes_qualities = samples.map do |s| 
		      pl, gt, gq = s.chomp.split(":")
		      gq
		    end

		    high_quality_variant_genotypes = Array.new # this will be filled with the indicies of genotypes that are "1/1" and have a quality >= 30
		    variant_genotypes = Array.new
		    genotypes.each_with_index do |gt, index|
		    	if gt == "1/1"
			        variant_genotypes << index
			        if genotypes_qualities[index].to_i >= cuttoff_genotype
			          high_quality_variant_genotypes << index
			        end
		    	end
			end
		    
			if snp_qual.to_i >= cuttoff_snp && genotypes.include?("1/1") &&  ! high_quality_variant_genotypes.empty? && high_quality_variant_genotypes.size == variant_genotypes.size # first condition checks the overall quality of the SNP is >=90, second checks that at least one genome has the 'homozygous' 1/1 variant type with quality >= 30 and informative SNP
		    	if  genotypes.include?("0/0") && !genotypes.include?("0/1") # exclude SNPs which are all 1/1 i.e something strange about ref and those which have confusing heterozygote 0/1s
			        good_snps +=1
			        # puts good_snps
			        #create snp
			        s = Snp.new
			        s.ref_pos = ref_pos
			        s.save
			        puts "Adding Reference SNP position: #{ref_pos}"

			        # create ref allele
			        ref_allele = Allele.new
			        ref_allele.base = ref_base
			        ref_allele.snp = s
			        ref_allele.save

			        puts "Adding Reference SNP base: #{ref_base}"

			        s.reference_allele = ref_allele
			        s.save

			        # create snp allele
			        snp_allele = Allele.new
			        snp_allele.base = snp_base
			        snp_allele.snp = s
			        snp_allele.save

			        puts "Adding SNP base: #{snp_base}"

			        

			        genotypes.each_with_index do |gt, index|
			          genotype = Genotype.new
			          genotype.strain = strains[index]
			          puts index if strains[index].nil?
			          # print "#{gt}(#{genotypes_qualities[index]}) "
			          if gt == "0/0" # wild type
			            genotype.allele = ref_allele
			          elsif gt == "1/1" # snp type
			            genotype.allele = snp_allele
			          else
			            puts "Strange SNP #{gt}"
			          end
			          genotype.save
			        end
		      	end
		    end

		end
	end
	#Here we link the features to snps.
	Snp.all.each do |snp|
		x = Feature.where("features.start <= ? AND features.end >= ?", snp.ref_pos, snp.ref_pos).first
		snp.feature = x
		snp.save
	end
end