#!/usr/bin/env ruby
require 'bio'
require 'rubygems'
require 'pathname'
require 'bio-samtools'
require 'optparse'
require 'set'
$: << File.expand_path(File.dirname(__FILE__) + '/../lib')
$: << File.expand_path('.')
path= File.expand_path(File.dirname(__FILE__) + '/../lib/bioruby-polyploid-tools.rb')
require path

def log(msg)
  time=Time.now.strftime("%Y-%m-%d %H:%M:%S.%L")
  puts "#{time}: #{msg}"
end


class Bio::PolyploidTools::ExonContainer
   def add_alignments(opts=Hash.new) 
      opts = { :min_identity=>90 }.merge!(opts)
      exonerate_filename = opts[:exonerate_file]
      arm_selection = opts[:arm_selection]

      unless arm_selection
        arm_selection = lambda do | contig_name |
          ret = contig_name[0,3]       
          return ret
        end
      end

      File.open(exonerate_filename) do |f|
        f.each_line do | line |
          record = Bio::DB::Exonerate::Alignment.parse_custom(line)
          if  record and record.identity >= opts[:min_identity]
            snp_array = @snp_map[record.query_id]
            if snp_array != nil
              snp_array.each do |snp|                            
                if snp != nil and snp.position.between?( (record.query_start + 1) , record.query_end)
                  begin
                    exon = record.exon_on_gene_position(snp.position)
                    snp.add_exon(exon, arm_selection.call(record.target_id))
                  rescue Bio::DB::Exonerate::ExonerateException
                    $stderr.puts "Failed for the range #{record.query_start}-#{record.query_end} for position #{snp.position}"
                  end
                end
              end
            end
          end
        end
      end
    end
end

class Bio::DB::Primer3::SNP

  def to_s
     "#{gene}:#{snp_from.chromosome}"
  end

end
class Bio::DB::Primer3::Primer3Record


  def best_pair
    return @best_pair if @best_pair
    @best_pair = nil
    @total_caps = 100
    @primerPairs.each do | primer |
      capital_count = "#{primer.left.sequence}#{primer.right.sequence}".scan(/[A-Z]/).length
      if @best_pair.nil?
        @best_pair = primer 
        @total_caps = capital_count
        next
      end
      if capital_count < @total_caps
        @best_pair = primer 
        @total_caps = capital_count
      end
      if primer.size < @best_pair.size 
        @best_pair = primer 
        @total_caps = capital_count
      end
    end
    #@best_pair = @primerPairs.min
    @best_pair
  end

#CL3339Contig1:T509C AvocetS chromosome_specific exon 4D forward 
  def parse_header
    @snp, @line, @type, @in, @polymorphism, @chromosome, @orientation   = self.sequence_id.split(" ")  
    @type = @type.to_sym
    if @in
      @in = @in.to_sym == :exon 
    else
      @exon = false
    end

    if @polymorphism.to_sym == :homoeologous
      @homoeologous = true
    else
      @homoeologous = false
    end
    @parsed = true
    @orientation = @orientation.to_sym
  end

  def score
    best_pair
#    puts "score"
 #   puts self.inspect
    ret = 0
    ret += @scores[type]
    ret += @scores[:exon] if exon?
    ret -= @total_caps * 10  
    ret -= product_length
    ret
  end

  def to_s
      "#{gene}:#{snp_from.chromosome}"
  end

   def left_primer_snp(snp)
      tmp_primer = String.new(left_primer)
      #if self.orientation == :forward
      #  base_original = snp.original 
      #  base_snp = snp.snp
      #elsif self.orientation == :reverse
      #  base_original = reverse_complement_string(snp.original )
      #  base_snp = reverse_complement_string(snp.snp)
      #else
      #  raise Primer3Exception.new "#{self.orientation} is not a valid orientation"
      #end

      # puts "#{snp.to_s} #{self.orientation} #{tmp_primer[-1] } #{base_original} #{base_snp}"
      #if tmp_primer[-1] == base_original
      #  tmp_primer[-1] = base_snp
      #elsif tmp_primer[-1] == base_snp
      #  tmp_primer[-1] = base_original  
      #else
      #  raise Primer3Exception.new "#{tmp_primer} doesnt end in a base in the SNP #{snp.to_s}"
      #end
      #puts "tmp_primer: #{tmp_primer}"
      return tmp_primer
    end

end

arm_selection_functions = Hash.new;


arm_selection_functions[:arm_selection_first_two] = lambda do | contig_name |
  ret = contig_name[0,2]       
  return ret
end

#Function to parse stuff like: "IWGSC_CSS_1AL_scaff_110"
#Or the first two characters in the contig name, to deal with 
#pseudomolecules that start with headers like: "1A"
#And with the cases when 3B is named with the prefix: v443
arm_selection_functions[:arm_selection_embl] = lambda do | contig_name|
  
  arr = contig_name.split('_')
  ret = "U"
  ret = arr[2][0,2] if arr.size >= 3
  ret = "3B" if arr.size == 2 and arr[0] == "v443"
  ret = arr[0][0,2] if arr.size == 1   
  return ret
end

arm_selection_functions[:arm_selection_morex] = lambda do | contig_name |
  ret = contig_name.split(':')[0].split("_")[1];       
  return ret
end

arm_selection_functions[:scaffold] = lambda do | contig_name |
  ret = contig_name;       
  return ret
end

markers = nil

options = {}
options[:model] = "est2genome"
options[:min_identity] = 90
options[:extract_found_contigs] = false
options[:arm_selection] = arm_selection_functions[:arm_selection_embl] ;
options[:genomes_count] = 3


options[:primer_3_preferences] = {
      :primer_product_size_range => "50-150" ,
      :primer_max_size => 25 , 
      :primer_lib_ambiguity_codes_consensus => 1,
      :primer_liberal_base => 1, 
      :primer_num_return=>5,
      :primer_explain_flag => 1,
      :primer_thermodynamic_parameters_path=>File.expand_path(File.dirname(__FILE__) + '../../conf/primer3_config/') + '/'
  }


OptionParser.new do |opts|
  
  opts.banner = "Usage: find_homoeologue_variations.rb [options]"

  opts.on("-c", "--sequences FASTA", "Sequence of the region to searc") do |o|
    options[:sequences] = o
  end
  opts.on("-r", "--reference FASTA", "reference with the contigs") do |o|
    options[:reference] = o
  end
  opts.on("-o", "--output DIR", "Directory to write the output") do |o|
    options[:output] = o
  end

  opts.on("-g", "--genomes_count INT", "Number of genomes (default 3, for hexaploid)") do |o|
    options[:genomes_count] = o.to_i
  end

  opts.on("-x", "--extract_found_contigs", "If present, save in a separate file the contigs with matches. Useful to debug.") do |o|
    options[:extract_found_contigs] = true
  end
  
end.parse!
#reference="/Users/ramirezr/Documents/TGAC/references/Triticum_aestivum.IWGSP1.21.dna_rm.genome.fa"
reference = options[:reference] if options[:reference]
throw raise Exception.new(), "Reference has to be provided" unless reference
sequences = options[:sequences] if options[:sequences]
throw raise Exception.new(), "Fasta file with sequences has to be provided" unless sequences
output_folder = options[:output] if options[:output]
throw raise Exception.new(), "An output directory has to be provided" unless output_folder
model=options[:model] 
Dir.mkdir(output_folder)
min_identity= options[:min_identity]

exonerate_file="#{output_folder}/exonerate_tmp.tab"
temp_contigs="#{output_folder}/contigs_tmp.fa"
primer_3_input="#{output_folder}/primer_3_input_temp"
primer_3_output="#{output_folder}/primer_3_output_temp"
exons_filename="#{output_folder}/exons_genes_and_contigs.fa"
output_primers="#{output_folder}/primers.csv"
output_to_order="#{output_folder}/primers_to_order.csv"

fasta_file = Bio::DB::Fasta::FastaFile.new({:fasta=>reference})
fasta_file.load_fai_entries

original_name="A"
snp_in="B"

 arm_selection = options[:arm_selection]

unless arm_selection
   arm_selection = lambda do | contig_name |
      ret = contig_name[0,3]       
      return ret
    end
end
begin
log "Reading exons"
exons = Array.new
Bio::FlatFile.auto(sequences) do |ff|
  ff.each do |entry|
    fields = Array.new
    fields << entry.definition
    fields << arm_selection.call(entry.definition)
    fields << entry.seq
    
    line = fields.join(",")
    snp =  Bio::PolyploidTools::NoSNPSequence.parse(line)
    snp.genomes_count = options[:genomes_count]
    exons << snp
     
  end
end



log "Searching markers in genome"
found_contigs = Set.new
exo_f = File.open(exonerate_file, "w")
contigs_f = File.open(temp_contigs, "w") if options[:extract_found_contigs]
Bio::DB::Exonerate.align({:query=>sequences, :target=>reference, :model=>model}) do |aln|
	if aln.identity > min_identity
    exo_f.puts aln.line
    unless found_contigs.include?(aln.target_id) #We only add once each contig. Should reduce the size of the output file. 
      found_contigs.add(aln.target_id)
      entry = fasta_file.index.region_for_entry(aln.target_id)
      raise ExonerateException.new,  "Entry not found! #{aln.target_id}. Make sure that the #{target_id}.fai was generated properly." if entry == nil
      region = entry.get_full_region
      seq = fasta_file.fetch_sequence(region)
      contigs_f.puts(">#{aln.target_id}\n#{seq}") if options[:extract_found_contigs]
    end
  end  
end
exo_f.close() 
contigs_f.close() if options[:extract_found_contigs]



log "Reading best alignment on each chromosome"

container= Bio::PolyploidTools::ExonContainer.new
container.flanking_size=options[:flanking_size] 
container.gene_models(sequences)
container.chromosomes(temp_contigs)
container.add_parental({:name=>"A"})
container.add_parental({:name=>"B"})
exons.each do |exon|
  exon.container = container
  exon.flanking_size = 50
  exon.variation_free_region = options[:variation_free_region]
#  puts exon.inspect
  container.add_snp(exon)

end
container.add_alignments({:exonerate_file=>exonerate_file, :arm_selection=>options[:arm_selection] , :min_identity=>min_identity})

#4.1 generating primer3 file
log "Running primer3"
file = File.open(exons_filename, "w")
container.print_fasta_snp_exones(file)
file.close

file = File.open(primer_3_input, "w")

Bio::DB::Primer3.prepare_input_file(file, options[:primer_3_preferences])
added_exons = container.print_primer_3_exons(file, nil, snp_in)
file.close

Bio::DB::Primer3.run({:in=>primer_3_input, :out=>primer_3_output}) if added_exons > 0

#5. Pick the best primer and make the primer3 output
log "Selecting best primers"
kasp_container=Bio::DB::Primer3::KASPContainer.new
kasp_container.line_1= original_name
kasp_container.line_2= snp_in

if options[:scoring] == :het_dels
  kasp_container.scores = Hash.new
  kasp_container.scores[:chromosome_specific] = 0
  kasp_container.scores[:chromosome_semispecific] = 1000
  kasp_container.scores[:chromosome_nonspecific] = 100    
end

exons.each do |snp|
  snpk = kasp_container.add_snp(snp) 
end

kasp_container.add_primers_file(primer_3_output) if added_exons > 0
header = "Marker,SNP,RegionSize,chromosome,total_contigs,contig_regions,SNP_type,#{original_name},#{snp_in},common,primer_type,orientation,#{original_name}_TM,#{snp_in}_TM,common_TM,selected_from,product_size,errors"
File.open(output_primers, 'w') { |f| f.write("#{header}\n#{kasp_container.print_primers}") }

kasp_container.snp_hash.each_pair do |name, kaspSNP|  
  #puts kaspSNP.snp_from.surrounding_exon_sequences.inspect
  #puts kaspSNP.first_product
  #puts kaspSNP.realigned_primers

  out_fasta_products = "#{output_folder}/#{name}.fa"
  File.open(out_fasta_products, 'w') { |f| f.write(kaspSNP.realigned_primers_fasta) }


end

File.open(output_to_order, "w") { |io|  io.write(kasp_container.print_primers_with_tails()) }

log "DONE"
rescue StandardError => e
  log "ERROR\t#{e.message}"
  $stderr.puts e.backtrace
  raise e 
rescue Exception => e
  log "ERROR\t#{e.message}"
  $stderr.puts e.backtrace
  raise e  
end
#puts container.inspect 

#container.snp_map.each do | gene, snp_array|
#  snp_array.each do |e|
 #   puts e.inspect
#    puts e.aligned_sequences_fasta
#  end
#end