#!/usr/bin/env ruby
require 'optparse'
require 'bio'
require 'csv'
require 'bio-blastxmlparser'
require 'fileutils'
require 'tmpdir'


options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:tmp_folder]  = Dir.mktmpdir
options[:program]  = "blastn"
options[:random_sample] = 0
options[:cut_promoter_length] = 0
options[:reverse] = true

OptionParser.new do |opts|
  
  opts.banner = "Usage: filter_blat.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end
  opts.on("-c", "--min_bases int", "Minimum alignment length (default 200)") do |o|
    options[:min_bases] = o.to_i
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--sequences FILE" , "FASTA file containing all the possible sequences. ") do |o|
    options[:fasta] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

  opts.on("-p", "--program blastn|blastp", "The program to use in the alignments. Currntly only supported blastn and blastp") do |o|
    options[:program] = o
  end

  opts.on("-r", "--random_sample INT", "Number of blast to run and keep. If set, only the number of subsets will be run") do |o|
    options[:random_sample] = o.to_i
  end

  opts.on("-l", "--cut_promoter_length INT", "Bases to consider") do |o|
    options[:cut_promoter_length] = o.to_i
  end

  opts.on("-v", "--reverse T|F", "Reverse the input bases") do |o|
    if o == 'T'
      options[:reverse] = true
    elsif o == 'F'
      options[:reverse] = false
    else
      $stderr.puts "Invalid option for reverse (should be T or F)"
      exit -1
    end
  end
end.parse!


def blast_pair_fast(path_a, path_b, out_path, program: "blastn")
  cmd = "#{program} -query #{path_a} -subject #{path_b} -task #{program} -out #{out_path} -outfmt '5' "
  #puts cmd
  executed = system cmd
  result = []
  blast_version = nil
  n = Bio::BlastXMLParser::XmlIterator.new(out_path).to_enum
  longest = nil
  max_length = 0
  max_pident = 0.0
  n.each do | iter |
    iter.each do | hit |
      hit.each do | hsp |
        if hsp.align_len > max_length
          max_length = hsp.align_len
          max_pident = 100 * hsp.identity.to_f / hsp.align_len.to_f
        end
      end
    end
  end
  [max_length, max_pident]
end

valid_pairs_A_B = Hash.new
valid_pairs_A_D = Hash.new
valid_pairs_B_D = Hash.new

split_token = options[:split_token]

sequences = Hash.new
sequence_count=0
Bio::FlatFile.open(Bio::FastaFormat, options[:fasta]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token)[0]  
    seq = entry.naseq
    seq.reverse_complement! if options[:reverse] 
    seq = seq[0,options[:cut_promoter_length]] if options[:cut_promoter_length] > 0
    entry.data = seq
    sequences[gene_name] = entry unless sequences[gene_name]
    sequences[gene_name] = entry if entry.length > sequences[gene_name].length
    sequence_count += 1
  end
end

$stderr.puts "#Loaded #{sequences.length} genes from #{sequence_count} sequences"
#FileUtils.mkdir_p(options[:tmp_folder])
$stderr.puts "TMP dir: #{options[:tmp_folder]}"

a_tmp   = options[:tmp_folder] + "/A.fa"
b_tmp   = options[:tmp_folder] + "/B.fa"
d_tmp   = options[:tmp_folder] + "/D.fa"
out_tmp = options[:tmp_folder] + "/out.blast"


puts [
  "group_id" , "query"      , "subject" , 
  "chr_query", "chr_subject", "aln_type",
  "length"   , "pident" , "Ns_query", "Ns_subject", "Ns_total"   ].join("\t")

count_lines = File.foreach(options[:triads]).inject(0) {|c, line| c+1}

probability =  options[:random_sample] / count_lines.to_f
probability = 1 if options[:random_sample] == 0
prng = Random.new
#puts probability
prom_len = options[:cut_promoter_length]
CSV.foreach(options[:triads], headers:true ) do |row|
   a = row['A']
   b = row['B']
   d = row['D']
   triad = row['group_id'].to_i
   triad_folder = triad/100

   save = probability > prng.rand && probability < 1
   run  = probability == 1 || save 
   next unless run

   seq_a = sequences[a]
   seq_b = sequences[b]
   seq_d = sequences[d]
   File.open(a_tmp, 'w') {|f| f.write(seq_a) } if seq_a
   File.open(b_tmp, 'w') {|f| f.write(seq_b) } if seq_b
   File.open(d_tmp, 'w') {|f| f.write(seq_d) } if seq_d

   ns_a = seq_a.seq.count('Nn') if seq_a
   ns_b = seq_b.seq.count('Nn') if seq_b
   ns_d = seq_d.seq.count('Nn') if seq_d

   save_folder = "blast_alignments_#{prom_len}/#{triad_folder}/#{triad}"
   
   #if save
    FileUtils.mkdir_p save_folder
    FileUtils.cp(a_tmp, save_folder) if seq_a
    FileUtils.cp(b_tmp, save_folder) if seq_b
    FileUtils.cp(d_tmp, save_folder) if seq_d
   #end

   if seq_a and seq_b
      to_print = [triad, a, b , "A","B","A->B"]
      to_print << blast_pair_fast(a_tmp, b_tmp, out_tmp, program:options[:program])
      to_print << ns_a 
      to_print << ns_b
      to_print << ns_a + ns_b
      FileUtils.cp(out_tmp, "#{save_folder}/A_B.xml") #if save
      puts to_print.join("\t")
   end
  if seq_a and seq_d
      to_print = [triad, a, b , "A","D","A->D"]
      to_print << blast_pair_fast(a_tmp, d_tmp, out_tmp, program:options[:program]) 
      to_print << ns_a 
      to_print << ns_d
      to_print << ns_a + ns_d
      FileUtils.cp(out_tmp, "#{save_folder}/A_D.xml") #if save
      puts to_print.join("\t")
  end
  if seq_b and seq_d
      to_print = [triad, a, b , "B","D","B->D"]
      to_print << blast_pair_fast(b_tmp, d_tmp, out_tmp, program:options[:program])
      to_print << ns_b
      to_print << ns_d
      to_print << ns_b + ns_d
      FileUtils.cp(out_tmp, "#{save_folder}/B_D.xml") #if save
      puts to_print.join("\t")
  end
end