#!/usr/bin/env ruby
require 'optparse'
require 'bio'
require 'csv'
require 'bio-blastxmlparser'
require 'fileutils'
require 'tmpdir'


options = {}
options[:identity] = 50
options[:min_bases] = 200
options[:split_token] = "-"
options[:tmp_folder]  = Dir.mktmpdir
options[:program]  = "blastn"
options[:random_sample] = 0

OptionParser.new do |opts|
  
  opts.banner = "Usage: mafft_triads.rb [options]"

  opts.on("-i", "--identity FLOAT", "Minimum percentage identity") do |o|
    options[:identity] = o.to_f
  end

  opts.on("-t", "--triads FILE", "CSV file with the gene triad names in the named columns 'A','B' and 'D' ") do |o|
    options[:triads] = o
  end

  opts.on("-f", "--pep FILE" , "FASTA file containing all the possible peptide sequences. ") do |o|
    options[:pep] = o
  end

  opts.on("-s", "--cds FILE" , "FASTA file containing all the possible CDS sequences. ") do |o|
    options[:cds] = o
  end

  opts.on("-s", "--split_token CHAR", "Character used to split the sequence name. The name will be evarything before this token on the name of the sequences") do |o|
    options[:split_token] = o
  end

end.parse!


def peptide_alignment(sequences_to_align) 
  options = ['--maxiterate', '1000', '--localpair', '--quiet']
  mafft = Bio::MAFFT.new( "mafft" , options)
  report = mafft.query_align(sequences_to_align)
  report.alignment
end


split_token = options[:split_token]

pep_seq = Hash.new
pep_seq_count=0
Bio::FlatFile.open(Bio::FastaFormat, options[:pep]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token)[0]  
    pep_seq[gene_name] = entry unless pep_seq[gene_name]
    pep_seq[gene_name] = entry if entry.length > pep_seq[gene_name].length
    pep_seq_count += 1
  end
end
$stderr.puts "#Loaded #{pep_seq.length} genes from #{pep_seq_count} pep_seq"

cds_seq = Hash.new
cds_seq_count=0
Bio::FlatFile.open(Bio::FastaFormat, options[:cds]) do |fasta_file|
  fasta_file.each do |entry|
    gene_name = entry.entry_id.split(split_token)[0]  
    cds_seq[gene_name] = entry unless cds_seq[gene_name]
    cds_seq[gene_name] = entry if entry.length > cds_seq[gene_name].length
    cds_seq_count += 1
  end
end
$stderr.puts "#Loaded #{cds_seq.length} genes from #{cds_seq_count} cds_seq"


$stderr.puts "TMP dir: #{options[:tmp_folder]}"

def write_fasta_from_hash(sequences, filename)
  out = File.new(filename, "w")
  #puts sequences.inspect
  sequences.each_pair do | chromosome, exon_seq | 
    out.puts ">#{chromosome}\n#{exon_seq}\n"
  end
  out.close
end


CSV.foreach(options[:triads], headers:true ) do |row|
   a = row['A']
   b = row['B']
   d = row['D']
   triad = row['group_id']

   to_align = Bio::Alignment::SequenceHash.new 
   to_align[a] = pep_seq[a]
   to_align[b] = pep_seq[b]
   to_align[d] = pep_seq[d]

   cds_seqs = Bio::Alignment::SequenceHash.new 
   cds_seqs[a] = cds_seq[a].to_biosequence
   cds_seqs[b] = cds_seq[b].to_biosequence
   cds_seqs[d] = cds_seq[d].to_biosequence

   cent_triad = triad.to_i / 100
   folder = "alignments/#{cent_triad}/"
   FileUtils.mkdir_p folder
   
   pep_aln = peptide_alignment(to_align)
   
   save_pep = "#{folder}/#{triad}.pep.fa"
   write_fasta_from_hash(pep_aln, save_pep)

   save_cds = "#{folder}/#{triad}.cds.fa"
   write_fasta_from_hash(cds_seqs, save_cds)
  #break
end