#!/usr/bin/env ruby

# 12-2-2011 Noe Fernandez Pozo.
# Full-LengtherNEXT predicts if your sequences are complete, showing you the nucleotide sequences and the translated protein
ROOT_PATH=File.dirname(__FILE__)
$: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next"))

require 'optparse'
require 'fileutils'
require 'socket'
require 'report_html'
require 'scbi_fasta'

###############################################################################################
## METHODS
###############################################################################################
# Reads a fasta file and returns its records as an array of [name, sequence] pairs.
#
# Records whose sequence is made up only of 'n'/'N' characters are left out and
# reported on STDERR. A zero-length sequence also satisfies that test, so it is
# skipped as well.
#
# input_fasta - path of the fasta file to read.
#
# Returns an Array of [name, seq] pairs in file order.
def read_fasta(input_fasta)
	records = []
	FastaQualFile.new(input_fasta, '').each do |name, seq, _qual|
		only_ns = seq.downcase.count('n') == seq.length
		if only_ns
			STDERR.puts "Sequence #{name} is full of Ns. Skipped"
			next
		end
		records << [name, seq]
	end
	records
end

# Distributes the sequences of a fasta file over n_files output files named
# ref0.fasta .. ref<n_files-1>.fasta inside output_folder.
#
# Sequences are sorted by length and handed out round-robin, each turn taking
# the shortest and the longest remaining sequence, so every file receives a
# similar mix of sizes.
#
# input_fasta   - path of the fasta file to split (read through read_fasta).
# output_folder - existing directory where the refN.fasta files are written.
# n_files       - number of output files; must be >= 1 when there are sequences.
# chunk_size    - every file name is followed by (chunk_size - 1) :data_void
#                 placeholders in the returned list.
#
# Returns [file_names, n_seqs]: the padded list of file names and the number of
# sequences that were read.
# Raises ArgumentError when there are sequences to write but n_files < 1.
def split_fasta(input_fasta, output_folder, n_files, chunk_size)
	fasta = read_fasta(input_fasta)
	n_seqs = fasta.length
	if n_files < 1 && !fasta.empty?
		# Without this guard the write loop fails with NoMethodError on nil
		raise ArgumentError, "split_fasta needs at least one output file (n_files=#{n_files})"
	end
	fasta.sort!{|s1, s2| s1.last.length <=> s2.last.length }
	file_names = []
	files = []
	begin
		n_files.times do |i|
			file_name = "ref#{i}.fasta"
			file_names << file_name
			# This fills file_names with void elements to sync a file name for worker when it does the chunk size loop
			file_names.concat([:data_void] * (chunk_size - 1)) if chunk_size > 1
			files << File.open(File.join(output_folder, file_name), 'w')
		end
		count = 0
		until fasta.empty?
			out = files[count]
			out.puts '>' + fasta.shift.join("\n") # shortest remaining sequence
			out.puts '>' + fasta.pop.join("\n") unless fasta.empty? # longest remaining sequence
			count = (count + 1) % n_files
		end
	ensure
		# Close every handle opened so far, even if opening or writing raised
		files.each(&:close)
	end
	return file_names, n_seqs
end

###############################################################################################
# PARSE OPTIONS
###############################################################################################
# Hash with every run-time setting. Defaults are assigned while the parser is
# being built and overwritten by the switch handlers during parsing.
options = {}

# Make sure the 'logs' directory exists in the current working directory
Dir.mkdir('logs') unless File.exist?('logs')

optparse = OptionParser.new do |opts|

	# Letters selecting the databases/algorithms to use (see help text)
	options[:acess_db] = 'stnp'
	opts.on( '-a', '--acess_db STRING', 'Select that databases is going to be used. s for Swissprot, t for trEMBL and n for ncrna, p for use Transdecoder and c for use testcode algothrim. By default is set to stnp' ) do |acess_db|
		options[:acess_db] = acess_db
	end

	options[:blast] = ''
	opts.on( '-b', '--blast STRING', 'Aditional options to blast execution' ) do |blast|
		options[:blast] = blast
	end

	options[:chunk_size] = 200
	opts.on( '-c', '--chunk_size SIZE', "Number of sequences processed in each block when parallelization is used. Default=200" ) do |s|
		options[:chunk_size] = s.to_i
	end

	options[:est_db] = nil
	opts.on( '-d', '--est_db FILE', "EST database for representative transcriptome\n" ) do |est_db|
		options[:est_db] = est_db
		unless File.exist?(options[:est_db])
			puts "No valid path to EST database"
			Process.exit(-1)
		end
	end

	# Exonerate analysis is on by default; the switch turns it off
	options[:exonerate] = true
	opts.on( '-e', '--exonerate', 'Disables exonerate analysis' ) do
		options[:exonerate] = false
	end

	options[:fasta] = nil
	opts.on( '-f', '--fasta FILE', 'Fasta input file' ) do |file|
		options[:fasta] = file
	end

	options[:tax_group] = nil
	opts.on( '-g', '--taxon_group GROUP', "Taxon group, required to use the best databases:\n#{"\t"*5}fungi\n#{"\t"*5}human\n#{"\t"*5}invertebrates\n#{"\t"*5}mammals\n#{"\t"*5}plants\n#{"\t"*5}rodents\n#{"\t"*5}vertebrates\n\n" ) do |tax_name|
		options[:tax_group] = tax_name
	end

	options[:ident] = 45.00
	opts.on( '-i', '--identity_percent IDENTITY', 'identity percent threshold to consider as reliable the sequence similarity. Default=45.00' ) do |ident|
		options[:ident] = ident.to_f
	end

	options[:high_clustering] = false
	opts.on( '-k', '--high_clustering', 'Only for representative transcriptome. Add a clustering step using pfam ids. Default false' ) do
		options[:high_clustering] = true
	end

	# Stored as a fraction (0..1); the switch takes a percentage
	options[:subject_coverage] = 0.25
	opts.on( '-j', '--subject_coverage_percent FLOAT', "Subject coverage percentage threshold" ) do |j|
		options[:subject_coverage] = j.to_f/100
	end

	options[:min_nucleotides] = 100
	opts.on( '-n', '--min_nucleotides minLONG', "min nucleotides to consider a part of chimera like putative unigene. Default=100\n\n" ) do |min_nucleotides|
		options[:min_nucleotides] = min_nucleotides.to_i
	end

	options[:distance] = 15
	opts.on( '-m', '--max_distance maxDIST', "maximal distance between query and subject gene boundaries to be qualified as putative, the less distance the more strict. Default=15\n\n" ) do |distance|
		options[:distance] = distance.to_i
	end

	options[:port] = 0 #50000
	opts.on( '-p', '--port PORT', "Server port\n\n" ) do |port|
		options[:port] = port.to_i
	end

	# NOTE(review): the help text announces "rcs" as default but the value set here is 'rc' - confirm which one is intended
	options[:chimera] = 'rc'
	opts.on( '-q', '--chimera_detection STRING', "d for deactivate chimera detection mode, s for search chimeras only, r for revise it and c for cut it. Default = rcs \n\n" ) do |chimera|
		options[:chimera] = chimera.downcase
	end

	options[:reptrans] = nil
	opts.on( '-r', '--representative_transcriptome', "Generates a fasta file with the minime transcriptome\n" ) do |reptrans|
		options[:reptrans] = reptrans
	end

	options[:server_ip] = '0.0.0.0'
	opts.on( '-s', '--server IP', 'Server ip. Can use a partial ip to select the apropriate interface' ) do |server_ip|
		# Pick the first local IPv4 address starting with the given (possibly partial) ip;
		# fall back to 0.0.0.0 when none matches
		ip_list = Socket.ip_address_list.select{|e| e.ipv4?}.map{|e| e.ip_address}
		matching_ip = ip_list.find{|address| address.start_with?(server_ip)}
		options[:server_ip] = matching_ip || '0.0.0.0'
	end

	options[:ident_thresold] = 55.0
	opts.on( '-t', '--identity_thresold FLOAT', "For chimeras only. Min identity to consider that two proteins are the same. Default=55.0\n\n" ) do |ident_thresold|
		# The switch is documented as FLOAT, so keep the decimals
		options[:ident_thresold] = ident_thresold.to_f
	end

	options[:user_db] = nil
	opts.on( '-u', '--user_db UserDB', 'User blast+ database' ) do |db|
		options[:user_db] = db
		# The database is considered present when its .psq file exists
		unless File.exist?(File.expand_path(db+'.psq'))
			puts "user database: #{options[:user_db]} was not found"
			exit
		end
	end

	options[:verbose] = 0
	opts.on( '-v', '--verbose INTEGER', 'Show extra info' ) do |verbose|
		options[:verbose] = verbose.to_i
	end

	# Either an Integer (number of CPUs) or an Array of machine names
	options[:workers] = 2
	opts.on( '-w', '--workers INTEGER/FILE', 'Number of CPUs, or a file containing machine names to launch workers with ssh' ) do |workers|
		if File.exist?(workers)
			# use workers file, one machine name per line
			options[:workers] = File.read(workers).split("\n").map{|w| w.chomp}
			# NOTE(review): the first entry of the file is discarded - presumably it is the machine running the server; confirm
			options[:workers].shift
		elsif workers.to_i > 0
			options[:workers] = workers.to_i
		else
			options[:workers] = 2
		end
	end

	options[:training_ident] = 45.00
	opts.on( '-x', '--training_identity_percent IDENTITY', 'identity percent threshold to use a complete sure sequence for Transdecoder training. Default=45.00' ) do |training_ident|
		options[:training_ident] = training_ident.to_f
	end

	options[:hdd] = false
	opts.on( '-z', '--hdd', 'Write/use blast report on HDD' ) do
		options[:hdd] = true
	end

	# Array of arrays: groups are separated by ';' and the files inside a group by ','
	options[:files2map] = []
	opts.on('-M', '--files2map STRING', 'Fastq files to map against analysed transcriptome This must be a comma separated string with the full paths to the files' ) do |files2map|
		options[:files2map] = files2map.split(';').map{|map_files| map_files.split(',')}
	end

	# Filtering of unmapped sequences is on by default; the switch turns it off
	options[:remove_unmapped] = true
	opts.on('-R', '--remove_unmapped', 'When fastq files are provided, all sequences without at least a read pair are removed. When this option is enabled this filtering is disabled' ) do
		options[:remove_unmapped] = false
	end

	# Set a banner, displayed at the top of the help screen.
	opts.banner = "\nUsage: full_lengther_next -f input.fasta -g [fungi|human|invertebrates|mammals|plants|rodents|vertebrates] [options]\n\n"

	# This displays the help screen
	opts.on( '-h', '--help', 'Display this screen' ) do
		puts opts
		exit
	end

end

# Run the parser over ARGV, removing the parsed switches from it
optparse.parse!

# The input fasta (-f) and the taxon group (-g) are both mandatory:
# print the help and stop when either one was not given
missing_required = [:fasta, :tax_group].any? { |key| options[key].nil? }
if missing_required
	puts "incorrect number of arguments, you need a fasta file and a taxonomical group:\n\n\t"
	puts optparse.help
	exit
end

###################################################################################################
# PREPARE ENVIRONMENT
###################################################################################################
# Init file handed later to the mapreduce manager: taken from
# $FULL_LENGTHER_NEXT_INIT when that path exists, otherwise 'init_env'
# next to this script.
if ENV['FULL_LENGTHER_NEXT_INIT'] && File.exist?(ENV['FULL_LENGTHER_NEXT_INIT'])
	FULL_LENGTHER_NEXT_INIT=File.expand_path(ENV['FULL_LENGTHER_NEXT_INIT'])
else
	FULL_LENGTHER_NEXT_INIT=File.join(ROOT_PATH,'init_env')
end

# Temporary folder: 'temp' under $FLN_TEMP when set, else under the current directory
temp_base = ENV['FLN_TEMP'] || Dir.pwd
options[:temp] = File.join(temp_base, 'temp')

# mkdir_p also creates missing parent directories (e.g. a not yet existing $FLN_TEMP)
FileUtils.mkdir_p(options[:temp]) unless File.exist?(options[:temp])

# Blast databases: $BLASTDB when it points to an existing path,
# otherwise the 'blast_dbs' folder next to this script
if ENV['BLASTDB'] && File.exist?(ENV['BLASTDB'])
	formatted_db_path = ENV['BLASTDB']
else # otherwise use ROOTPATH + DB
	formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
end

ENV['BLASTDB']=formatted_db_path
puts "Using databases at: #{ENV['BLASTDB']}"

# NOTE(review): the ncRNA database is only demanded when acess_db contains 'c',
# while the -a help text assigns 'n' to ncrna and 'c' to testcode - confirm the intended letter
ncrna_path = File.join(ENV['BLASTDB'],'nc_rna_db','ncrna.nhr')
if !File.exist?(ncrna_path) && options[:acess_db].include?('c')
	puts "DB File #{ncrna_path} doesn't exists"
	puts optparse.help
	exit
end

# Swissprot ('s') and trEMBL ('t') both require the sp_<taxon group> database to be installed
if options[:acess_db].include?('s') || options[:acess_db].include?('t')
	sp_path=File.join(ENV['BLASTDB'],"sp_#{options[:tax_group]}","sp_#{options[:tax_group]}.psq")
	unless File.exist?(sp_path)
		puts "DB File #{sp_path} doesn't exists, or"
		puts "incorrect taxon group name: #{options[:tax_group]} choose:"
		puts optparse.help
		exit
	end
end

##################################################################################################
# MAIN  
###################################################################################################

require 'logger' # Logger is used below; do not depend on another library loading it
require 'scbi_mapreduce'
require 'my_worker_manager_fln' #First server
require 'reptrans'

require 'go_methods'
require "benchmark"

#Benchmark.bm do |x|
#	x.report('main'){
options[:ref_files] = []
unless options[:files2map].empty? # Mapping
	#$VERBOSE = true
	# The split reference files are written to <temp>/map
	temp = File.join(options[:temp], 'map')
	options[:temp_map_folder] = temp
	FileUtils.mkdir(temp) unless Dir.exist?(temp)
	# options[:workers] is either a CPU count or a list of machine names
	map_workers = options[:workers].is_a?(Array) ? options[:workers].length : options[:workers]
	# One reference file less than the number of workers
	options[:ref_files], options[:n_refs] = split_fasta(options[:fasta], temp, map_workers-1, options[:chunk_size])
end

$LOG = Logger.new(STDOUT)
$LOG.datetime_format = "%Y-%m-%d %H:%M:%S"

main_path = File.dirname(ROOT_PATH)
custom_worker_file = File.join(main_path, 'lib','full_lengther_next','my_worker.rb')

$LOG.info 'Starting server'
# initialize work manager (open files, etc)
MyWorkerManagerFln.init_work_manager(options)

# Create server
server = ScbiMapreduce::Manager.new(options[:server_ip], options[:port], options[:workers], MyWorkerManagerFln, custom_worker_file, STDOUT, FULL_LENGTHER_NEXT_INIT)
server.chunk_size = options[:chunk_size]

# launch server
server.start_server
$LOG.info 'Closing server'

# Collect the results gathered by the work manager
seqs_annotation_prot, seqs_some_coding, seqs_unknown = MyWorkerManagerFln.get_annotations()

# Representative transcriptome, only when -r was given
unless options[:reptrans].nil?
	reptrans(seqs_annotation_prot, seqs_some_coding, seqs_unknown, options)
end
puts "\nGracias por utilizar Full-LengtherNEXT"


#} #Bench
#end #Bench