#!/usr/bin/env ruby
 
# 15-2-2011 Noe Fernandez-Pozo
# Script to download Full-LengtherNext databases.
# Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL.

ROOT_PATH=File.dirname(__FILE__)
$: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/"))

require 'net/ftp'
require 'open-uri'
require 'optparse'
require 'shellwords'

require 'bio'
require 'scbi_zcat'

require 'cdhit'
require 'handle_db'

############################################################################################## 
## METHODS
#############################################################################################
# Fetches the RNAcentral ncRNA FASTA archive (unless +no_download+ is set), filters it,
# clusters the survivors with cd-hit-est and builds a nucleotide BLAST database.
#
# formatted_db_path - folder that holds ncrna.gz and receives the nc_rna_db subfolder.
# no_download       - when true, skip the download and reuse an ncrna.gz already on disk.
#
# Nothing is filtered or built when ncrna.gz does not exist. Always returns nil.
def download_ncrna(formatted_db_path, no_download)
	ncrna_zip = File.join(formatted_db_path, 'ncrna.gz')
	db_path = File.join(formatted_db_path, 'nc_rna_db')
	db_files = File.join(db_path, 'ncrna')
	fasta = File.join(db_path, 'filtered.fasta')

	unless no_download
		puts "Downloading ncRNA database"
		File.open(ncrna_zip, 'wb') do |my_file|
			# Copy the download to disk in chunks instead of holding the whole archive in a String.
			URI.open('ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_active.fasta.gz') do |remote|
				IO.copy_stream(remote, my_file)
			end
		end
		puts "\nncRNA database downloaded"
	end

	return unless File.exist?(ncrna_zip)

	puts "\nFiltering ncRNA database"
	Dir.mkdir(db_path) unless File.exist?(db_path)
	black_list = [' 16S ', 'rRNA', 'ribosomal', 'tRNA', 'rrn'] # rrn = ribosomal RNA
	# 40 is the shortest sequence kept (filtering_seqs treats its second argument as a minimum).
	filtered_fasta = filtering_seqs(ncrna_zip, 40, black_list)
	File.open(fasta, 'w') { |output_file| output_file.puts filtered_fasta }
	puts "\nncRNA database filtered"

	puts "\nncRNA database removing redundance with cdhit and creating BlastDb"
	# cd-hit-est writes its clustered FASTA to /dev/stderr; the redirections send that stream
	# into the pipe (and the normal log to /dev/null) so makeblastdb reads it from stdin.
	# Paths are shell-escaped so folders containing spaces keep working.
	cmd = "cd-hit-est -i #{Shellwords.escape(fasta)} -o /dev/stderr -c 0.95 -n 11 -M 0 2>&1 >/dev/null | " \
		"makeblastdb -in - -out #{Shellwords.escape(db_files)} -title #{Shellwords.escape(File.basename(db_files))} -dbtype 'nucl' -parse_seqids"
	system(cmd)
	puts "\nncRNA database completed"
end

# Reads a FASTA file through ScbiZcatFile and returns, as one String, the records that are
# long enough and whose header contains none of the black-listed words.
#
# fasta_file - path handed to ScbiZcatFile.new.
# max_length - despite its name this is the MINIMUM sequence length: records shorter than
#              this value are dropped.
# black_list - substrings; a record is dropped when its header line contains any of them
#              (checked with compare_list).
#
# Returns the kept records as "header\nsequence\n" chunks, each sequence joined onto one line.
def filtering_seqs(fasta_file, max_length, black_list)
	fasta = ScbiZcatFile.new(fasta_file)
	filtered_fasta = +''
	seq_name = nil
	seq = +''

	# Appends the record collected so far when it passes both filters. The nil check covers
	# input without any header line, where there is no record to test or emit.
	flush_record = lambda do
		next if seq_name.nil? || seq.length < max_length || compare_list(seq_name, black_list)
		filtered_fasta << "#{seq_name}\n#{seq}\n"
	end

	until fasta.eof
		line = fasta.readline.chomp
		if line.start_with?('>')
			flush_record.call
			seq_name = line
			seq = +''
		else
			seq << line
		end
	end
	flush_record.call # the last record has no following header to trigger it

	filtered_fasta
end

# Tells whether +string+ contains at least one of the words in +list+.
#
# string - text to inspect.
# list   - substrings to look for.
#
# Returns true on the first match, false when nothing matches or +list+ is empty.
def compare_list(string, list)
	list.any? { |word| string.include?(word) }
end

# Opens an anonymous FTP session on ftp.ebi.ac.uk, downloads the Swiss-Prot and TrEMBL
# files of every requested taxonomic division and finally the Swiss-Prot splice-variant
# FASTA archive.
#
# my_array          - division names, each passed to download_uniprot.
# formatted_db_path - destination folder; created when missing.
#
# The session is closed even when a transfer raises; the error then propagates.
def conecta_uniprot(my_array, formatted_db_path)
	Dir.mkdir(formatted_db_path) unless File.exist?(formatted_db_path)
	varsplic_out = File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta.gz')

	# Stored in a global because download_uniprot reads the session from $ftp.
	$ftp = Net::FTP.new
	begin
		$ftp.connect('ftp.ebi.ac.uk')
		$ftp.login

		puts "connected to UniProt"
		my_array.each do |db_group|
			puts "Downloading #{db_group}"
			download_uniprot(db_group, formatted_db_path)
		end

		# Splice-variant file (the original author left an open question: why is it needed?)
		$ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete")
		$ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", varsplic_out)
	ensure
		$ftp.close
	end

	puts "isoform files downloaded"
end

# Downloads the Swiss-Prot and TrEMBL flat files of one taxonomic division through the
# FTP session stored in $ftp.
#
# uniprot_group     - division name used in the remote file names.
# formatted_db_path - local folder that receives both .dat.gz files.
def download_uniprot(uniprot_group, formatted_db_path)
	$ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions")

	# Swiss-Prot first, then TrEMBL; local files keep the remote names.
	%w[sprot trembl].each do |section|
		remote_name = "uniprot_#{section}_#{uniprot_group}.dat.gz"
		$ftp.getbinaryfile(remote_name, File.join(formatted_db_path, remote_name))
	end

	puts "#{uniprot_group} files downloaded"
end


# Filters one UniProt flat file and, unless only the index was requested, turns the
# resulting sequences into a protein BLAST database.
#
# formatted_db_path - folder holding uniprot_<dbtype>_<db_group>.dat; output goes to
#                     <formatted_db_path>/<prefix>_<db_group>/.
# dbtype            - part of the source file name ('sprot' or 'trembl' in this script).
# db_group          - taxonomic division name.
# isoform_hash      - passed through to filter_incomplete_seqs (may be nil).
# prefix            - short database prefix used to name the output.
# options           - reads :only_index and :cdhit here; also passed to filter_incomplete_seqs.
#
# Does nothing (returns nil) when the source .dat file is missing.
def filter_and_makeDB(formatted_db_path, dbtype, db_group, isoform_hash, prefix, options)
	file_name = "#{prefix}_#{db_group}"
	puts 'Building ' + file_name
	# Build the extension-less BLAST name directly; deriving it by deleting '.fasta' from
	# the full path would also mangle any folder whose name contains '.fasta'.
	blastdb_input = File.join(formatted_db_path, file_name, file_name)
	fasta = "#{blastdb_input}.fasta"
	current_db_source = File.join(formatted_db_path, "uniprot_#{dbtype}_#{db_group}.dat")
	return unless File.exist?(current_db_source)

	seqs = filter_incomplete_seqs(current_db_source, isoform_hash, formatted_db_path, file_name, options)
	return if options[:only_index]

	if options[:cdhit] > 0
		File.open(fasta, 'w') { |output_file| output_file.puts seqs }
		# cd-hit writes the clustered FASTA to /dev/stderr, which the redirections feed into
		# makeblastdb's stdin. Paths are shell-escaped so spaces do not split arguments.
		system("cd-hit -i #{Shellwords.escape(fasta)} -o /dev/stderr -c #{options[:cdhit]} -M 0 -s 0.95 2>&1 >/dev/null | " \
			"makeblastdb -in - -out #{Shellwords.escape(blastdb_input)} -title #{Shellwords.escape(File.basename(blastdb_input))} -dbtype 'prot' -parse_seqids")
	else
		do_makeblastdb(seqs, blastdb_input, 'prot')
	end
end

# Decides whether a UniProt record looks like a full-length protein.
#
# uniprot_record - object answering description, seq and ft (feature table keyed by
#                  feature name).
#
# Returns false when any of these hold, true otherwise:
# * the description carries 'Flags: Fragment';
# * the sequence does not start with 'M';
# * the sequence contains 'XX';
# * a NON_TER feature exists (a sequence end is not the real terminus of the molecule);
# * a NON_CONS feature exists (residues are missing between two reported positions).
def complete?(uniprot_record)
	return false if uniprot_record.description.include?('Flags: Fragment')

	sequence = uniprot_record.seq
	return false unless sequence.start_with?('M')
	return false if sequence.include?('XX')

	features = uniprot_record.ft
	!(features.key?('NON_TER') || features.key?('NON_CONS'))
end

# Appends one UniProt record to the FASTA text in +seqs+ and writes its annotation line
# to +index+.
#
# uniprot_record - object answering accession, description, os, og, seq, oc and dr.
# seqs           - String that receives ">header\nsequence\n" (mutated in place).
# index          - IO-like object; gets one tab-separated line with these columns:
#                  accession, description, taxonomy, organism, sequence, organelle,
#                  GO ids, GO descriptions, KEGG ids, InterPro ids, InterPro descriptions,
#                  EC number, Pfam ids, Pfam descriptions, UniPathway ids.
#                  A missing value is written as '-'.
# isoform_hash   - nil, or a Hash from accession to extra FASTA text appended after the
#                  record.
def fln_record(uniprot_record, seqs, index, isoform_hash)
	# Primary data
	accession_number = uniprot_record.accession
	description_data = uniprot_record.description.split(';')
	description = description_data.first.to_s
		.sub(/RecName: Full=/, 'sp=')
		.sub(/SubName: Full=/, 'tr=')
		.sub(/\{\S*\}/, '') # drop the first {...} tag that contains no whitespace
	organism = uniprot_record.os.first.values.reverse.join(' ')
	organelle = uniprot_record.og.join(' ')
	sequence = uniprot_record.seq.gsub('U', 'X')
	taxonomy = uniprot_record.oc.join(';')

	# The EC number is only looked for in the second ';'-separated description field.
	ec_field = description_data[1]
	ec_number =
		if !ec_field.nil? && ec_field.include?('EC=')
			ec_field.split(' ').first.gsub('=', ':')
		else
			'-'
		end

	# Secondary data (database cross-references)
	xrefs = uniprot_record.dr
	index_record = [
		accession_number,
		description,
		taxonomy,
		organism,
		sequence,
		organelle.empty? ? '-' : organelle,
		fln_xref_column(xrefs['GO'], 0),       # GO ID
		fln_xref_column(xrefs['GO'], 1),       # GO description
		fln_xref_column(xrefs['KEGG'], 0),
		fln_xref_column(xrefs['InterPro'], 0), # InterPro ID
		fln_xref_column(xrefs['InterPro'], 1), # InterPro description
		ec_number,
		fln_xref_column(xrefs['Pfam'], 0),     # Pfam ID
		fln_xref_column(xrefs['Pfam'], 1),     # Pfam description
		fln_xref_column(xrefs['UniPathway'], 0)
	]

	seqs << ">#{[accession_number, description, organism, organelle].join(' ')}\n#{sequence}\n"
	index.puts index_record.join("\t")
	seqs << isoform_hash[accession_number] + "\n" if !isoform_hash.nil? && !isoform_hash[accession_number].nil?
end

# Joins element +field+ of every cross-reference entry with ';', or returns '-' when the
# record has no entries for that database. Uses a non-mutating map so the same entries
# can be read again for another column.
def fln_xref_column(entries, field)
	return '-' if entries.nil?

	entries.map { |entry| entry[field] }.join(';')
end

# Appends one UniProt record to +seqs+ using a UniProt-style FASTA header:
#   >db|accession|entry_id description OS=organism GN=gene PE=level SV=version
#
# uniprot_record - object answering accession, entry_id, os, seq, description, gn,
#                  get('PE') and dt.
# seqs           - String that receives the header and sequence (mutated in place).
#
# db is 'sp' for a 'RecName: Full=' description, 'tr' for 'SubName: Full=', and empty
# otherwise. GN, PE and SV are left empty when the record lacks the matching field.
def ncbi_record(uniprot_record, seqs)
	accession_number = uniprot_record.accession
	id = uniprot_record.entry_id
	organism = uniprot_record.os.first.values.reverse.join(' ')
	sequence = uniprot_record.seq
	description = uniprot_record.description.split(';').first

	gn_field = uniprot_record.gn.first
	gene_name = gn_field.nil? ? nil : gn_field[:name]

	# String#[] with a capture index returns nil on a miss, so one field's match can
	# never leak into the next one the way a stale $1 would.
	prediction_status = uniprot_record.get('PE').to_s[/PE\s+(\d+):/, 1]
	sequence_version = uniprot_record.dt['sequence'].to_s[/sequence version (\d+)/, 1]

	db = nil
	if description.include?('RecName: Full=')
		db = 'sp'
		description = description.sub(/RecName: Full=/, '')
	elsif description.include?('SubName: Full=')
		db = 'tr'
		description = description.sub(/SubName: Full=/, '')
	end

	seqs << ">#{db}|#{accession_number}|#{id} #{description} OS=#{organism} GN=#{gene_name} PE=#{prediction_status} SV=#{sequence_version}\n#{sequence}\n"
end

# Walks every entry of a UniProt flat file and collects FASTA text for the database.
#
# file_name         - flat file handed to Bio::FlatFile.auto.
# isoform_hash      - passed through to fln_record (may be nil).
# formatted_db_path - parent folder of the database.
# db_name           - database name; output lives in <formatted_db_path>/<db_name>/.
# options           - only :all is read here.
#
# With options[:all] every entry is formatted by ncbi_record and no index is written.
# Otherwise only entries accepted by complete? are kept, formatted by fln_record, and
# their annotations are written to <db_name>.index inside the database folder.
#
# Returns the accumulated FASTA text.
def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path, db_name, options)
	puts "filtering sequences from #{file_name}"

	db_folder = File.join(formatted_db_path, db_name)
	Dir.mkdir(db_folder) unless File.exist?(db_folder)

	main_name = File.join(db_folder, db_name)
	index = options[:all] ? nil : File.open(main_name + '.index', 'w')
	seqs = +''
	begin
		Bio::FlatFile.auto(file_name).each_entry do |uniprot_record|
			if options[:all]
				ncbi_record(uniprot_record, seqs)
			elsif complete?(uniprot_record) # full-length records only
				fln_record(uniprot_record, seqs, index, isoform_hash)
			end
		end
	ensure
		# Flush and release the index even when a record fails to parse.
		index.close if index
	end

	seqs
end


##########################################################################################
## OPTIONS
##########################################################################################

# Command-line settings, filled with defaults first and overridden by the switches below.
options = {}

# divs is the default selection (human is only processed when asked for with -u);
# all_divs is the complete set of names that -u accepts.
divs = %w{fungi invertebrates mammals plants rodents vertebrates}
all_divs = %w{human fungi invertebrates mammals plants rodents vertebrates}

optparse = OptionParser.new do |opts|
  options[:uniprot_div] = divs
  opts.on( '-u', '--file String', 'Uniprot DBs to be downloaded. String structure: \'div_name1,div_name2..\'. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates. Default: download all') do |uniprot_div|
    temp_divs = uniprot_div.split(',')
    check_valid_ids = temp_divs - all_divs
    unless check_valid_ids.empty?
      puts 'This uniprot division not exists', check_valid_ids
      # Stop with a failure status: an unknown division is a usage error.
      exit 1
    end
    options[:uniprot_div] = temp_divs
  end

  options[:no_download] = false
  opts.on( '-d', '--no_download', 'Only parse downloaded files without download them again') do
    options[:no_download] = true
  end

  options[:no_ncrna] = false
  opts.on( '-n', '--no_ncrna', 'No use ncrna sequences') do
    options[:no_ncrna] = true
  end

  options[:only_index] = false
  opts.on( '-i', '--only_index', 'Build annotation index only without do blast DB') do
    options[:only_index] = true
  end

  options[:no_trembl] = false
  opts.on( '-t', '--no_trembl', 'No use trembl sequences') do
    options[:no_trembl] = true
  end

  options[:all] = false
  opts.on( '-a', '--all_sequences', 'Generate databases with all sequences') do
    options[:all] = true
  end

  options[:cdhit] = 0
  opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
    options[:cdhit] = cdhit.to_f
  end

  options[:no_uniprot] = false
  opts.on( '-p', '--no_uniprot', 'No use uniprot sequences') do
    options[:no_uniprot] = true
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: #{File.basename(__FILE__)} [options]  \n\n"

  # This displays the help screen
  opts.on( '-h', '--help', 'Display this screen' ) do
    puts opts
    exit
  end
end # End opts

# parse options and remove from ARGV
optparse.parse!


############################################################################################## 
## MAIN
##############################################################################################


# Choose the database folder: $BLASTDB when it names an existing path, otherwise a
# "blast_dbs" folder next to this script.
if ENV['BLASTDB'] && File.exist?(ENV['BLASTDB'])
	formatted_db_path = ENV['BLASTDB']
else
	formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
	# The fallback folder survives between runs; creating it unconditionally would
	# raise Errno::EEXIST on the second run.
	Dir.mkdir(formatted_db_path) unless File.exist?(formatted_db_path)
end

ENV['BLASTDB'] = formatted_db_path
puts "Databases will be downloaded at: #{ENV['BLASTDB']}"
puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n"

download_ncrna(formatted_db_path, options[:no_download]) unless options[:no_ncrna]

unless options[:no_download] || options[:no_uniprot]
	conecta_uniprot(options[:uniprot_div], formatted_db_path)
	# Expand every .gz archive in the folder. The names are resolved relative to the folder
	# (so a path without a trailing slash works) and handed to gunzip as separate
	# arguments, which keeps paths containing spaces intact.
	archives = Dir.glob('*.gz', base: formatted_db_path).map { |name| File.join(formatted_db_path, name) }
	system('gunzip', *archives) unless archives.empty?
end

unless options[:no_uniprot]
	# Splice-variant file (the original author left an open question: why is it needed?)
	isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta"))
	options[:uniprot_div].each do |db_group|
		filter_and_makeDB(formatted_db_path, 'sprot', db_group, isoform_hash, 'sp', options)
		filter_and_makeDB(formatted_db_path, 'trembl', db_group, nil, 'tr', options) unless options[:no_trembl]
	end
end
puts "download_fln_dbs.rb has finished"