#!/usr/bin/env ruby # 15-2-2011 Noe Fernandez-Pozo # Script to download Full-LengtherNext databases. # Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL. ROOT_PATH=File.dirname(__FILE__) $: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next/classes/")) require 'bio' require 'net/ftp' require 'open-uri' require 'scbi_zcat' require 'optparse' require 'cdhit' require 'handle_db' ############################################################################################## ## METHODS ############################################################################################# def download_ncrna(formatted_db_path, no_download) ncrna_zip = File.join(formatted_db_path, 'ncrna.gz') db_path = File.join(formatted_db_path, 'nc_rna_db') db_files = File.join(db_path, 'ncrna') fasta = File.join(db_path, 'filtered.fasta') if !no_download puts "Downloading ncRNA database" open(ncrna_zip, 'wb') do |my_file| my_file.print open('ftp://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_active.fasta.gz').read #my_file.print open('http://www.ncrna.org/frnadb/files/ncrna.zip').read end puts "\nncRNA database downloaded" end if File.exists?(ncrna_zip) puts "\nFiltering ncRNA database" Dir.mkdir(db_path) if !File.exists?(db_path) black_list = [' 16S ', 'rRNA', 'ribosomal', 'tRNA', 'rrn'] #rrn = ribosonal rna filtered_fasta = filtering_seqs(ncrna_zip, 40, black_list) #do_makeblastdb(filtered_fasta, db_files, 'nucl') output_file = File.open(fasta, 'w') output_file.puts filtered_fasta output_file.close puts "\nncRNA database filtered" puts "\nncRNA database removing redundance with cdhit and creating BlastDb" cmd = "cd-hit-est -i #{fasta} -o /dev/stderr -c 0.95 -n 11 -M 0 2>&1 >/dev/null | makeblastdb -in - -out #{db_files} -title #{File.basename(db_files)} -dbtype 'nucl' -parse_seqids" system(cmd) puts "\nncRNA database completed" end end def filtering_seqs(fasta_file, max_length, black_list) fasta = ScbiZcatFile.new(fasta_file) filtered_fasta = '' seq_name = nil seq = '' while !fasta.eof line = fasta.readline.chomp if line[0] == '>' if !seq_name.nil? filtered_fasta << "#{seq_name}\n#{seq}\n" if seq.length >= max_length && !compare_list(seq_name, black_list) end seq_name = line seq = '' else seq << line end end filtered_fasta << "#{seq_name}\n#{seq}\n" if seq.length >= max_length && !compare_list(seq_name, black_list) return filtered_fasta end def compare_list(string, list) res = FALSE list.each do |word| if string.include?(word) res = TRUE break end end return res end def conecta_uniprot(my_array, formatted_db_path) Dir.mkdir(formatted_db_path) if !File.exists?(formatted_db_path) varsplic_out=File.join(formatted_db_path,'uniprot_sprot_varsplic.fasta.gz') $ftp = Net::FTP.new() $ftp.connect('ftp.ebi.ac.uk') $ftp.login puts "connected to UniProt" my_array.each do |db_group| puts "Downloading #{db_group}" download_uniprot(db_group, formatted_db_path) end #archivo de variantes de splicing. POR QUE? $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete") $ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", varsplic_out) $ftp.close puts "isoform files downloaded" end def download_uniprot(uniprot_group, formatted_db_path) sp_out=File.join(formatted_db_path,"uniprot_sprot_#{uniprot_group}.dat.gz") tr_out=File.join(formatted_db_path,"uniprot_trembl_#{uniprot_group}.dat.gz") $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions") $ftp.getbinaryfile("uniprot_sprot_#{uniprot_group}.dat.gz", sp_out) $ftp.getbinaryfile("uniprot_trembl_#{uniprot_group}.dat.gz", tr_out) puts "#{uniprot_group} files downloaded" end def filter_and_makeDB(formatted_db_path, dbtype, db_group, isoform_hash, prefix, options) file_name = prefix +'_' + db_group puts 'Building ' + file_name fasta = File.join(formatted_db_path,"#{file_name}","#{file_name}.fasta") blastdb_input = fasta.gsub('.fasta', '') current_db_source = File.join(formatted_db_path, "uniprot_#{dbtype}_#{db_group}.dat") if File.exists?(current_db_source) seqs = filter_incomplete_seqs(current_db_source, isoform_hash, formatted_db_path, file_name, options) if !options[:only_index] if options[:cdhit] > 0 output_file = File.open(fasta, 'w') output_file.puts seqs output_file.close system("cd-hit -i #{fasta} -o /dev/stderr -c #{options[:cdhit]} -M 0 -s 0.95 2>&1 >/dev/null| makeblastdb -in - -out #{blastdb_input} -title #{File.basename(blastdb_input)} -dbtype 'prot' -parse_seqids") else do_makeblastdb(seqs, blastdb_input, 'prot') end end end end def complete?(uniprot_record) complete = TRUE if uniprot_record.description.include?('Flags: Fragment') || #Discard non full length records uniprot_record.seq[0] != 'M' || uniprot_record.seq.include?('XX') || uniprot_record.ft.keys.include?('NON_TER') ||# The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key uniprot_record.ft.keys.include?('NON_CONS') # Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them complete = FALSE end return complete end def fln_record(uniprot_record, seqs, index, isoform_hash) index_record = [] # Primary data accession_number = uniprot_record.accession description_data = uniprot_record.description.split(';') description = description_data.first description.sub!(/RecName: Full=/,'sp=') description.sub!(/SubName: Full=/,'tr=') description.sub!(/{\S*}/,'') organism = uniprot_record.os.first.values.reverse.join(' ') organelle = uniprot_record.og.join(' ') sequence = uniprot_record.seq.gsub('U','X') # Secondary data index_record << accession_number index_record << description taxonomy = uniprot_record.oc.join(';') index_record << taxonomy index_record << organism index_record << sequence if !organelle.empty? index_record << organelle else index_record << '-' end go_data = uniprot_record.dr['GO'] if !go_data.nil? index_record << go_data.map{|go| go[0]}.join(';') # GO ID index_record << go_data.map{|go| go[1]}.join(';') # GO Description else index_record << '-' index_record << '-' end kegg_data = uniprot_record.dr['KEGG'] if !kegg_data.nil? index_record << kegg_data.map{|kegg| kegg[0]}.join(';') else index_record << '-' end interpro_data = uniprot_record.dr['InterPro'] if !interpro_data.nil? index_record << interpro_data.map{|ip| ip[0]}.join(';') # interpro ID index_record << interpro_data.map{|ip| ip[1]}.join(';') # ip Description else index_record << '-' index_record << '-' end if !description_data[1].nil? && description_data[1].include?('EC=') index_record << description_data[1].split(' ').first.gsub('=',':') else index_record << '-' end pfam_data = uniprot_record.dr['Pfam'] if !pfam_data.nil? index_record << pfam_data.map!{|pf| pf[0]}.join(';') # pfam ID index_record << pfam_data.map!{|pf| pf[1]}.join(';') # pfam description else index_record << '-' index_record << '-' end unipathway_data = uniprot_record.dr['UniPathway'] if !unipathway_data.nil? index_record << unipathway_data.map!{|pf| pf[0]}.join(';') # unipathway ID else index_record << '-' end seqs << ">#{[accession_number, description, organism, organelle].join(' ')}\n#{sequence}\n" index.puts index_record.join("\t") seqs << isoform_hash[accession_number]+"\n" if !isoform_hash.nil? && !isoform_hash[accession_number].nil? end def ncbi_record(uniprot_record, seqs) accession_number = uniprot_record.accession id = uniprot_record.entry_id organism = uniprot_record.os.first.values.reverse.join(' ') sequence = uniprot_record.seq description = uniprot_record.description.split(';').first gene_name = nil gn_field = uniprot_record.gn.first gene_name = gn_field[:name] if !gn_field.nil? prediction_field = uniprot_record.get('PE') prediction_field =~ /PE\s+(\d+):/ prediction_status = $1 sequence_version_field = uniprot_record.dt['sequence'] sequence_version_field =~ /sequence version (\d+)./ sequence_version = $1 db = nil if description.include?('RecName: Full=') db = 'sp' description.sub!(/RecName: Full=/,'') elsif description.include?('SubName: Full=') db = 'tr' description.sub!(/SubName: Full=/,'') end taxonomy = uniprot_record.oc.join(';') seqs << ">#{db}|#{accession_number}|#{id} #{description} OS=#{organism} GN=#{gene_name} PE=#{prediction_status} SV=#{sequence_version}\n#{sequence}\n" end def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path, db_name, options) puts "filtering sequences from #{file_name}" db_folder = File.join(formatted_db_path, db_name) Dir.mkdir(db_folder) if !File.exists?(db_folder) main_name = File.join(db_folder, db_name) index = File.open(main_name + '.index', 'w') if !options[:all] seqs = '' Bio::FlatFile.auto(file_name).each_entry {|uniprot_record| if !options[:all] && !complete?(uniprot_record) next else #Get attributes of full length records if options[:all] ncbi_record(uniprot_record, seqs) else fln_record(uniprot_record, seqs, index, isoform_hash) end end } index.close if !options[:all] return seqs end ########################################################################################## ## OPTIONS ########################################################################################## options = {} divs = %w{fungi invertebrates mammals plants rodents vertebrates} all_divs = %w{human fungi invertebrates mammals plants rodents vertebrates} optparse = OptionParser.new do |opts| options[:uniprot_div] = divs opts.on( '-u', '--file String', 'Uniprot DBs to be downloaded. String structure: \'div_name1,div_name2..\'. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates. Default: download all') do |uniprot_div| temp_divs = uniprot_div.split(',') check_valid_ids = temp_divs - all_divs if !check_valid_ids.empty? puts 'This uniprot division not exists', check_valid_ids process.exit else options[:uniprot_div] = temp_divs end end options[:no_download] = FALSE opts.on( '-d', '--no_download', 'Only parse downloaded files without download them again') do options[:no_download] = TRUE end options[:no_ncrna] = FALSE opts.on( '-n', '--no_ncrna', 'No use ncrna sequences') do options[:no_ncrna] = TRUE end options[:only_index] = FALSE opts.on( '-i', '--only_index', 'Build annotation index only without do blast DB') do options[:only_index] = TRUE end options[:no_trembl] = FALSE opts.on( '-t', '--no_trembl', 'No use trembl sequences') do options[:no_trembl] = TRUE end options[:all] = FALSE opts.on( '-a', '--all_sequences', 'Generate databases with all sequences') do options[:all] = TRUE end options[:cdhit] = 0 opts.on( '-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit| options[:cdhit] = cdhit.to_f end options[:no_uniprot] = FALSE opts.on( '-p', '--no_uniprot', 'No use uniprot sequences') do options[:no_uniprot] = TRUE end # Set a banner, displayed at the top of the help screen. opts.banner = "Usage: #{File.basename(__FILE__)} [options] \n\n" # This displays the help screen opts.on( '-h', '--help', 'Display this screen' ) do puts opts exit end end # End opts # parse options and remove from ARGV optparse.parse! ############################################################################################## ## MAIN ############################################################################################## if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB']) formatted_db_path = ENV['BLASTDB'] else # otherwise use ROOTPATH + DB formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs")) Dir.mkdir(formatted_db_path) end ENV['BLASTDB'] = formatted_db_path puts "Databases will be downloaded at: #{ENV['BLASTDB']}" puts "\nTo set the path for storing databases, execute next line in your terminal or add it to your .bash_profile:\n\n\texport BLASTDB=/my_path/\n\n" download_ncrna(formatted_db_path, options[:no_download]) if !options[:no_ncrna] if !options[:no_download] if !options[:no_uniprot] conecta_uniprot(options[:uniprot_div], formatted_db_path) system('gunzip '+formatted_db_path+'*.gz') end end if !options[:no_uniprot] isoform_hash = load_isoform_hash(File.join(formatted_db_path, "uniprot_sprot_varsplic.fasta")) #archivo de variantes de splicing. POR QUE? options[:uniprot_div].each do |db_group| filter_and_makeDB(formatted_db_path, 'sprot', db_group, isoform_hash, 'sp', options) filter_and_makeDB(formatted_db_path, 'trembl', db_group, nil, 'tr', options) if !options[:no_trembl] end end puts "download_fln_dbs.rb has finished"