#!/usr/bin/env ruby # 15-2-2011 Noe Fernandez-Pozo # Script to download Full-LengtherNext databases. # Once in UniProtKB/Swiss-Prot, a protein entry is removed from UniProtKB/TrEMBL. require 'net/ftp' ################################################### Functions def conecta_uniprot(my_array, formatted_db_path) $ftp = Net::FTP.new() if !File.exists?('blast_dbs') Dir.mkdir('blast_dbs') end $ftp.connect('ftp.uniprot.org') $ftp.login puts "connected to UniProt" my_array.each do |db_group| puts "Downloading #{db_group}" download_uniprot(db_group, formatted_db_path) end $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/complete") $ftp.getbinaryfile("uniprot_sprot_varsplic.fasta.gz", "#{formatted_db_path}/uniprot_sprot_varsplic.fasta.gz") puts "isoform files downloaded" $ftp.close end def download_uniprot(uniprot_group, formatted_db_path) $ftp.chdir("/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions") $ftp.getbinaryfile("uniprot_sprot_#{uniprot_group}.dat.gz", "#{formatted_db_path}/uniprot_sprot_#{uniprot_group}.dat.gz") $ftp.getbinaryfile("uniprot_trembl_#{uniprot_group}.dat.gz", "#{formatted_db_path}/uniprot_trembl_#{uniprot_group}.dat.gz") puts "#{uniprot_group} files downloaded" end def filter_incomplete_seqs(file_name, isoform_hash, formatted_db_path) puts "filtering sequences from #{file_name}" # UniProtKB fragments with FT NON_CONS and FT NON_TER features. # # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines: # FT NON_TER 1 1 # FT NON_TER 29 29 # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line: # FT NON_CONS 1683 1684 # # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'. newseq=false print_seq=true id='' description = '' organism_name = '' seq = '' organelle = '' file_name =~ /uniprot_([a-z]+)_([a-z]+).dat/ db_name = $1 output_name = $2 db_name.sub!('sprot','sp') db_name.sub!('trembl','tr') if !File.exists?("#{formatted_db_path}/#{db_name}_#{output_name}") Dir.mkdir("#{formatted_db_path}/#{db_name}_#{output_name}") end output_file = File.new("#{formatted_db_path}/#{db_name}_#{output_name}/#{db_name}_#{output_name}.fasta", "w") File.open(file_name).each_line do |line| if (newseq == false) if (line =~ /^AC\s+(\w+);/) id=$1 newseq = true description = '' organism_name = '' seq = '' print_seq = true organelle = '' end else if (line =~ /^DE\s+(.+)\;*/) if (description == '') description = $1 description.sub!(/RecName: Full=/,'sp=') description.sub!(/SubName: Full=/,'tr=') end if (line =~ /Flags: Fragment/) # puts "#{id} #{line}" print_seq=false end elsif (line =~ /^OS\s+(.+)/) organism_name = $1 elsif (line =~ /^OG\s+(.+)/) organelle = $1 elsif (line =~ /^FT\s+NON_TER\s+/) print_seq=false # puts "#{id} NON_TER" elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/) print_seq=false # puts "#{id} NON_CONS" elsif (line =~ /^\s+([\w\s]+)/) seq += $1 elsif (line =~ /^\/\//) seq.gsub!(/\s*/,'') if (seq !~ /^M/i) print_seq=false end newseq = false if (print_seq) output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}" if (!isoform_hash[id].nil?) output_file.puts isoform_hash[id] end end end end end output_file.close end def load_isoform_hash(file) isoform_hash = {} my_fasta = '' acc = '' File.open(file).each do |line| line.chomp! if (line =~ /(^>\w+\|(\w+)\-\d\|.+)/) if (isoform_hash[acc].nil?) isoform_hash[acc]= "#{my_fasta}\n" else isoform_hash[acc]+= "#{my_fasta}\n" end my_fasta = "#{$1}\n" acc = $2 else my_fasta += line end end # if (isoform_hash[acc].nil?) # isoform_hash[acc]= "#{my_fasta}\n" # else # isoform_hash[acc]+= "#{my_fasta}\n" # end return isoform_hash end ################################################### MAIN ROOT_PATH=File.dirname(__FILE__) if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB']) formatted_db_path = ENV['BLASTDB'] else # otherwise use ROOTPATH + DB formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs")) end ENV['BLASTDB']=formatted_db_path puts "Databases will be downloaded at: #{ENV['BLASTDB']}" my_array = ["human","fungi","invertebrates","mammals","plants","rodents","vertebrates"] # my_array = ["plants","invertebrates"] # used for a shoter test conecta_uniprot(my_array, formatted_db_path) `gunzip #{formatted_db_path}/*gz` isoform_hash = {} isoform_hash = load_isoform_hash("#{formatted_db_path}/uniprot_sprot_varsplic.fasta") my_array.each do |db_group| filter_incomplete_seqs("#{formatted_db_path}/uniprot_sprot_#{db_group}.dat", isoform_hash, formatted_db_path) filter_incomplete_seqs("#{formatted_db_path}/uniprot_trembl_#{db_group}.dat", isoform_hash, formatted_db_path) `makeblastdb -in #{formatted_db_path}/sp_#{db_group}/sp_#{db_group}.fasta -dbtype 'prot' -parse_seqids` `makeblastdb -in #{formatted_db_path}/tr_#{db_group}/tr_#{db_group}.fasta -dbtype 'prot' -parse_seqids` end puts "download_fln_dbs.rb has finished"