#!/usr/bin/env ruby # 15-2-2011 Noe Fernandez-Pozo # Script to create your own Full-LengtherNext User database. require 'net/ftp' #receive one argument or fail if (ARGV.size != 2) puts "incorrect number of arguments, you need a taxonomic group like 'Coniferopsida', you can search it in 'http://www.ncbi.nlm.nih.gov/Taxonomy/' and a UniProt taxonomic group from this list: fungi human invertebrates mammals plants rodents vertebrates mode of use: ruby make_user_db.rb coniferopsida plants\n\n" Process.exit(-1); end (my_group,uniprot_group)=ARGV ################################################### Functions def filter_incomplete_seqs(output_file,file_name, my_group) puts " filtering sequences" # UniProtKB fragments with FT NON_CONS and FT NON_TER features. # # * FT NON_TER: The residue at an extremity of the sequence is not the terminal residue. If applied to position 1, this signifies that the first position is not the N-terminus of the complete molecule. If applied to the last position, it means that this position is not the C-terminus of the complete molecule. There is no description field for this key. Examples of NON_TER key feature lines: # FT NON_TER 1 1 # FT NON_TER 29 29 # * FT NON_CONS: Non-consecutive residues. Indicates that two residues in a sequence are not consecutive and that there are a number of unreported or missing residues between them. Example of a NON_CONS key feature line: # FT NON_CONS 1683 1684 # # NON_CONS fragments are not indicated as non-consecutive in InterPro and being non-consecutive the match to methods may be incorrect if the method spans the 'break'. newseq=false print_seq=false incomplete=false id='' description = '' organism_name = '' seq = '' organelle = '' File.open(file_name).each_line do |line| if (newseq == false) if (line =~ /^AC\s+(\w+);/) id=$1 newseq = true description = '' organism_name = '' seq = '' print_seq = false incomplete = false organelle = '' end else if (line =~ /^DE\s+(.+)\;*/) if (description == '') description = $1 description.sub!(/RecName: Full=/,'sp=') description.sub!(/SubName: Full=/,'tr=') end if (line =~ /Flags: Fragment/) # puts "#{id} #{line}" incomplete = true end elsif (line =~ /^OS\s+(.+)/) organism_name = $1 elsif (line =~ /^OG\s+(.+)/) organelle = $1 elsif (line =~ /^OC\s+[\w\s\;]*#{my_group}/i) && (!incomplete) print_seq=true # puts "#{id} #{organism_name} print_seq?: #{print_seq}" elsif (line =~ /^FT\s+NON_TER\s+/) print_seq=false # puts "#{id} NON_TER" elsif (line =~ /^FT\s+NON_CONS\s+(\d+)\s+/) print_seq=false # puts "#{id} NON_CONS" elsif (line =~ /^\s+([\w\s]+)/) seq += $1 elsif (line =~ /^\/\//) seq.gsub!(/\s*/,'') if (seq !~ /^M/i) print_seq=false end newseq = false if (print_seq) output_file.puts ">#{id} #{description} #{organism_name} #{organelle}\n#{seq}" end end end end end ######################################################## ## MAIN ######################################################## ROOT_PATH=File.dirname(__FILE__) # $: << File.expand_path(File.join(ROOT_PATH, "classes")) # load gem path, only to test locally # $: << File.expand_path('~/progs/ruby/gems/full_lengther_next/lib') require 'full_lengther_next' if ENV['BLASTDB'] && File.exists?(ENV['BLASTDB']) formatted_db_path = ENV['BLASTDB'] else # otherwise use ROOTPATH + DB formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs")) end ENV['BLASTDB']=formatted_db_path if !File.exists?(File.join(ENV['BLASTDB'], my_group)) Dir.mkdir("blast_dbs/#{my_group}") end output_file_path=File.join(ENV['BLASTDB'],my_group,my_group+".fasta") output_file = File.new(output_file_path, "w") filter_incomplete_seqs(output_file, File.join(ENV['BLASTDB'], "uniprot_sprot_#{uniprot_group}.dat"), my_group) filter_incomplete_seqs(output_file, File.join(ENV['BLASTDB'], "uniprot_trembl_#{uniprot_group}.dat"), my_group) output_file.close `makeblastdb -in #{output_file_path} -dbtype 'prot' -parse_seqids` puts "make_user_db.rb has finished"