#!/usr/bin/env ruby

# 15-2-2011 Noe Fernandez-Pozo
# Script to create your own Full-LengtherNext User database.
#
# Two modes are supported:
#   * taxon mode: sequences of a given taxon are extracted from the
#     "sp_<division>" / "tr_<division>" index files found in the BLAST DB folder;
#   * user fasta mode (-f): the given fasta file is used as is.
# The resulting sequences are optionally compacted with cd-hit and then handed
# to do_makeblastdb.

ROOT_PATH = File.dirname(__FILE__)
$: << File.expand_path(File.join(ROOT_PATH, "../lib/full_lengther_next"))

require 'cdhit'
require 'handle_db'
require 'optparse'

##############################################################################################
## METHODS
##############################################################################################

# Builds a FASTA-formatted string with every entry of a tab-separated index
# file whose third column (a ';'-separated list) contains +taxon+.
#
# Columns read from each line: 0, 1 and 3 go to the header, 2 is the
# ';'-separated list matched against +taxon+, 4 is written as the sequence
# line and 5 is the organelle ('-' characters are stripped from it).
#
# index        - path of the index file to read.
# taxon        - taxon name that must appear in the third column.
# isoform_hash - optional Hash keyed by accession (first word of column 1, up
#                to the first '-'); when an entry is found its value is
#                appended right after the sequence. Pass nil to skip this.
#
# Returns the accumulated FASTA text as a String (empty if nothing matched).
def get_seqs(index, taxon, isoform_hash)
  seqs = ''
  # File.foreach closes the file when done (File.open(...).each leaked the handle).
  File.foreach(index) do |line|
    fields = line.chomp.split("\t")
    # Skip blank or truncated lines instead of failing on nil.
    next if fields[2].nil?
    next unless fields[2].split(';').include?(taxon)

    # String#split drops trailing empty columns, so column 5 may be missing.
    organelle = fields[5].to_s.delete('-')
    seqs << ">#{[fields[0], fields[1], fields[3], organelle].join(' ')}\n#{fields[4]}\n"

    next if isoform_hash.nil?

    accid = fields[1].split(' ').first.split('-').first
    var_splice = isoform_hash[accid]
    seqs << var_splice + "\n" unless var_splice.nil?
  end
  seqs
end

##########################################################################################
## OPTIONS
##########################################################################################
options = {}
divs = %w{human fungi invertebrates mammals plants rodents vertebrates}

optparse = OptionParser.new do |opts|
  options[:uniprot_div] = nil
  opts.on('-u', '--file String', 'Uniprot DBs to taxon search. Posible options: human, fungi, invertebrates, mammals, plants, rodents, vertebrates.') do |uniprot_div|
    unless divs.include?(uniprot_div)
      puts 'This uniprot division not exists:', uniprot_div
      # Invalid argument: exit with a failure status like the other checks.
      Process.exit(-1)
    end
    options[:uniprot_div] = uniprot_div
  end

  options[:taxon] = nil
  opts.on('-t', '--taxon STRING', "Specific taxon to search in uniprot division. \nWrite taxo between ''") do |taxon|
    options[:taxon] = taxon
  end

  options[:name] = nil
  opts.on('-n', '--name STRING', 'Database name in case the creation of a local DB') do |name|
    options[:name] = name
  end

  # TRUE/FALSE constants were removed in Ruby 3.2; use the literals.
  options[:local] = false
  opts.on('-l', '--local', 'Only parse downloaded files without download them again') do
    options[:local] = true
  end

  options[:user_fasta] = nil
  opts.on('-f', '--user_fasta FILE', 'Use a custom fasta file to build the user database') do |file|
    options[:user_fasta] = file
  end

  options[:cdhit] = 0
  opts.on('-c', '--cdhit FLOAT', 'Compact databases with cdhit. 0 for deactivate, >0 - 1 to set percentage of identity. Default: 0') do |cdhit|
    options[:cdhit] = cdhit.to_f
  end

  # Set a banner, displayed at the top of the help screen.
  opts.banner = "Usage: #{File.basename(__FILE__)} [options] \n\n"

  # This displays the help screen
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
end # End opts

# parse options and remove from ARGV
optparse.parse!

########################################################
## MAIN
########################################################

# Argument validation. In taxon mode both taxon and division are mandatory.
# In user fasta mode the file must exist, and the taxon is required whenever
# it is going to be used as the database name (i.e. when --local is not set).
if options[:user_fasta].nil?
  if options[:taxon].nil? || options[:uniprot_div].nil?
    puts 'Taxon or uniprot division was not specified'
    Process.exit(-1)
  end
elsif !File.exist?(options[:user_fasta]) || (!options[:local] && options[:taxon].nil?)
  puts 'User fasta file not exists or taxon was not specified'
  Process.exit(-1)
end

# With --local the database is named after --name, so it must be present.
if options[:local] && options[:name].nil?
  puts 'Database name was not specified'
  Process.exit(-1)
end

if ENV['BLASTDB'] && File.exist?(ENV['BLASTDB'])
  formatted_db_path = ENV['BLASTDB']
else # otherwise use ROOTPATH + DB
  formatted_db_path = File.expand_path(File.join(ROOT_PATH, "blast_dbs"))
end

# Without --local the DB lives in the BLAST DB folder and is named after the
# taxon; with --local it is created in the current directory under --name.
if options[:local]
  name_db = options[:name]
  user_db_folder = File.join(Dir.pwd, name_db)
else
  name_db = options[:taxon]
  user_db_folder = File.join(formatted_db_path, name_db)
end
user_db_folder = user_db_folder.gsub(' ', '_')
Dir.mkdir(user_db_folder) unless File.exist?(user_db_folder)
output_file_path = File.join(user_db_folder, name_db).gsub(' ', '_')

if options[:user_fasta].nil?
  div = options[:uniprot_div]
  isoform_hash = load_isoform_hash(File.join(formatted_db_path, 'uniprot_sprot_varsplic.fasta.gz'))
  seqs = get_seqs(File.join(formatted_db_path, "sp_#{div}", "sp_#{div}.index"), options[:taxon], isoform_hash)
  # The "tr_" index is read without isoform information.
  seqs << get_seqs(File.join(formatted_db_path, "tr_#{div}", "tr_#{div}.index"), options[:taxon], nil)
else
  seqs = File.read(options[:user_fasta])
end

if options[:cdhit] > 0
  File.open(output_file_path, 'w') { |output_file| output_file.puts seqs }

  # -d length of description in .clstr file, default 20; if set to 0, it takes
  #    the fasta defline and stops at first space (BUGGED OPTION)
  # -M 0 cd-hit uses all memory that it needs
  # NOTE(review): the identity threshold is fixed at "-c 1"; the value given to
  # --cdhit only switches this step on, although the help text says it sets the
  # percentage of identity -- confirm the intended behavior.
  # The argument-list form of system avoids passing the path through a shell.
  cdhit_ok = system('cd-hit', '-i', output_file_path, '-o', "#{output_file_path}_cln",
                    '-c', '1', '-s', '0.95', '-M', '0')
  unless cdhit_ok
    puts 'cd-hit execution failed'
    Process.exit(-1)
  end

  cdhit = Cdhit.new(output_file_path, output_file_path + '_cln.clstr')
  cdhit.master_to_sp_seq
  seqs = cdhit.get_all_master.map { |s| s.to_s }.join("\n")
end

do_makeblastdb(seqs, output_file_path, 'prot')
puts "make_user_db.rb has finished"