lib/genevalidator/blast.rb in genevalidator-1.6.12 vs lib/genevalidator/blast.rb in genevalidator-2.1.3

- old
+ new

@@ -10,142 +10,66 @@ module GeneValidator # Contains methods that run BLAST and methods that analyse sequences class BlastUtils class << self extend Forwardable - def_delegators GeneValidator, :opt, :config + def_delegators GeneValidator, :opt, :config, :dirs EVALUE = 1e-5 - ## - # Calls blast from standard input with specific parameters - # Params: - # +blast_type+: blast command in String format (e.g 'blast(x/p)') - # +query+: String containing the the query in fasta format - # +db+: database - # +num_threads+: The number of threads to run BLAST with. - # Output: - # String with the blast xml output - def run_blast(query, db = opt[:db], seq_type = config[:type], - num_threads = opt[:num_threads]) - - blast_type = (seq_type == :protein) ? 'blastp' : 'blastx' - # -num_threads is not supported on remote databases - threads = (db !~ /remote/) ? "-num_threads #{num_threads}" : '' - - blastcmd = "#{blast_type} -db '#{db}' -evalue #{EVALUE} -outfmt 5" \ - " #{threads}" - - cmd = "echo \"#{query}\" | #{blastcmd}" - `#{cmd} >/dev/null 2>&1` - end - - ## # Runs BLAST on an input file # Params: # +blast_type+: blast command in String format (e.g 'blastx' or 'blastp') # +opt+: Hash made of :input_fasta_file :blast_xml_file, :db, :num_threads # +gapopen+: gapopen blast parameter # +gapextend+: gapextend blast parameter # +nr_hits+: max number of hits # Output: # XML file - def run_blast_on_input_file(input_file = opt[:input_fasta_file], - db = opt[:db], seq_type = config[:type], - num_threads = opt[:num_threads]) - return if opt[:blast_xml_file] || opt[:blast_tabular_file] + def run_blast_on_input_file + remote = opt[:db].match?(/remote/) ? true : false + print_blast_info_text(remote) - $stderr.puts 'Running BLAST. This may take a while.' - opt[:blast_xml_file] = input_file + '.blast_xml' + log_file = File.join(dirs[:tmp_dir], 'blast_cmd_output.txt') + `#{blast_cmd(opt, config, remote)} > #{log_file} 2>&1` - blast_type = (seq_type == :protein) ? 'blastp' : 'blastx' - # -num_threads is not supported on remote databases - threads = (opt[:db] !~ /remote/) ? "-num_threads #{num_threads}" : '' - - blastcmd = "#{blast_type} -query '#{input_file}'" \ - " -out '#{opt[:blast_xml_file]}' -db #{db} " \ - " -evalue #{EVALUE} -outfmt 5 #{threads}" - - `#{blastcmd} >/dev/null 2>&1` return unless File.zero?(opt[:blast_xml_file]) - $stderr.puts 'Blast failed to run on the input file.' - if opt[:db] !~ /remote/ - $stderr.puts 'Please ensure that the BLAST database exists and try' - $stderr.puts 'again.' + warn 'Blast failed to run on the input file.' + if remote + warn 'You are using BLAST with a remote database. Please' + warn 'ensure that you have internet access and try again.' else - $stderr.puts 'You are using BLAST with a remote database. Please' - $stderr.puts 'ensure that you have internet access and try again.' + warn 'Please ensure that the BLAST database exists and try again.' end + exit 1 end ## # Parses the next query from the blast xml output query # Params: # +iterator+: blast xml iterator for hits # +type+: the type of the sequence: :nucleotide or :protein # Outputs: # Array of +Sequence+ objects corresponding to the list of hits - def parse_next(iterator, type = config[:type]) - hits = [] + def parse_next(iterator) iter = iterator.next # parse blast the xml output and get the hits # hits obtained are proteins! (we use only blastp and blastx) + hits = [] iter.each do |hit| - seq = Query.new - + seq = Query.new seq.length_protein = hit.len.to_i seq.type = :protein seq.identifier = hit.hit_id seq.definition = hit.hit_def - seq.accession_no = hit.accession + seq.accession_no = hit.accession + seq.hsp_list = hit.hsps.map { |hsp| Hsp.new(xml_input: hsp) } - # get all high-scoring segment pairs (hsp) - hsps = [] - - hit.hsps.each do |hsp| - current_hsp = Hsp.new - current_hsp.hsp_evalue = format('%.0e', hsp.evalue) - - current_hsp.hit_from = hsp.hit_from.to_i - current_hsp.hit_to = hsp.hit_to.to_i - current_hsp.match_query_from = hsp.query_from.to_i - current_hsp.match_query_to = hsp.query_to.to_i - - if type == :nucleotide - current_hsp.match_query_from /= 3 - current_hsp.match_query_to /= 3 - current_hsp.match_query_from += 1 - current_hsp.match_query_to += 1 - end - - current_hsp.query_reading_frame = hsp.query_frame.to_i - - current_hsp.hit_alignment = hsp.hseq.to_s - seq_type = guess_sequence_type(current_hsp.hit_alignment) - fail SequenceTypeError unless seq_type == :protein || seq_type.nil? - - current_hsp.query_alignment = hsp.qseq.to_s - seq_type = guess_sequence_type(current_hsp.query_alignment) - fail SequenceTypeError unless seq_type == :protein || seq_type.nil? - - current_hsp.align_len = hsp.align_len.to_i - current_hsp.identity = hsp.identity.to_i - current_hsp.pidentity = (100 * hsp.identity / hsp.align_len.to_f) - .round(2) - - hsps.push(current_hsp) - end - - seq.hsp_list = hsps - hits.push(seq) + hits << seq end - hits - rescue SequenceTypeError => e - $stderr.puts e - exit 1 rescue StopIteration nil end ## @@ -162,11 +86,11 @@ def type_of_sequences(fasta_format_string) # the first sequence does not need to have a fasta definition line sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?) # get all sequence types sequence_types = sequences.collect { |seq| guess_sequence_type(seq) } - .uniq.compact + .uniq.compact return nil if sequence_types.empty? sequence_types.first if sequence_types.length == 1 end @@ -182,21 +106,42 @@ # removing non-letter and ambiguous characters cleaned_sequence = sequence_string.gsub(/[^A-Z]|[NX]/i, '') return nil if cleaned_sequence.length < 10 # conservative type = Bio::Sequence.new(cleaned_sequence).guess(0.9) - (type == Bio::Sequence::NA) ? :nucleotide : :protein + type == Bio::Sequence::NA ? :nucleotide : :protein end ## # def guess_sequence_type_from_input_file(file = opt[:input_fasta_file]) lines = File.foreach(file).first(10) seqs = '' - lines.each do |l| - seqs += l.chomp unless l[0] == '>' - end + lines.each { |l| seqs += l.chomp unless l[0] == '>' } guess_sequence_type(seqs) + end + + private + + def blast_cmd(opt, config, remote) + blast_type = config[:type] == :protein ? 'blastp' : 'blastx' + # -num_threads is not supported on remote databases + threads = remote ? '' : "-num_threads #{opt[:num_threads]}" + + "#{blast_type} -query '#{opt[:input_fasta_file]}'" \ + " -db #{opt[:db]} -outfmt 5 -evalue #{EVALUE} #{threads}" \ + " -out '#{opt[:blast_xml_file]}' #{opt[:blast_options]}" + end + + def print_blast_info_text(remote) + warn '' # a blank line + if remote + warn '==> BLAST search and subsequent analysis will be done on a remote' + warn ' database. Please use a local database for larger analysis.' + else + warn '==> Running BLAST. This may take a while.' + end + warn '' # a blank line end end end end