lib/genevalidator/blast.rb in genevalidator-1.6.12 vs lib/genevalidator/blast.rb in genevalidator-2.1.3
- old
+ new
@@ -10,142 +10,66 @@
module GeneValidator
# Contains methods that run BLAST and methods that analyse sequences
class BlastUtils
class << self
extend Forwardable
- def_delegators GeneValidator, :opt, :config
+ def_delegators GeneValidator, :opt, :config, :dirs
EVALUE = 1e-5
- ##
- # Calls blast from standard input with specific parameters
- # Params:
- # +blast_type+: blast command in String format (e.g 'blast(x/p)')
- # +query+: String containing the the query in fasta format
- # +db+: database
- # +num_threads+: The number of threads to run BLAST with.
- # Output:
- # String with the blast xml output
- def run_blast(query, db = opt[:db], seq_type = config[:type],
- num_threads = opt[:num_threads])
-
- blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
- # -num_threads is not supported on remote databases
- threads = (db !~ /remote/) ? "-num_threads #{num_threads}" : ''
-
- blastcmd = "#{blast_type} -db '#{db}' -evalue #{EVALUE} -outfmt 5" \
- " #{threads}"
-
- cmd = "echo \"#{query}\" | #{blastcmd}"
- `#{cmd} >/dev/null 2>&1`
- end
-
- ##
# Runs BLAST on an input file
# Params:
# +blast_type+: blast command in String format (e.g 'blastx' or 'blastp')
# +opt+: Hash made of :input_fasta_file :blast_xml_file, :db, :num_threads
# +gapopen+: gapopen blast parameter
# +gapextend+: gapextend blast parameter
# +nr_hits+: max number of hits
# Output:
# XML file
- def run_blast_on_input_file(input_file = opt[:input_fasta_file],
- db = opt[:db], seq_type = config[:type],
- num_threads = opt[:num_threads])
- return if opt[:blast_xml_file] || opt[:blast_tabular_file]
+ def run_blast_on_input_file
+ remote = opt[:db].match?(/remote/) ? true : false
+ print_blast_info_text(remote)
- $stderr.puts 'Running BLAST. This may take a while.'
- opt[:blast_xml_file] = input_file + '.blast_xml'
+ log_file = File.join(dirs[:tmp_dir], 'blast_cmd_output.txt')
+ `#{blast_cmd(opt, config, remote)} > #{log_file} 2>&1`
- blast_type = (seq_type == :protein) ? 'blastp' : 'blastx'
- # -num_threads is not supported on remote databases
- threads = (opt[:db] !~ /remote/) ? "-num_threads #{num_threads}" : ''
-
- blastcmd = "#{blast_type} -query '#{input_file}'" \
- " -out '#{opt[:blast_xml_file]}' -db #{db} " \
- " -evalue #{EVALUE} -outfmt 5 #{threads}"
-
- `#{blastcmd} >/dev/null 2>&1`
return unless File.zero?(opt[:blast_xml_file])
- $stderr.puts 'Blast failed to run on the input file.'
- if opt[:db] !~ /remote/
- $stderr.puts 'Please ensure that the BLAST database exists and try'
- $stderr.puts 'again.'
+ warn 'Blast failed to run on the input file.'
+ if remote
+ warn 'You are using BLAST with a remote database. Please'
+ warn 'ensure that you have internet access and try again.'
else
- $stderr.puts 'You are using BLAST with a remote database. Please'
- $stderr.puts 'ensure that you have internet access and try again.'
+ warn 'Please ensure that the BLAST database exists and try again.'
end
+ exit 1
end
##
# Parses the next query from the blast xml output query
# Params:
# +iterator+: blast xml iterator for hits
# +type+: the type of the sequence: :nucleotide or :protein
# Outputs:
# Array of +Sequence+ objects corresponding to the list of hits
- def parse_next(iterator, type = config[:type])
- hits = []
+ def parse_next(iterator)
iter = iterator.next
# parse blast the xml output and get the hits
# hits obtained are proteins! (we use only blastp and blastx)
+ hits = []
iter.each do |hit|
- seq = Query.new
-
+ seq = Query.new
seq.length_protein = hit.len.to_i
seq.type = :protein
seq.identifier = hit.hit_id
seq.definition = hit.hit_def
- seq.accession_no = hit.accession
+ seq.accession_no = hit.accession
+ seq.hsp_list = hit.hsps.map { |hsp| Hsp.new(xml_input: hsp) }
- # get all high-scoring segment pairs (hsp)
- hsps = []
-
- hit.hsps.each do |hsp|
- current_hsp = Hsp.new
- current_hsp.hsp_evalue = format('%.0e', hsp.evalue)
-
- current_hsp.hit_from = hsp.hit_from.to_i
- current_hsp.hit_to = hsp.hit_to.to_i
- current_hsp.match_query_from = hsp.query_from.to_i
- current_hsp.match_query_to = hsp.query_to.to_i
-
- if type == :nucleotide
- current_hsp.match_query_from /= 3
- current_hsp.match_query_to /= 3
- current_hsp.match_query_from += 1
- current_hsp.match_query_to += 1
- end
-
- current_hsp.query_reading_frame = hsp.query_frame.to_i
-
- current_hsp.hit_alignment = hsp.hseq.to_s
- seq_type = guess_sequence_type(current_hsp.hit_alignment)
- fail SequenceTypeError unless seq_type == :protein || seq_type.nil?
-
- current_hsp.query_alignment = hsp.qseq.to_s
- seq_type = guess_sequence_type(current_hsp.query_alignment)
- fail SequenceTypeError unless seq_type == :protein || seq_type.nil?
-
- current_hsp.align_len = hsp.align_len.to_i
- current_hsp.identity = hsp.identity.to_i
- current_hsp.pidentity = (100 * hsp.identity / hsp.align_len.to_f)
- .round(2)
-
- hsps.push(current_hsp)
- end
-
- seq.hsp_list = hsps
- hits.push(seq)
+ hits << seq
end
-
hits
- rescue SequenceTypeError => e
- $stderr.puts e
- exit 1
rescue StopIteration
nil
end
##
@@ -162,11 +86,11 @@
def type_of_sequences(fasta_format_string)
# the first sequence does not need to have a fasta definition line
sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?)
# get all sequence types
sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }
- .uniq.compact
+ .uniq.compact
return nil if sequence_types.empty?
sequence_types.first if sequence_types.length == 1
end
@@ -182,21 +106,42 @@
# removing non-letter and ambiguous characters
cleaned_sequence = sequence_string.gsub(/[^A-Z]|[NX]/i, '')
return nil if cleaned_sequence.length < 10 # conservative
type = Bio::Sequence.new(cleaned_sequence).guess(0.9)
- (type == Bio::Sequence::NA) ? :nucleotide : :protein
+ type == Bio::Sequence::NA ? :nucleotide : :protein
end
##
#
def guess_sequence_type_from_input_file(file = opt[:input_fasta_file])
lines = File.foreach(file).first(10)
seqs = ''
- lines.each do |l|
- seqs += l.chomp unless l[0] == '>'
- end
+ lines.each { |l| seqs += l.chomp unless l[0] == '>' }
guess_sequence_type(seqs)
+ end
+
+ private
+
+ def blast_cmd(opt, config, remote)
+ blast_type = config[:type] == :protein ? 'blastp' : 'blastx'
+ # -num_threads is not supported on remote databases
+ threads = remote ? '' : "-num_threads #{opt[:num_threads]}"
+
+ "#{blast_type} -query '#{opt[:input_fasta_file]}'" \
+ " -db #{opt[:db]} -outfmt 5 -evalue #{EVALUE} #{threads}" \
+ " -out '#{opt[:blast_xml_file]}' #{opt[:blast_options]}"
+ end
+
+ def print_blast_info_text(remote)
+ warn '' # a blank line
+ if remote
+ warn '==> BLAST search and subsequent analysis will be done on a remote'
+ warn ' database. Please use a local database for larger analysis.'
+ else
+ warn '==> Running BLAST. This may take a while.'
+ end
+ warn '' # a blank line
end
end
end
end