lib/genevalidator/blast.rb in genevalidator-1.6.1 vs lib/genevalidator/blast.rb in genevalidator-1.6.2

- old
+ new

@@ -1,21 +1,21 @@ -require 'genevalidator/sequences' +require 'bio' +require 'bio-blastxmlparser' +require 'forwardable' + +require 'genevalidator/exceptions' require 'genevalidator/hsp' +require 'genevalidator/sequences' require 'genevalidator/output' -require 'genevalidator/exceptions' -require 'bio-blastxmlparser' -require 'net/http' -require 'open-uri' -require 'uri' -require 'io/console' -require 'yaml' -require 'bio' module GeneValidator # Contains methods that run BLAST and methods that analyse sequences class BlastUtils class << self + extend Forwardable + def_delegators GeneValidator, :opt, :config + EVALUE = 1e-5 ## # Calls blast from standard input with specific parameters # Params: @@ -23,11 +23,14 @@ # +query+: String containing the the query in fasta format # +db+: database # +num_threads+: The number of threads to run BLAST with. # Output: # String with the blast xml output - def run_blast(blast_type, query, db, num_threads) + def run_blast(query, db = opt[:db], seq_type = config[:type], + num_threads = opt[:num_threads]) + + blast_type = (seq_type == :protein) ? 'blastp' : 'blastx' # -num_threads is not supported on remote databases threads = (db !~ /remote/) ? "-num_threads #{num_threads}" : '' blastcmd = "#{blast_type} -db '#{db}' -evalue #{EVALUE} -outfmt 5" \ " #{threads}" @@ -44,37 +47,41 @@ # +gapopen+: gapopen blast parameter # +gapextend+: gapextend blast parameter # +nr_hits+: max number of hits # Output: # XML file - def run_blast_on_file(opt) - seq_type = guess_sequence_type_from_file(opt[:input_fasta_file]) + def run_blast_on_input_file(input_file = opt[:input_fasta_file], + db = opt[:db], seq_type = config[:type], + num_threads = opt[:num_threads]) + return if opt[:blast_xml_file] || opt[:blast_tabular_file] + + $stderr.puts 'Running BLAST' + opt[:blast_xml_file] = input_file + '.blast_xml' + blast_type = (seq_type == :protein) ? 'blastp' : 'blastx' # -num_threads is not supported on remote databases - threads = (opt[:db] !~ /remote/) ? "-num_threads #{opt[:num_threads]}" : '' + threads = (opt[:db] !~ /remote/) ? "-num_threads #{num_threads}" : '' - blastcmd = "#{blast_type} -query '#{opt[:input_fasta_file]}'" \ - " -out '#{opt[:blast_xml_file]}' -db #{opt[:db]} " \ + blastcmd = "#{blast_type} -query '#{input_file}'" \ + " -out '#{opt[:blast_xml_file]}' -db #{db} " \ " -evalue #{EVALUE} -outfmt 5 #{threads}" `#{blastcmd}` return unless File.zero?(opt[:blast_xml_file]) - puts 'Blast failed to run on the input file. Please ensure that the' - puts 'BLAST database exists and try again' + $stderr.puts 'Blast failed to run on the input file. Please ensure that the' + $stderr.puts 'BLAST database exists and try again' exit 1 end ## # Parses the next query from the blast xml output query # Params: # +iterator+: blast xml iterator for hits # +type+: the type of the sequence: :nucleotide or :protein # Outputs: # Array of +Sequence+ objects corresponding to the list of hits - def parse_next(iterator, type) - fail TypeError unless iterator.is_a? Enumerator - + def parse_next(iterator, type = config[:type]) hits = [] iter = iterator.next # parse blast the xml output and get the hits # hits obtained are proteins! (we use only blastp and blastx) @@ -83,41 +90,40 @@ seq.length_protein = hit.len.to_i seq.type = :protein seq.identifier = hit.hit_id seq.definition = hit.hit_def - # puts seq.identifier seq.accession_no = hit.accession # get all high-scoring segment pairs (hsp) hsps = [] hit.hsps.each do |hsp| current_hsp = Hsp.new - current_hsp.hsp_evalue = '%.0e' % hsp.evalue + current_hsp.hsp_evalue = format('%.0e', hsp.evalue) current_hsp.hit_from = hsp.hit_from.to_i current_hsp.hit_to = hsp.hit_to.to_i current_hsp.match_query_from = hsp.query_from.to_i current_hsp.match_query_to = hsp.query_to.to_i if type == :nucleotide current_hsp.match_query_from /= 3 - current_hsp.match_query_to /= 3 + current_hsp.match_query_to /= 3 current_hsp.match_query_from += 1 - current_hsp.match_query_to += 1 + current_hsp.match_query_to += 1 end current_hsp.query_reading_frame = hsp.query_frame.to_i current_hsp.hit_alignment = hsp.hseq.to_s - if BlastUtils.guess_sequence_type(current_hsp.hit_alignment) != :protein + if guess_sequence_type(current_hsp.hit_alignment) != :protein fail SequenceTypeError end current_hsp.query_alignment = hsp.qseq.to_s - if BlastUtils.guess_sequence_type(current_hsp.query_alignment) != :protein + if guess_sequence_type(current_hsp.query_alignment) != :protein fail SequenceTypeError end current_hsp.align_len = hsp.align_len.to_i current_hsp.identity = hsp.identity.to_i current_hsp.pidentity = (100 * hsp.identity / (hsp.align_len + 0.0)).round(2) @@ -127,28 +133,40 @@ seq.hsp_list = hsps hits.push(seq) end - return hits - - rescue TypeError => error - line = error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0] - $stderr.print "Type error at #{line}. Possible cause: you didn't call" \ - " parse method first!\n" + hits + rescue SequenceTypeError => e + $stderr.puts e exit 1 - rescue SequenceTypeError => error - line = error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0] - $stderr.print "Sequence Type error at #{line}. Possible cause: the" \ - 'blast output was not obtained against a protein' \ - " database.\n" - exit 1 rescue StopIteration nil end ## + # Method copied from sequenceserver/sequencehelpers.rb + # Splits input at putative fasta definition lines (like ">adsfadsf"); + # then guesses sequence type for each sequence. + # If not enough sequence to determine, returns nil. + # If 2 kinds of sequence mixed together, raises ArgumentError + # Otherwise, returns :nucleotide or :protein + # Params: + # +sequence_string+: String to validate + # Output: + # nil, :nucleotide or :protein + def type_of_sequences(fasta_format_string) + # the first sequence does not need to have a fasta definition line + sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?) + # get all sequence types + sequence_types = sequences.collect { |seq| guess_sequence_type(seq) }.uniq.compact + + return nil if sequence_types.empty? + return sequence_types.first if sequence_types.length == 1 + end + + ## # Strips all non-letter characters. guestimates sequence based on that. # If less than 10 useable characters... returns nil # If more than 90% ACGTU returns :nucleotide. else returns :protein # Params: # +sequence_string+: String to validate @@ -163,37 +181,16 @@ (type == Bio::Sequence::NA) ? :nucleotide : :protein end ## # - def guess_sequence_type_from_file(file) + def guess_sequence_type_from_input_file(file = opt[:input_fasta_file]) lines = File.foreach(file).first(10) seqs = '' lines.each do |l| seqs += l.chomp unless l[0] == '>' end guess_sequence_type(seqs) - end - - ## - # Method copied from sequenceserver/sequencehelpers.rb - # Splits input at putative fasta definition lines (like ">adsfadsf"); - # then guesses sequence type for each sequence. - # If not enough sequence to determine, returns nil. - # If 2 kinds of sequence mixed together, raises ArgumentError - # Otherwise, returns :nucleotide or :protein - # Params: - # +sequence_string+: String to validate - # Output: - # nil, :nucleotide or :protein - def type_of_sequences(fasta_format_string) - # the first sequence does not need to have a fasta definition line - sequences = fasta_format_string.split(/^>.*$/).delete_if(&:empty?) - # get all sequence types - sequence_types = sequences.collect { |seq| BlastUtils.guess_sequence_type(seq) }.uniq.compact - - return nil if sequence_types.empty? - return sequence_types.first if sequence_types.length == 1 end end end end