require 'genevalidator/arg_validation'
require 'genevalidator/get_raw_sequences'
require 'genevalidator/tabular_parser'
require 'genevalidator/blast'
require 'genevalidator/output'
require 'genevalidator/exceptions'
require 'genevalidator/validation_length_cluster'
require 'genevalidator/validation_length_rank'
require 'genevalidator/validation_blast_reading_frame'
require 'genevalidator/validation_gene_merge'
require 'genevalidator/validation_duplication'
require 'genevalidator/validation_open_reading_frame'
require 'genevalidator/validation_alignment'
require 'genevalidator/pool'
require 'bio-blastxmlparser'
require 'fileutils'
require 'open-uri'
require 'uri'
require 'io/console'
require 'yaml'
require 'thread'

# Top level module / namespace.
module GeneValidator
  Pair1 = Struct.new(:x, :y)

  # Main class that initialises and then runs the validations.
  class Validation
    attr_reader :opt
    attr_reader :type
    attr_reader :input_fasta_file
    attr_reader :html_path
    attr_reader :yaml_path
    attr_reader :filename
    attr_reader :raw_seq_file_index
    attr_reader :raw_seq_file_load
    # Index of the query currently being processed.
    attr_accessor :idx
    attr_reader :start_idx
    # Array of start offsets of each query in the FASTA file.
    attr_reader :query_offset_lst
    attr_reader :overall_evaluation

    # Global counters, shared across worker threads.
    attr_reader :no_queries
    attr_reader :scores
    attr_reader :good_predictions
    attr_reader :bad_predictions
    attr_reader :nee
    attr_reader :no_mafft
    attr_reader :no_internet
    attr_reader :map_errors
    attr_reader :map_running_times

    attr_reader :threads
    attr_reader :mutex
    attr_reader :mutex_yaml
    attr_reader :mutex_html
    attr_reader :mutex_array

    ##
    # Initializes the object
    # Params:
    # +opt+: A hash - Default Values: {validations: ['all'],
    # blast_tabular_file: nil, blast_tabular_options: nil, blast_xml_file: nil,
    # db: 'remote', raw_sequences: nil, num_threads: 1, fast: false}
    # +start_idx+: number of the sequence from the file to start with
    # +overall_evaluation+: boolean variable for printing overall evaluation
    def initialize(opt, start_idx = 1, overall_evaluation = true)
      # Validate opts
      @opt = GVArgValidation.validate_args(opt)

      puts "\nDepending on your input and your computational resources, this"\
           ' may take a while. Please wait...'

      @idx                = 0
      @start_idx          = start_idx
      @overall_evaluation = overall_evaluation

      # Worker threads and mutexes used for parallelising the validations.
      @threads     = []
      @mutex       = Mutex.new
      @mutex_yaml  = Mutex.new
      @mutex_html  = Mutex.new
      @mutex_array = Mutex.new

      # Global counters for the overall evaluation.
      @no_queries        = 0
      @scores            = []
      @good_predictions  = 0
      @bad_predictions   = 0
      @nee               = 0
      @no_mafft          = 0
      @no_internet       = 0
      @map_errors        = Hash.new(0)
      @map_running_times = Hash.new(Pair1.new(0, 0))

      @type             = determine_sequence_type
      @query_offset_lst = index_the_input

      # Build the paths of the output files and folders.
      dir        = File.dirname(@opt[:input_fasta_file])
      @filename  = File.basename(@opt[:input_fasta_file])
      @yaml_path = dir
      @html_path = "#{@opt[:input_fasta_file]}.html"
      @plot_dir  = "#{@html_path}/files/json"

      # Create the 'html' directory.
      Dir.mkdir(@html_path)

      # Copy the auxiliary folders into the html folder.
      aux = File.join(File.dirname(File.expand_path(__FILE__)), '../aux/files')
      FileUtils.cp_r(aux, @html_path)
    end
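
    ##
    # Example usage (an illustrative sketch, not executed; the file name is
    # hypothetical and the option keys mirror the defaults documented above):
    #
    #   opt = { input_fasta_file: 'predictions.fa', validations: ['all'],
    #           db: 'remote', num_threads: 4, fast: false,
    #           blast_xml_file: nil, blast_tabular_file: nil,
    #           blast_tabular_options: nil, raw_sequences: nil }
    #   GeneValidator::Validation.new(opt).run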

    ##
    # Parses the BLAST output and runs the validations.
    def run
      # Run BLAST on all sequences at once when in fast mode.
      run_blast_on_the_input_file if @opt[:fast]

      if @opt[:blast_xml_file] || @opt[:blast_tabular_file]
        # Extract the raw sequences of the hits.
        extract_raw_sequences_of_blast_hits unless @opt[:raw_sequences]
        create_an_index_file_of_raw_seq_file(@opt[:raw_sequences])

        # Run the validations.
        iterator = parse_blast_output_file
        run_validations(iterator)
      else
        # Run BLAST on each sequence individually and then run the validations.
        run_blast_on_each_sequence
      end

      return unless @overall_evaluation
      Output.print_footer(@no_queries, @scores, @good_predictions,
                          @bad_predictions, @nee, @no_mafft, @no_internet,
                          @map_errors, @map_running_times, @html_path,
                          @filename)
    end

    def determine_sequence_type
      BlastUtils.guess_sequence_type_from_file(@opt[:input_fasta_file])
    end

    ##
    # Runs BLAST on the input file - only run when opt[:fast] is true.
    def run_blast_on_the_input_file
      return if @opt[:blast_xml_file] || @opt[:blast_tabular_file]
      puts 'Running BLAST'
      @opt[:blast_xml_file] = @opt[:input_fasta_file] + '.blast_xml'
      BlastUtils.run_blast_on_file(@opt)
    end

    ##
    # Extracts the raw sequences of all BLAST hits.
    def extract_raw_sequences_of_blast_hits
      puts 'Extracting sequences within the BLAST output file from the BLAST' \
           ' database'
      @opt[:raw_sequences] = GetRawSequences.run(@opt)
    end

    ##
    # Creates a list of start offsets of the queries in the FASTA file.
    # These offsets can then be used to quickly read the input file using the
    # start and end positions of each query.
    def index_the_input
      fasta_content = IO.binread(@opt[:input_fasta_file])
      offset_array  = fasta_content.enum_for(:scan, /(>[^>]+)/)
                                   .map { Regexp.last_match.begin(0) }
      offset_array.push(fasta_content.length)
      fasta_content = nil # allow the file contents to be garbage collected
      offset_array
    end
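
    ##
    # Illustration of the offsets produced by +index_the_input+ (hypothetical
    # two-record input, not executed):
    #
    #   # file contents: ">q1\nMKV\n>q2\nMLWQ\n"  (17 bytes)
    #   # index_the_input  # => [0, 8, 17]
    #   # query i spans bytes offset_array[i] .. offset_array[i + 1] - 1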

    ##
    # Indexes the raw sequences file.
    def create_an_index_file_of_raw_seq_file(raw_sequence_file)
      # Leave only the identifiers in the FASTA descriptions.
      content = File.open(raw_sequence_file, 'rb').read.gsub(/ .*/, '')
      File.open(raw_sequence_file, 'w+') { |f| f.write(content) }

      # Index the FASTA file.
      keys   = content.scan(/>(.*)\n/).flatten
      values = content.enum_for(:scan, /(>[^>]+)/)
                      .map { Regexp.last_match.begin(0) }

      # Build an index hash that maps each identifier to its [start, end]
      # byte offsets within the file.
      index_hash = {}
      keys.each_with_index do |k, i|
        start = values[i]
        endf  = (i == values.length - 1) ? content.length - 1 : values[i + 1]
        index_hash[k] = [start, endf]
      end

      # Create the FASTA index.
      @raw_seq_file_index = "#{raw_sequence_file}.idx"
      @raw_seq_file_load  = index_hash

      File.open(@raw_seq_file_index, 'w') do |f|
        YAML.dump(index_hash, f)
      end
      content = nil # allow the file contents to be garbage collected
    end

    ##
    # Returns an iterator over the BLAST output: an XML enumerator when a
    # BLAST XML file was supplied, otherwise a +TabularParser+ over the
    # tabular output file.
    def parse_blast_output_file
      if @opt[:blast_xml_file]
        Bio::BlastXMLParser::XmlIterator.new(@opt[:blast_xml_file]).to_enum
      else
        TabularParser.new(@opt[:blast_tabular_file],
                          @opt[:blast_tabular_options], @type)
      end
      ## TODO: Add a rescue statement - e.g. if unable to create the object...
    end

    ##
    # Runs BLAST on each query individually and validates each query as soon
    # as its BLAST output is available.
    def run_blast_on_each_sequence
      # Seek to the start of each query in the input file.
      @query_offset_lst[0..-2].each_with_index do |_pos, i|
        if (i + 1) >= @start_idx
          query_length = @query_offset_lst[i + 1] - @query_offset_lst[i]
          query_start  = @query_offset_lst[i]
          query = IO.binread(@opt[:input_fasta_file], query_length,
                             query_start)

          # Call BLAST with the default parameters.
          blast_type = (type == :protein) ? 'blastp' : 'blastx'
          blast_xml_output = BlastUtils.run_blast(blast_type, query, @opt[:db],
                                                  @opt[:num_threads])
          iterator = Bio::BlastXMLParser::NokogiriBlastXml
                     .new(blast_xml_output).to_enum
          run_validations(iterator)
        else
          @idx += 1
        end
      end
    end

    ##
    # Iterates over the queries, parsing the BLAST output for each and
    # scheduling the validations (on a thread pool when num_threads > 1).
    def run_validations(iterator)
      pool = Pool.new(@opt[:num_threads]) if @opt[:num_threads] > 1

      while @idx + 1 < @query_offset_lst.length
        prediction = get_info_on_each_query_sequence
        @idx += 1

        hits = parse_next_iteration(iterator, prediction)

        if hits.nil?
          @idx -= 1
          break
        end
        current_idx = @idx

        # The first validation should be treated separately.
        if current_idx == @start_idx || @opt[:num_threads] == 1
          validate(prediction, hits, current_idx)
        else
          pool.schedule(prediction, hits, current_idx) do |pred, hts, cidx|
            validate(pred, hts, cidx)
          end
        end
      end
    ensure
      pool.shutdown if @opt[:num_threads] > 1
    end

    def parse_next_iteration(iterator, prediction)
      iterator.next if @idx < @start_idx
      if @opt[:blast_xml_file]
        BlastUtils.parse_next(iterator, @type)
      elsif @opt[:blast_tabular_file]
        iterator.parse_next(prediction.identifier)
      end
    end

    ##
    # Reads the current query from the input file and returns it as a
    # +Sequence+ object.
    def get_info_on_each_query_sequence
      prediction   = Sequence.new
      query_length = @query_offset_lst[idx + 1] - @query_offset_lst[idx]
      query_start  = @query_offset_lst[idx]
      query        = IO.binread(@opt[:input_fasta_file], query_length,
                                query_start)
      parse_query  = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]

      prediction.definition     = parse_query[0].gsub("\n", '')
      prediction.identifier     = prediction.definition.gsub(/ .*/, '')
      prediction.type           = @type
      prediction.raw_sequence   = parse_query[1].gsub("\n", '')
      prediction.length_protein = prediction.raw_sequence.length
      prediction.length_protein /= 3 if @type == :nucleotide
      prediction
    end
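
    ##
    # For instance (illustrative, with a hypothetical nucleotide record
    # ">q1 putative kinase\nATGAAA\n"), +get_info_on_each_query_sequence+
    # would return a +Sequence+ with:
    #
    #   prediction.definition     # => 'q1 putative kinase'
    #   prediction.identifier     # => 'q1'
    #   prediction.raw_sequence   # => 'ATGAAA'
    #   prediction.length_protein # => 2 (6 nucleotides / 3)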

    ##
    # Validates one query and creates a validation report.
    # Params:
    # +prediction+: +Sequence+ object
    # +hits+: Array of +Sequence+ objects
    # +current_idx+: the index number of the query
    def validate(prediction, hits, current_idx)
      query_output = do_validations(prediction, hits, current_idx)
      query_output.generate_html
      query_output.print_output_file_yaml
      query_output.print_output_console

      validations = query_output.validations

      no_mafft    = 0
      no_internet = 0
      errors      = []
      validations.each do |v|
        unless v.errors.nil?
          no_mafft    += v.errors.count { |e| e == NoMafftInstallationError }
          no_internet += v.errors.count { |e| e == NoInternetError }
        end
        errors.push(v.short_header) if v.validation == :error
      end

      no_evidence = validations.all? do |v|
        v.result == :unapplicable || v.result == :warning
      end
      nee = no_evidence ? 1 : 0

      good_predictions = (query_output.overall_score >= 75) ? 1 : 0
      bad_predictions  = (query_output.overall_score >= 75) ? 0 : 1

      # Update the global counters under the mutex, as several worker threads
      # may call this method concurrently.
      @mutex_array.synchronize do
        @no_queries += 1
        @scores.push(query_output.overall_score)
        @good_predictions += good_predictions
        @bad_predictions  += bad_predictions
        @nee         += nee
        @no_mafft    += no_mafft
        @no_internet += no_internet
        errors.each { |err| @map_errors[err] += 1 }

        validations.each do |v|
          next if v.running_time == 0 || v.running_time.nil?
          next if v.validation == :unapplicable || v.validation == :error
          pair = Pair1.new(@map_running_times[v.short_header].x +
                           v.running_time,
                           @map_running_times[v.short_header].y + 1)
          @map_running_times[v.short_header] = pair
        end
      end
      query_output
    end

    ##
    # Removes identical hits.
    # An identical hit means 100% coverage and more than 99% identity.
    # Params:
    # +prediction+: +Sequence+ object
    # +hits+: Array of +Sequence+ objects
    # Output:
    # new array of hit +Sequence+ objects
    def remove_identical_hits(prediction, hits)
      identical_hits = []
      hits.each do |hit|
        # Check if all HSPs have an identity of more than 99%.
        low_identity = hit.hsp_list.select do |hsp|
          hsp.pidentity.nil? || hsp.pidentity < 99
        end

        # Check the coverage: mark every query position covered by an HSP.
        coverage = Array.new(prediction.length_protein, 0)
        hit.hsp_list.each do |hsp|
          len = hsp.match_query_to - hsp.match_query_from + 1
          coverage[hsp.match_query_from - 1..hsp.match_query_to - 1] =
            Array.new(len, 1)
        end

        if low_identity.empty? && coverage.uniq.length == 1
          identical_hits.push(hit)
        end
      end

      identical_hits.each { |hit| hits.delete(hit) }
      hits
    end
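
    ##
    # Worked example (illustrative): for a 10-residue prediction and a hit
    # with a single HSP covering query positions 1..10 at 100% identity,
    # +low_identity+ is empty and +coverage+ becomes [1] * 10, so
    # coverage.uniq.length == 1 and the hit is removed as identical.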

    ##
    # Runs all the validations and prints the outputs given the current
    # prediction query and the corresponding hits.
    # Params:
    # +prediction+: +Sequence+ object
    # +hits+: Array of +Sequence+ objects
    # +current_idx+: the index number of the query
    # Output:
    # +Output+ object
    def do_validations(prediction, hits, current_idx)
      begin
        hits = remove_identical_hits(prediction, hits)
      rescue NoPIdentError
        # Keep the full hit list when the percentage identity is unavailable.
      end

      query_output                = Output.new(@mutex, @mutex_yaml,
                                               @mutex_html, @filename,
                                               @html_path, @yaml_path,
                                               current_idx, @start_idx)
      query_output.prediction_len = prediction.length_protein
      query_output.prediction_def = prediction.definition
      query_output.nr_hits        = hits.length

      plot_path = File.join(@plot_dir, "#{@filename}_#{current_idx}")

      validations = []
      validations.push LengthClusterValidation.new(@type, prediction, hits,
                                                   plot_path)
      validations.push LengthRankValidation.new(@type, prediction, hits)
      validations.push GeneMergeValidation.new(@type, prediction, hits,
                                               plot_path)
      validations.push DuplicationValidation.new(@type, prediction, hits,
                                                 @opt[:raw_sequences],
                                                 @raw_seq_file_index,
                                                 @raw_seq_file_load,
                                                 @opt[:db], @opt[:num_threads])
      validations.push BlastReadingFrameValidation.new(@type, prediction, hits)
      validations.push OpenReadingFrameValidation.new(@type, prediction, hits,
                                                      plot_path)
      validations.push AlignmentValidation.new(@type, prediction, hits,
                                               plot_path,
                                               @opt[:raw_sequences],
                                               @raw_seq_file_index,
                                               @raw_seq_file_load,
                                               @opt[:db], @opt[:num_threads])

      validations = validations.select do |v|
        @opt[:validations].include? v.cli_name.downcase
      end

      # Check the class type of the elements in the list.
      validations.each do |v|
        fail ValidationClassError unless v.is_a? ValidationTest
      end

      # Check for alias duplication.
      aliases = validations.map(&:cli_name)
      fail AliasDuplicationError unless aliases.length == aliases.uniq.length

      validations.each do |v|
        v.run
        fail ReportClassError unless v.validation_report.is_a? ValidationReport
      end
      query_output.validations = validations.map(&:validation_report)

      fail NoValidationError if query_output.validations.empty?

      # Compute the validation score.
      compute_scores(query_output)
      query_output
    rescue ValidationClassError => error
      error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
      $stderr.print "Class Type error at #{error_line}." \
                    ' Possible cause: type of one of the validations is not' \
                    " ValidationTest\n"
      exit 1
    rescue NoValidationError => error
      error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
      $stderr.print "Validation error at #{error_line}." \
                    " Possible cause: your -v arguments are not valid aliases\n"
      exit 1
    rescue ReportClassError => error
      error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
      $stderr.print "Class Type error at #{error_line}." \
                    ' Possible cause: type of one of the validation reports' \
                    " returned by the 'run' method is not ValidationReport\n"
      exit 1
    rescue AliasDuplicationError => error
      error_line = error.backtrace[0].scan(%r{/([^/]+:\d+):.*})[0][0]
      $stderr.print "Alias Duplication error at #{error_line}." \
                    ' Possible cause: At least two validations have the same' \
                    " CLI alias\n"
      exit 1
    end

    ##
    # Computes the overall score of a query from its individual validation
    # results.
    def compute_scores(query_output)
      validations = query_output.validations
      successes = validations.map { |v| v.result == v.expected }.count(true)

      fails = validations.map do |v|
        v.validation != :unapplicable && v.validation != :error &&
          v.result != v.expected
      end.count(true)

      # The length-cluster and length-rank validations both assess the
      # sequence length, so their results are merged into a single success
      # or fail.
      lcv = validations.select { |v| v.class == LengthClusterValidationOutput }
      lrv = validations.select { |v| v.class == LengthRankValidationOutput }
      if lcv.length == 1 && lrv.length == 1
        score_lcv = (lcv[0].result == lcv[0].expected)
        score_lrv = (lrv[0].result == lrv[0].expected)
        if score_lcv == true && score_lrv == true
          # If both are true, they are counted as a single success.
          successes -= 1
        elsif score_lcv == false && score_lrv == false
          # If both are false, they are counted as a single fail.
          fails -= 1
        else
          successes -= 0.5
          fails -= 0.5
        end
      end
      query_output.successes = successes
      query_output.fails = fails
      total_query = successes.to_i + fails
      query_output.overall_score = (successes * 100 / total_query).round(0)
    end
  end
end