lib/genevalidator/validation_duplication.rb in genevalidator-1.6.12 vs lib/genevalidator/validation_duplication.rb in genevalidator-2.1.3

- old
+ new

@@ -17,11 +17,13 @@ attr_reader :threshold attr_reader :result def initialize(short_header, header, description, pvalue, averages, threshold = 0.05, expected = :yes) - @short_header, @header, @description = short_header, header, description + @short_header = short_header + @header = header + @description = description @pvalue = pvalue @threshold = threshold @result = validation @expected = expected @average = averages.mean @@ -34,11 +36,11 @@ @conclusion = conclude end def explain "The Wilcoxon test produced a p-value of #{prettify_evalue(@pvalue)}" \ - "#{(@result == :no) ? " (average = #{@average.round(2)})." : '.'}" + "#{@result == :no ? " (average = #{@average.round(2)})." : '.'}" end def conclude if @result == :yes 'This suggests that the query sequence contains no erroneous' \ @@ -48,19 +50,19 @@ ' sequence is likely repeated more than once.' end end def print - "#{@pvalue.round(2)}" + @pvalue.round(2).to_s end def validation - (@pvalue > @threshold) ? :yes : :no + @pvalue > @threshold ? :yes : :no end def color - (validation == :yes) ? 'success' : 'danger' + validation == :yes ? 'success' : 'danger' end private # Copied from SequenceServer @@ -95,46 +97,38 @@ @cli_name = 'dup' @raw_seq_file = opt[:raw_sequences] @index_file_name = config[:raw_seq_file_index] @raw_seq_file_load = config[:raw_seq_file_load] @db = opt[:db] - @num_threads = opt[:num_threads] + @num_threads = opt[:mafft_threads] @type = config[:type] end - def in_range?(ranges, idx) - ranges.each do |range| - return (range.member?(idx)) ? true : false - end - false - end - ## # Check duplication in the first n hits # Output: # +DuplicationValidationOutput+ object def run(n = 10) - fail NotEnoughHitsError unless hits.length >= 5 - fail unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? && - hits[0].is_a?(Query) + raise NotEnoughHitsError if hits.length < opt[:min_blast_hits] + raise unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? && + hits[0].is_a?(Query) start = Time.new # get the first n hits - less_hits = @hits[0..[n - 1, @hits.length].min] - useless_hits = [] + n_hits = [n - 1, @hits.length].min + less_hits = @hits[0..n_hits] # get raw sequences for less_hits - less_hits.map do |hit| - next unless hit.raw_sequence.nil? - hit.raw_sequence = FetchRawSequences.run(hit.identifier, - hit.accession_no) - useless_hits.push(hit) if hit.raw_sequence.nil? + less_hits.delete_if do |hit| + if hit.raw_sequence.nil? + hit.raw_sequence = FetchRawSequences.run(hit.identifier, + hit.accession_no) + end + hit.raw_sequence.nil? ? true : false end - useless_hits.each { |hit| less_hits.delete(hit) } + raise NoInternetError if less_hits.length.zero? - fail NoInternetError if less_hits.length == 0 - averages = [] less_hits.each do |hit| coverage = Array.new(hit.length_protein, 0) # each residue of the protein should be evluated once only @@ -144,66 +138,21 @@ # align subsequences from the hit and prediction that match if !hsp.hit_alignment.nil? && !hsp.query_alignment.nil? hit_alignment = hsp.hit_alignment query_alignment = hsp.query_alignment else - # indexing in blast starts from 1 - hit_local = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1] - query_local = prediction.raw_sequence[hsp.match_query_from - - 1..hsp.match_query_to - 1] - - # in case of nucleotide prediction sequence translate into protein - # use translate with reading frame 1 because - # to/from coordinates of the hsp already correspond to the - # reading frame in which the prediction was read to match this hsp - if @type == :nucleotide - s = Bio::Sequence::NA.new(query_local) - query_local = s.translate - end - - # local alignment for hit and query - seqs = [hit_local, query_local] - - begin - options = ['--maxiterate', '1000', '--localpair', '--anysymbol', - '--quiet', '--thread', "#{@num_threads}"] - mafft = Bio::MAFFT.new('mafft', options) - - report = mafft.query_align(seqs) - raw_align = report.alignment - align = [] - - raw_align.each { |seq| align.push(seq.to_s) } - hit_alignment = align[0] - query_alignment = align[1] - rescue - raise NoMafftInstallationError - end + align = find_local_alignment(hit, prediction, hsp) + hit_alignment = align[0] + query_alignment = align[1] end - # check multiple coverage + coverage = check_multiple_coverage(hit_alignment, query_alignment, + hsp, coverage, ranges_prediction) - # for each hsp of the curent hit - # iterate through the alignment and count the matching residues - [*(0..hit_alignment.length - 1)].each do |i| - residue_hit = hit_alignment[i] - residue_query = query_alignment[i] - next if residue_hit == ' ' || residue_hit == '+' || - residue_hit == '-' || residue_hit != residue_query - # indexing in blast starts from 1 - idx_hit = i + (hsp.hit_from - 1) - - hit_alignment[0..i].scan(/-/).length - idx_query = i + (hsp.match_query_from - 1) - - query_alignment[0..i].scan(/-/).length - unless in_range?(ranges_prediction, idx_query) - coverage[idx_hit] += 1 - end - end - - ranges_prediction.push((hsp.match_query_from..hsp.match_query_to)) + ranges_prediction << (hsp.match_query_from..hsp.match_query_to) end - overlap = coverage.reject { |x| x == 0 } + overlap = coverage.reject(&:zero?) if overlap != [] averages.push((overlap.inject(:+) / (overlap.length + 0.0)).round(2)) end end @@ -223,11 +172,10 @@ @header, @description, pval, averages) @run_time = Time.now - start @validation_report - rescue NotEnoughHitsError @validation_report = ValidationReport.new('Not enough evidence', :warning, @short_header, @header, @description) rescue NoMafftInstallationError @@ -238,24 +186,77 @@ rescue NoInternetError @validation_report = ValidationReport.new('Internet error', :error, @short_header, @header, @description) @validation_report.errors.push NoInternetError - rescue + rescue StandardError @validation_report = ValidationReport.new('Unexpected error', :error, @short_header, @header, @description) @validation_report.errors.push 'Unexpected Error' end + # Only run if the BLAST output does not contain hit alignmment + def find_local_alignment(hit, prediction, hsp) + # indexing in blast starts from 1 + hit_local = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1] + query_local = prediction.raw_sequence[hsp.match_query_from - + 1..hsp.match_query_to - 1] + + # in case of nucleotide prediction sequence translate into protein + # use translate with reading frame 1 because + # to/from coordinates of the hsp already correspond to the + # reading frame in which the prediction was read to match this hsp + if @type == :nucleotide + s = Bio::Sequence::NA.new(query_local) + query_local = s.translate + end + + opt = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet', + '--thread', @num_threads.to_s] + mafft = Bio::MAFFT.new('mafft', opt) + + # local alignment for hit and query + seqs = [hit_local, query_local] + report = mafft.query_align(seqs) + report.alignment.map(&:to_s) + rescue StandardError + raise NoMafftInstallationError + end + + def check_multiple_coverage(hit_alignment, query_alignment, hsp, coverage, + ranges_prediction) + # for each hsp of the curent hit + # iterate through the alignment and count the matching residues + [*(0..hit_alignment.length - 1)].each do |i| + residue_hit = hit_alignment[i] + residue_query = query_alignment[i] + next if [' ', '+', '-'].include?(residue_hit) + next if residue_hit != residue_query + # indexing in blast starts from 1 + idx_hit = i + (hsp.hit_from - 1) - + hit_alignment[0..i].scan(/-/).length + idx_query = i + (hsp.match_query_from - 1) - + query_alignment[0..i].scan(/-/).length + coverage[idx_hit] += 1 unless in_range?(ranges_prediction, idx_query) + end + coverage + end + + def in_range?(ranges, idx) + ranges.each { |range| return true if range.member?(idx) } + false + end + ## # wilcox test implementation from statsample ruby gem # many thanks to Claudio for helping us with the implementation! def wilcox_test(averages) - wilcox = Statsample::Test.wilcoxon_signed_rank(Daru::Vector.new(averages), - Daru::Vector.new(Array.new(averages.length, - 1))) + wilcox = Statsample::Test.wilcoxon_signed_rank( + Daru::Vector.new(averages), + Daru::Vector.new(Array.new(averages.length, 1)) + ) - (averages.length < 15) ? wilcox.probability_exact : wilcox.probability_z + averages.length < 15 ? wilcox.probability_exact : wilcox.probability_z end end end