validation_duplication.rb in genevalidator-2.1.3

- old
+ new

@@ -17,11 +17,13 @@
     attr_reader :threshold
     attr_reader :result
 
     def initialize(short_header, header, description, pvalue, averages,
                    threshold = 0.05, expected = :yes)
-      @short_header, @header, @description = short_header, header, description
+      @short_header = short_header
+      @header = header
+      @description = description
       @pvalue      = pvalue
       @threshold   = threshold
       @result      = validation
       @expected    = expected
       @average     = averages.mean
@@ -34,11 +36,11 @@
       @conclusion  = conclude
     end
 
     def explain
       "The Wilcoxon test produced a p-value of #{prettify_evalue(@pvalue)}" \
-      "#{(@result == :no) ? " (average = #{@average.round(2)})." : '.'}"
+      "#{@result == :no ? " (average = #{@average.round(2)})." : '.'}"
     end
 
     def conclude
       if @result == :yes
         'This suggests that the query sequence contains no erroneous' \
@@ -48,19 +50,19 @@
         ' sequence is likely repeated more than once.'
       end
     end
 
     def print
-      "#{@pvalue.round(2)}"
+      @pvalue.round(2).to_s
     end
 
     def validation
-      (@pvalue > @threshold) ? :yes : :no
+      @pvalue > @threshold ? :yes : :no
     end
 
     def color
-      (validation == :yes) ? 'success' : 'danger'
+      validation == :yes ? 'success' : 'danger'
     end
 
     private
 
     # Copied from SequenceServer
@@ -95,46 +97,38 @@
       @cli_name          = 'dup'
       @raw_seq_file      = opt[:raw_sequences]
       @index_file_name   = config[:raw_seq_file_index]
       @raw_seq_file_load = config[:raw_seq_file_load]
       @db                = opt[:db]
-      @num_threads       = opt[:num_threads]
+      @num_threads       = opt[:mafft_threads]
       @type              = config[:type]
     end
 
-    def in_range?(ranges, idx)
-      ranges.each do |range|
-        return (range.member?(idx)) ? true : false
-      end
-      false
-    end
-
     ##
     # Check duplication in the first n hits
     # Output:
     # +DuplicationValidationOutput+ object
     def run(n = 10)
-      fail NotEnoughHitsError unless hits.length >= 5
-      fail unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? &&
-                  hits[0].is_a?(Query)
+      raise NotEnoughHitsError if hits.length < opt[:min_blast_hits]
+      raise unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? &&
+                   hits[0].is_a?(Query)
 
       start = Time.new
       # get the first n hits
-      less_hits = @hits[0..[n - 1, @hits.length].min]
-      useless_hits = []
+      n_hits = [n - 1, @hits.length].min
+      less_hits = @hits[0..n_hits]
       # get raw sequences for less_hits
-      less_hits.map do |hit|
-        next unless hit.raw_sequence.nil?
-        hit.raw_sequence = FetchRawSequences.run(hit.identifier,
-                                                 hit.accession_no)
-        useless_hits.push(hit) if hit.raw_sequence.nil?
+      less_hits.delete_if do |hit|
+        if hit.raw_sequence.nil?
+          hit.raw_sequence = FetchRawSequences.run(hit.identifier,
+                                                   hit.accession_no)
+        end
+        hit.raw_sequence.nil? ? true : false
       end
 
-      useless_hits.each { |hit| less_hits.delete(hit) }
+      raise NoInternetError if less_hits.length.zero?
 
-      fail NoInternetError if less_hits.length == 0
-
       averages = []
 
       less_hits.each do |hit|
         coverage = Array.new(hit.length_protein, 0)
         # each residue of the protein should be evluated once only
@@ -144,66 +138,21 @@
           # align subsequences from the hit and prediction that match
           if !hsp.hit_alignment.nil? && !hsp.query_alignment.nil?
             hit_alignment   = hsp.hit_alignment
             query_alignment = hsp.query_alignment
           else
-            # indexing in blast starts from 1
-            hit_local   = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1]
-            query_local = prediction.raw_sequence[hsp.match_query_from -
-                                                  1..hsp.match_query_to - 1]
-
-            # in case of nucleotide prediction sequence translate into protein
-            # use translate with reading frame 1 because
-            # to/from coordinates of the hsp already correspond to the
-            # reading frame in which the prediction was read to match this hsp
-            if @type == :nucleotide
-              s = Bio::Sequence::NA.new(query_local)
-              query_local = s.translate
-            end
-
-            # local alignment for hit and query
-            seqs = [hit_local, query_local]
-
-            begin
-              options   = ['--maxiterate', '1000', '--localpair', '--anysymbol',
-                           '--quiet', '--thread', "#{@num_threads}"]
-              mafft     = Bio::MAFFT.new('mafft', options)
-
-              report    = mafft.query_align(seqs)
-              raw_align = report.alignment
-              align     = []
-
-              raw_align.each { |seq| align.push(seq.to_s) }
-              hit_alignment   = align[0]
-              query_alignment = align[1]
-            rescue
-              raise NoMafftInstallationError
-            end
+            align = find_local_alignment(hit, prediction, hsp)
+            hit_alignment   = align[0]
+            query_alignment = align[1]
           end
 
-          # check multiple coverage
+          coverage = check_multiple_coverage(hit_alignment, query_alignment,
+                                             hsp, coverage, ranges_prediction)
 
-          # for each hsp of the curent hit
-          # iterate through the alignment and count the matching residues
-          [*(0..hit_alignment.length - 1)].each do |i|
-            residue_hit   = hit_alignment[i]
-            residue_query = query_alignment[i]
-            next if residue_hit == ' ' || residue_hit == '+' ||
-                    residue_hit == '-' || residue_hit != residue_query
-            # indexing in blast starts from 1
-            idx_hit   = i + (hsp.hit_from - 1) -
-                        hit_alignment[0..i].scan(/-/).length
-            idx_query = i + (hsp.match_query_from - 1) -
-                        query_alignment[0..i].scan(/-/).length
-            unless in_range?(ranges_prediction, idx_query)
-              coverage[idx_hit] += 1
-            end
-          end
-
-          ranges_prediction.push((hsp.match_query_from..hsp.match_query_to))
+          ranges_prediction << (hsp.match_query_from..hsp.match_query_to)
         end
-        overlap = coverage.reject { |x| x == 0 }
+        overlap = coverage.reject(&:zero?)
         if overlap != []
           averages.push((overlap.inject(:+) / (overlap.length + 0.0)).round(2))
         end
       end
 
@@ -223,11 +172,10 @@
                                                            @header,
                                                            @description, pval,
                                                            averages)
       @run_time = Time.now - start
       @validation_report
-
     rescue NotEnoughHitsError
       @validation_report = ValidationReport.new('Not enough evidence', :warning,
                                                 @short_header, @header,
                                                 @description)
     rescue NoMafftInstallationError
@@ -238,24 +186,77 @@
     rescue NoInternetError
       @validation_report = ValidationReport.new('Internet error', :error,
                                                 @short_header, @header,
                                                 @description)
       @validation_report.errors.push NoInternetError
-    rescue
+    rescue StandardError
       @validation_report = ValidationReport.new('Unexpected error', :error,
                                                 @short_header, @header,
                                                 @description)
       @validation_report.errors.push 'Unexpected Error'
     end
 
+    # Only run if the BLAST output does not contain hit alignmment
+    def find_local_alignment(hit, prediction, hsp)
+      # indexing in blast starts from 1
+      hit_local   = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1]
+      query_local = prediction.raw_sequence[hsp.match_query_from -
+                                            1..hsp.match_query_to - 1]
+
+      # in case of nucleotide prediction sequence translate into protein
+      # use translate with reading frame 1 because
+      # to/from coordinates of the hsp already correspond to the
+      # reading frame in which the prediction was read to match this hsp
+      if @type == :nucleotide
+        s = Bio::Sequence::NA.new(query_local)
+        query_local = s.translate
+      end
+
+      opt = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet',
+             '--thread', @num_threads.to_s]
+      mafft = Bio::MAFFT.new('mafft', opt)
+
+      # local alignment for hit and query
+      seqs = [hit_local, query_local]
+      report = mafft.query_align(seqs)
+      report.alignment.map(&:to_s)
+    rescue StandardError
+      raise NoMafftInstallationError
+    end
+
+    def check_multiple_coverage(hit_alignment, query_alignment, hsp, coverage,
+                                ranges_prediction)
+      # for each hsp of the curent hit
+      # iterate through the alignment and count the matching residues
+      [*(0..hit_alignment.length - 1)].each do |i|
+        residue_hit   = hit_alignment[i]
+        residue_query = query_alignment[i]
+        next if [' ', '+', '-'].include?(residue_hit)
+        next if residue_hit != residue_query
+        # indexing in blast starts from 1
+        idx_hit   = i + (hsp.hit_from - 1) -
+                    hit_alignment[0..i].scan(/-/).length
+        idx_query = i + (hsp.match_query_from - 1) -
+                    query_alignment[0..i].scan(/-/).length
+        coverage[idx_hit] += 1 unless in_range?(ranges_prediction, idx_query)
+      end
+      coverage
+    end
+
+    def in_range?(ranges, idx)
+      ranges.each { |range| return true if range.member?(idx) }
+      false
+    end
+
     ##
     # wilcox test implementation from statsample ruby gem
     # many thanks to Claudio for helping us with the implementation!
     def wilcox_test(averages)
-      wilcox = Statsample::Test.wilcoxon_signed_rank(Daru::Vector.new(averages),
-                                                     Daru::Vector.new(Array.new(averages.length,
-                                                               1)))
+      wilcox = Statsample::Test.wilcoxon_signed_rank(
+        Daru::Vector.new(averages),
+        Daru::Vector.new(Array.new(averages.length, 1))
+      )
 
-      (averages.length < 15) ? wilcox.probability_exact : wilcox.probability_z
+      averages.length < 15 ? wilcox.probability_exact : wilcox.probability_z
     end
   end
 end