lib/genevalidator/validation_duplication.rb in genevalidator-1.6.12 vs lib/genevalidator/validation_duplication.rb in genevalidator-2.1.3
- old
+ new
@@ -17,11 +17,13 @@
attr_reader :threshold
attr_reader :result
def initialize(short_header, header, description, pvalue, averages,
threshold = 0.05, expected = :yes)
- @short_header, @header, @description = short_header, header, description
+ @short_header = short_header
+ @header = header
+ @description = description
@pvalue = pvalue
@threshold = threshold
@result = validation
@expected = expected
@average = averages.mean
@@ -34,11 +36,11 @@
@conclusion = conclude
end
def explain
"The Wilcoxon test produced a p-value of #{prettify_evalue(@pvalue)}" \
- "#{(@result == :no) ? " (average = #{@average.round(2)})." : '.'}"
+ "#{@result == :no ? " (average = #{@average.round(2)})." : '.'}"
end
def conclude
if @result == :yes
'This suggests that the query sequence contains no erroneous' \
@@ -48,19 +50,19 @@
' sequence is likely repeated more than once.'
end
end
def print
- "#{@pvalue.round(2)}"
+ @pvalue.round(2).to_s
end
def validation
- (@pvalue > @threshold) ? :yes : :no
+ @pvalue > @threshold ? :yes : :no
end
def color
- (validation == :yes) ? 'success' : 'danger'
+ validation == :yes ? 'success' : 'danger'
end
private
# Copied from SequenceServer
@@ -95,46 +97,38 @@
@cli_name = 'dup'
@raw_seq_file = opt[:raw_sequences]
@index_file_name = config[:raw_seq_file_index]
@raw_seq_file_load = config[:raw_seq_file_load]
@db = opt[:db]
- @num_threads = opt[:num_threads]
+ @num_threads = opt[:mafft_threads]
@type = config[:type]
end
- def in_range?(ranges, idx)
- ranges.each do |range|
- return (range.member?(idx)) ? true : false
- end
- false
- end
-
##
# Check duplication in the first n hits
# Output:
# +DuplicationValidationOutput+ object
def run(n = 10)
- fail NotEnoughHitsError unless hits.length >= 5
- fail unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? &&
- hits[0].is_a?(Query)
+ raise NotEnoughHitsError if hits.length < opt[:min_blast_hits]
+ raise unless prediction.is_a?(Query) && !prediction.raw_sequence.nil? &&
+ hits[0].is_a?(Query)
start = Time.new
# get the first n hits
- less_hits = @hits[0..[n - 1, @hits.length].min]
- useless_hits = []
+ n_hits = [n - 1, @hits.length].min
+ less_hits = @hits[0..n_hits]
# get raw sequences for less_hits
- less_hits.map do |hit|
- next unless hit.raw_sequence.nil?
- hit.raw_sequence = FetchRawSequences.run(hit.identifier,
- hit.accession_no)
- useless_hits.push(hit) if hit.raw_sequence.nil?
+ less_hits.delete_if do |hit|
+ if hit.raw_sequence.nil?
+ hit.raw_sequence = FetchRawSequences.run(hit.identifier,
+ hit.accession_no)
+ end
+ hit.raw_sequence.nil? ? true : false
end
- useless_hits.each { |hit| less_hits.delete(hit) }
+ raise NoInternetError if less_hits.length.zero?
- fail NoInternetError if less_hits.length == 0
-
averages = []
less_hits.each do |hit|
coverage = Array.new(hit.length_protein, 0)
# each residue of the protein should be evluated once only
@@ -144,66 +138,21 @@
# align subsequences from the hit and prediction that match
if !hsp.hit_alignment.nil? && !hsp.query_alignment.nil?
hit_alignment = hsp.hit_alignment
query_alignment = hsp.query_alignment
else
- # indexing in blast starts from 1
- hit_local = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1]
- query_local = prediction.raw_sequence[hsp.match_query_from -
- 1..hsp.match_query_to - 1]
-
- # in case of nucleotide prediction sequence translate into protein
- # use translate with reading frame 1 because
- # to/from coordinates of the hsp already correspond to the
- # reading frame in which the prediction was read to match this hsp
- if @type == :nucleotide
- s = Bio::Sequence::NA.new(query_local)
- query_local = s.translate
- end
-
- # local alignment for hit and query
- seqs = [hit_local, query_local]
-
- begin
- options = ['--maxiterate', '1000', '--localpair', '--anysymbol',
- '--quiet', '--thread', "#{@num_threads}"]
- mafft = Bio::MAFFT.new('mafft', options)
-
- report = mafft.query_align(seqs)
- raw_align = report.alignment
- align = []
-
- raw_align.each { |seq| align.push(seq.to_s) }
- hit_alignment = align[0]
- query_alignment = align[1]
- rescue
- raise NoMafftInstallationError
- end
+ align = find_local_alignment(hit, prediction, hsp)
+ hit_alignment = align[0]
+ query_alignment = align[1]
end
- # check multiple coverage
+ coverage = check_multiple_coverage(hit_alignment, query_alignment,
+ hsp, coverage, ranges_prediction)
- # for each hsp of the curent hit
- # iterate through the alignment and count the matching residues
- [*(0..hit_alignment.length - 1)].each do |i|
- residue_hit = hit_alignment[i]
- residue_query = query_alignment[i]
- next if residue_hit == ' ' || residue_hit == '+' ||
- residue_hit == '-' || residue_hit != residue_query
- # indexing in blast starts from 1
- idx_hit = i + (hsp.hit_from - 1) -
- hit_alignment[0..i].scan(/-/).length
- idx_query = i + (hsp.match_query_from - 1) -
- query_alignment[0..i].scan(/-/).length
- unless in_range?(ranges_prediction, idx_query)
- coverage[idx_hit] += 1
- end
- end
-
- ranges_prediction.push((hsp.match_query_from..hsp.match_query_to))
+ ranges_prediction << (hsp.match_query_from..hsp.match_query_to)
end
- overlap = coverage.reject { |x| x == 0 }
+ overlap = coverage.reject(&:zero?)
if overlap != []
averages.push((overlap.inject(:+) / (overlap.length + 0.0)).round(2))
end
end
@@ -223,11 +172,10 @@
@header,
@description, pval,
averages)
@run_time = Time.now - start
@validation_report
-
rescue NotEnoughHitsError
@validation_report = ValidationReport.new('Not enough evidence', :warning,
@short_header, @header,
@description)
rescue NoMafftInstallationError
@@ -238,24 +186,77 @@
rescue NoInternetError
@validation_report = ValidationReport.new('Internet error', :error,
@short_header, @header,
@description)
@validation_report.errors.push NoInternetError
- rescue
+ rescue StandardError
@validation_report = ValidationReport.new('Unexpected error', :error,
@short_header, @header,
@description)
@validation_report.errors.push 'Unexpected Error'
end
+ # Only run if the BLAST output does not contain hit alignmment
+ def find_local_alignment(hit, prediction, hsp)
+ # indexing in blast starts from 1
+ hit_local = hit.raw_sequence[hsp.hit_from - 1..hsp.hit_to - 1]
+ query_local = prediction.raw_sequence[hsp.match_query_from -
+ 1..hsp.match_query_to - 1]
+
+ # in case of nucleotide prediction sequence translate into protein
+ # use translate with reading frame 1 because
+ # to/from coordinates of the hsp already correspond to the
+ # reading frame in which the prediction was read to match this hsp
+ if @type == :nucleotide
+ s = Bio::Sequence::NA.new(query_local)
+ query_local = s.translate
+ end
+
+ opt = ['--maxiterate', '1000', '--localpair', '--anysymbol', '--quiet',
+ '--thread', @num_threads.to_s]
+ mafft = Bio::MAFFT.new('mafft', opt)
+
+ # local alignment for hit and query
+ seqs = [hit_local, query_local]
+ report = mafft.query_align(seqs)
+ report.alignment.map(&:to_s)
+ rescue StandardError
+ raise NoMafftInstallationError
+ end
+
+ def check_multiple_coverage(hit_alignment, query_alignment, hsp, coverage,
+ ranges_prediction)
+ # for each hsp of the curent hit
+ # iterate through the alignment and count the matching residues
+ [*(0..hit_alignment.length - 1)].each do |i|
+ residue_hit = hit_alignment[i]
+ residue_query = query_alignment[i]
+ next if [' ', '+', '-'].include?(residue_hit)
+ next if residue_hit != residue_query
+ # indexing in blast starts from 1
+ idx_hit = i + (hsp.hit_from - 1) -
+ hit_alignment[0..i].scan(/-/).length
+ idx_query = i + (hsp.match_query_from - 1) -
+ query_alignment[0..i].scan(/-/).length
+ coverage[idx_hit] += 1 unless in_range?(ranges_prediction, idx_query)
+ end
+ coverage
+ end
+
+ def in_range?(ranges, idx)
+ ranges.each { |range| return true if range.member?(idx) }
+ false
+ end
+
##
# wilcox test implementation from statsample ruby gem
# many thanks to Claudio for helping us with the implementation!
def wilcox_test(averages)
- wilcox = Statsample::Test.wilcoxon_signed_rank(Daru::Vector.new(averages),
- Daru::Vector.new(Array.new(averages.length,
- 1)))
+ wilcox = Statsample::Test.wilcoxon_signed_rank(
+ Daru::Vector.new(averages),
+ Daru::Vector.new(Array.new(averages.length, 1))
+ )
- (averages.length < 15) ? wilcox.probability_exact : wilcox.probability_z
+ averages.length < 15 ? wilcox.probability_exact : wilcox.probability_z
end
end
end