Class: DuplicationValidation

Inherits:
ValidationTest show all
Defined in:
lib/genevalidator/validation_duplication.rb

Overview

This class contains the methods necessary for finding duplicated subsequences in the predicted gene

Instance Attribute Summary (collapse)

Attributes inherited from ValidationTest

#cli_name, #description, #header, #hits, #prediction, #running_time, #short_header, #type, #validation_report

Instance Method Summary (collapse)

Constructor Details

- (DuplicationValidation) initialize(type, prediction, hits, mafft_path, index_file_name)

Returns a new instance of DuplicationValidation



47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/genevalidator/validation_duplication.rb', line 47

def initialize(type, prediction, hits, mafft_path, index_file_name)
  super
  @mafft_path      = mafft_path
  @index_file_name = index_file_name
  @short_header    = "Duplication"
  @header          = "Duplication"
  @description = "Check whether there is a duplicated subsequence in the"<<
  " predicted gene by counting the hsp residue coverag of the prediction,"<<
  " for each hit. Meaning of the output displayed: P-value of the Wilcoxon"<<
  " test which test the distribution of hit average coverage against 1."<<
  " P-values higher than 5% pass the validation test."
  @cli_name        = "dup"

end

Instance Attribute Details

- (Object) index_file_name (readonly)

Returns the value of attribute index_file_name



45
46
47
# File 'lib/genevalidator/validation_duplication.rb', line 45

def index_file_name
  @index_file_name
end

- (Object) mafft_path (readonly)

Returns the value of attribute mafft_path



44
45
46
# File 'lib/genevalidator/validation_duplication.rb', line 44

def mafft_path
  @mafft_path
end

Instance Method Details

- (Object) run(n = 10)

Check duplication in the first n hits Output: DuplicationValidationOutput object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# File 'lib/genevalidator/validation_duplication.rb', line 66

def run(n=10)    
  begin
    raise NotEnoughHitsError unless hits.length >= 5
    raise Exception unless prediction.is_a? Sequence and
                           prediction.raw_sequence != nil and 
                           hits[0].is_a? Sequence 

    start = Time.new
    # get the first n hits
    less_hits = @hits[0..[n-1,@hits.length].min]
    useless_hits = []
    begin
      # get raw sequences for less_hits
      less_hits.map do |hit|
        #get gene by accession number
        if hit.raw_sequence == nil
          #hit.get_sequence_from_index_file(@index_file_name, hit.identifier)
          if hit.type == :protein
            hit.get_sequence_by_accession_no(hit.accession_no, "protein")
          else
            hit.get_sequence_by_accession_no(hit.accession_no, "nucleotide")
          end
          if hit.raw_sequence == ""
            useless_hits.push(hit)
          end
        end
      end
      rescue Exception => error
        raise NoInternetError
    end
    useless_hits.each{|hit| less_hits.delete(hit)}

    if less_hits.length == 0 
      raise NoInternetError
    end

    averages = []

    less_hits.each do |hit|
      coverage = Array.new(hit.length_protein,0)
      hit.hsp_list.each do |hsp|
      # align subsequences from the hit and prediction that match (if it's the case)
        if hsp.hit_alignment != nil and hsp.query_alignment != nil
          hit_alignment   = hsp.hit_alignment
          query_alignment = hsp.query_alignment
        else
          # indexing in blast starts from 1
          hit_local = hit.raw_sequence[hsp.hit_from-1..hsp.hit_to-1]
          query_local = prediction.raw_sequence[hsp.match_query_from-1..hsp.match_query_to-1]

          # in case of nucleotide prediction sequence translate into protein
          # use translate with reading frame 1 because 
          # to/from coordinates of the hsp already correspond to the 
          # reading frame in which the prediction was read to match this hsp 
          if @type == :nucleotide
            s = Bio::Sequence::NA.new(query_local)
            query_local = s.translate
          end

          # local alignment for hit and query
          seqs = [hit_local, query_local]

          begin
            options   = ['--maxiterate', '1000', '--localpair', '--quiet']
            mafft     = Bio::MAFFT.new(@mafft_path, options)
            report    = mafft.query_align(seqs)
            raw_align = report.alignment
            align     = []

            raw_align.each { |s| align.push(s.to_s) }
            hit_alignment   = align[0]
            query_alignment = align[1]
          rescue Exception => error                
            raise NoMafftInstallationError
          end
        end 

        # check multiple coverage

        # for each hsp of the curent hit
        # iterate through the alignment and count the matching residues
        [*(0 .. hit_alignment.length-1)].each do |i|
          residue_hit = hit_alignment[i]
          residue_query = query_alignment[i]
          if residue_hit != ' ' and residue_hit != '+' and residue_hit != '-'
            if residue_hit == residue_query             
              # indexing in blast starts from 1
              idx = i + (hsp.hit_from-1) - hit_alignment[0..i].scan(/-/).length 
                coverage[idx] += 1
              #end
            end
          end
        end
      end
      overlap = coverage.reject{|x| x==0}
      if overlap != []
        averages.push(overlap.inject(:+)/(overlap.length + 0.0)).map{|x| x.round(2)}
      end
    end
  
    # if all hsps match only one time
    if averages.reject{|x| x==1} == []
      @validation_report = DuplicationValidationOutput.new(1)
      @running_time = Time.now - start
      return @validation_report
    end
    
    pval = wilcox_test(averages)

    @validation_report = DuplicationValidationOutput.new(pval)        
    @running_time = Time.now - start
    return @validation_report

  rescue  NotEnoughHitsError => error
    @validation_report = ValidationReport.new("Not enough evidence", :warning)
    return @validation_report
  rescue NoMafftInstallationError
    @validation_report = ValidationReport.new("Mafft error", :error)
    @validation_report.errors.push NoMafftInstallationError                          
    return @validation_report
  rescue NoInternetError
    @validation_report = ValidationReport.new("Internet error", :error)
    @validation_report.errors.push NoInternetError
    return @validation_report
  rescue Exception => error
    puts error.backtrace
    @validation_report.errors.push OtherError
    @validation_report = ValidationReport.new("Unexpected error", :error)
    return @validation_report
  end
end

- (Object) wilcox_test(averages)

wilcox test implementation from statsample ruby gem many thanks to Claudio for helping us with the implementation!



201
202
203
204
205
206
207
208
209
# File 'lib/genevalidator/validation_duplication.rb', line 201

def wilcox_test (averages)
   require 'statsample'
   wilcox = Statsample::Test.wilcoxon_signed_rank(averages.to_scale, Array.new(averages.length,1).to_scale)
   if averages.length < 15
     return wilcox.probability_exact
   else
     return wilcox.probability_z
   end
end

- (Object) wilcox_test_R(averages)

Calls R to calculate the p value for the wilcoxon-test Input vector Array of values with nonparametric distribution



216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# File 'lib/genevalidator/validation_duplication.rb', line 216

def wilcox_test_R (averages)
  require 'rinruby'
  begin
    original_stdout = $stdout
    original_stderr = $stderr

    $stdout = File.new('/dev/null', 'w')
    $stderr = File.new('/dev/null', 'w')

    R.echo "enable = nil, stderr = nil, warn = nil"
    #make the wilcox-test and get the p-value
    R.eval("coverageDistrib = c#{averages.to_s.gsub('[','(').gsub(']',')')}")
    R.eval("coverageDistrib = c#{averages.to_s.gsub('[','(').gsub(']',')')}")
    R. eval("pval = wilcox.test(coverageDistrib - 1)$p.value")

    pval = R.pull "pval"
    $stdout = original_stdout
    $stderr = original_stderr

    return pval
  rescue Exception => error
    #return nil
  end
end