Class: Validation

Inherits:
Object
  • Object
show all
Defined in:
lib/genevalidator/validation.rb

Instance Attribute Summary (collapse)

Instance Method Summary (collapse)

Constructor Details

- (Validation) initialize(fasta_filepath, vlist = ["all"], tabular_format = nil, xml_file = nil, raw_seq_file = nil, mafft_path = nil, start_idx = 1)

Initilizes the object Params: fasta_filepath: fasta file with query sequences vlist: list of validations tabular_format: list of column names for parsing the tablar blast output xml_file: name of the precalculated blast xml output (used in 'skip blast' case) mafft_path: path of the MAFFT program installation start_idx: number of the sequence from the file to start with



50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'lib/genevalidator/validation.rb', line 50

def initialize( fasta_filepath, 
                vlist = ["all"], 
                tabular_format = nil, 
                xml_file = nil, 
                raw_seq_file = nil,
                mafft_path = nil, 
                start_idx = 1)
  begin

    @fasta_filepath = fasta_filepath
    @xml_file = xml_file
    @vlist = vlist.map{|v| v.gsub(/^\s/,"").gsub(/\s\Z/,"").split(/\s/)}.flatten
    @idx = 0

    if start_idx == nil
      @start_idx = 1
    else
      @start_idx = start_idx
    end

    raise FileNotFoundException.new unless File.exists?(@fasta_filepath)
    fasta_content = IO.binread(@fasta_filepath);

    # the expected type for the sequences is the
    # type of the first query
     
    # autodetect the type of the sequence in the FASTA
    # also check if the fasta file contains a single type of queries
    @type = BlastUtils.type_of_sequences(fasta_content)

    # create a list of index of the queries in the FASTA
    @query_offset_lst = fasta_content.enum_for(:scan, /(>[^>]+)/).map{ Regexp.last_match.begin(0)}
    @query_offset_lst.push(fasta_content.length)
    fasta_content   = nil # free memory for variable fasta_content
    @tabular_format = tabular_format

    if mafft_path == nil
      @mafft_path = which("mafft")
    else
      @mafft_path = mafft_path
    end

    begin
      # index raw_sequence file
      if raw_seq_file != nil
        raise FileNotFoundException.new unless File.exists?(raw_seq_file)
        # leave only the identifiers in the fasta description
        content = File.open(raw_seq_file, "rb").read
        File.open(raw_seq_file, 'w+') { |file| file.write(content.gsub(/ .*/, ""))}

        #index the fasta file
        keys = content.scan(/>(.*)\n/).flatten
        values = content.enum_for(:scan, /(>[^>]+)/).map{ Regexp.last_match.begin(0)}

        # make am index hash
        index_hash = {}
        index_hash = keys.each_with_index{|k, i| index_hash[keys[i]] = values[i]}
        
        puts index_hash

        # create FASTA index
        @raw_seq_file_index = "#{raw_seq_file}.idx"

      end
      rescue Exception => error
        $stderr.print "Error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
          "Possible cause: your file with raw sequences is not FASTA. Please use get_raw_sequences executable to create a correct one.\n"       
    end
 
    # build the path of html folder output
    path = File.dirname(@fasta_filepath)
    @html_path = "#{fasta_filepath}.html"
    @yaml_path = path

    @filename = File.basename(@fasta_filepath)#.scan(/\/([^\/]+)$/)[0][0]
    @all_query_outputs = []

    # create 'html' directory
    FileUtils.rm_rf(@html_path)
    Dir.mkdir(@html_path)

    # copy auxiliar folders to the html folder
    FileUtils.cp_r("aux/css", @html_path)
    FileUtils.cp_r("aux/js", @html_path)
    FileUtils.cp_r("aux/img", @html_path)
    FileUtils.cp_r("aux/font", @html_path)
    FileUtils.cp_r("aux/doc", @html_path)

  rescue SequenceTypeError => error
    $stderr.print "Sequence Type error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
     "Possible cause: input file containes mixed sequence types.\n"      
    exit 
  rescue FileNotFoundException => error
    $stderr.print "File not found error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}."<<
     "Possible cause: input file does not exist.\n"
    exit 
  end
end

Instance Attribute Details

- (Object) all_query_outputs (readonly)

list with all validation reports



36
37
38
# File 'lib/genevalidator/validation.rb', line 36

def all_query_outputs
  @all_query_outputs
end

- (Object) fasta_filepath (readonly)

Returns the value of attribute fasta_filepath



24
25
26
# File 'lib/genevalidator/validation.rb', line 24

def fasta_filepath
  @fasta_filepath
end

- (Object) filename (readonly)

Returns the value of attribute filename



28
29
30
# File 'lib/genevalidator/validation.rb', line 28

def filename
  @filename
end

- (Object) html_path (readonly)

Returns the value of attribute html_path



25
26
27
# File 'lib/genevalidator/validation.rb', line 25

def html_path
  @html_path
end

- (Object) idx

current number of the querry processed



31
32
33
# File 'lib/genevalidator/validation.rb', line 31

def idx
  @idx
end

- (Object) mafft_path (readonly)

Returns the value of attribute mafft_path



27
28
29
# File 'lib/genevalidator/validation.rb', line 27

def mafft_path
  @mafft_path
end

- (Object) query_offset_lst (readonly)

array of indexes for the start offsets of each query in the fasta file



34
35
36
# File 'lib/genevalidator/validation.rb', line 34

def query_offset_lst
  @query_offset_lst
end

- (Object) raw_seq_file_index (readonly)

Returns the value of attribute raw_seq_file_index



29
30
31
# File 'lib/genevalidator/validation.rb', line 29

def raw_seq_file_index
  @raw_seq_file_index
end

- (Object) start_idx (readonly)

Returns the value of attribute start_idx



32
33
34
# File 'lib/genevalidator/validation.rb', line 32

def start_idx
  @start_idx
end

- (Object) tabular_format (readonly)

Returns the value of attribute tabular_format



39
40
41
# File 'lib/genevalidator/validation.rb', line 39

def tabular_format
  @tabular_format
end

- (Object) type (readonly)

Returns the value of attribute type



23
24
25
# File 'lib/genevalidator/validation.rb', line 23

def type
  @type
end

- (Object) vlist (readonly)

Returns the value of attribute vlist



38
39
40
# File 'lib/genevalidator/validation.rb', line 38

def vlist
  @vlist
end

- (Object) yaml_path (readonly)

Returns the value of attribute yaml_path



26
27
28
# File 'lib/genevalidator/validation.rb', line 26

def yaml_path
  @yaml_path
end

Instance Method Details

- (Object) do_validations(prediction, hits)

Runs all the validations and prints the outputs given the current prediction query and the corresponding hits Params: hits: Array of Sequence objects Output: Array Output object



320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
# File 'lib/genevalidator/validation.rb', line 320

def do_validations(prediction, hits)
  begin
  begin
    hits = remove_identical_hits(prediction, hits)
    rescue Exception => error #NoPIdentError
  end
  
  # do validations
  begin

    query_output                = Output.new(@filename, @html_path, @yaml_path, @idx, @start_idx)
    query_output.prediction_len = prediction.length_protein
    query_output.prediction_def = prediction.definition
    query_output.nr_hits        = hits.length

    plot_path = "#{html_path}/#{filename}_#{@idx}"

    validations = []
    validations.push LengthClusterValidation.new(@type, prediction, hits, plot_path)
    validations.push LengthRankValidation.new(@type, prediction, hits)
    validations.push BlastReadingFrameValidation.new(@type, prediction, hits)
    validations.push GeneMergeValidation.new(@type, prediction, hits, plot_path)
    validations.push DuplicationValidation.new(@type, prediction, hits, @mafft_path, @raw_seq_file_index)
    validations.push OpenReadingFrameValidation.new(@type, prediction, hits, plot_path, ["ATG"])
    validations.push AlignmentValidation.new(@type, prediction, hits, plot_path, @mafft_path, @raw_seq_file_index)

    # check the class type of the elements in the list
    validations.map do |v|
      raise ValidationClassError unless v.is_a? ValidationTest
    end

    # check alias duplication
    unless validations.map{|v| v.cli_name}.length == validations.map{|v| v.cli_name}.uniq.length
      raise AliasDuplicationError 
    end

    if vlist.map{|v| v.strip.downcase}.include? "all"
      validations.map{|v| v.run}
      # check the class type of the validation reports
      validations.each do |v|
        raise ReportClassError unless v.validation_report.is_a? ValidationReport
      end
      query_output.validations = validations
    else
      desired_validations = validations.select {|v| vlist.map{|vv| vv.strip.downcase}.include? v.cli_name.downcase }
      desired_validations.each do |v|
          v.run
          raise ReportClassError unless v.validation_report.is_a? ValidationReport
      end
      query_output.validations = desired_validations
      if query_output.validations.length == 0
        raise NoValidationError
      end
    end

  rescue ValidationClassError => error
    $stderr.print "Class Type error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
      "Possible cause: type of one of the validations is not ValidationTest\n"
    exit!
  rescue NoValidationError => error
    $stderr.print "Validation error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
      "Possible cause: your -v arguments are not valid aliases\n"
    exit!
  rescue ReportClassError => error
    $stderr.print "Class Type error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
      "Possible cause: type of one of the validation reports returned by the 'run' method is not ValidationReport\n"
    exit!
  rescue AliasDuplicationError => error
    $stderr.print "Alias Duplication error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
      "Possible cause: At least two validations have the same CLI alias\n"
    exit!
  rescue Exception => error
    $stderr.print "Error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}.\n"
    exit!
  end
 
  query_output.generate_html
  query_output.print_output_console
  query_output.print_output_file_yaml
  @all_query_outputs.push(query_output)
rescue Exception => error
  $stderr.print "Error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}.\n"
  exit
end

end

- (Object) parse_output(output)

Parses the blast output: autodetect the format: xml or tabular Param: output: String with the blast output



205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# File 'lib/genevalidator/validation.rb', line 205

def parse_output(output)
  begin
    iterator_xml = Bio::BlastXMLParser::NokogiriBlastXml.new(output).to_enum
    iterator_tab = TabularParser.new(output, tabular_format, @type)
    input_file_type = :xml
    begin
      # get info about the query
      # get the @idx-th sequence  from the fasta file

      prediction = Sequence.new    
      if @idx+1 == @query_offset_lst.length
        break
      end
      query       = IO.binread(@fasta_filepath, @query_offset_lst[@idx+1] - @query_offset_lst[@idx], @query_offset_lst[@idx])
      parse_query = query.scan(/>([^\n]*)\n([A-Za-z\n]*)/)[0]

      prediction.definition     = parse_query[0].gsub("\n","")
      prediction.identifier     = prediction.definition.gsub(/ .*/,"")
      prediction.type           = @type
      prediction.raw_sequence   = parse_query[1].gsub("\n","")
      prediction.length_protein = prediction.raw_sequence.length

      if @type == :nucleotide
        prediction.length_protein /= 3    
      end

      @idx = @idx + 1

      begin
        if input_file_type == :xml
          # check xml format
          if @idx < @start_idx
            iter = iterator_xml.next
          else
            hits = BlastUtils.parse_next_query_xml(iterator_xml, @type)
            if hits == nil
              @idx = @idx -1
              break
            end
            do_validations(prediction, hits)
          end
        else 
          raise Exception
        end
      rescue SequenceTypeError => error
        $stderr.print "Sequence Type error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
          "Possible cause: the blast output was not obtained against a protein database.\n"
        exit!
      rescue Exception => error
        begin 
          input_file_type = :tabular
          if @tabular_format == nil and @xml_file!= nil
            puts "Note: Please specify the --tabular argument if you used tabular format input with nonstandard columns.\n"
          end
          #check tabular format
          if @idx < @start_idx
            iterator_tab.jump_next          
          else
            hits = iterator_tab.next(prediction.identifier)
            if hits == nil
              @idx = @idx -1
              break
            end
            do_validations(prediction, hits)
          end
        rescue SequenceTypeError => error
          $stderr.print "Sequence Type error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
            "Possible cause: the blast output was not obtained against a protein database.\n"
          exit!
        rescue Exception => error
          $stderr.print "Blast file error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
             "Possible cause: blast output file format is neihter xml nor tabular.\n"
          exit!
        end
      end
    end while 1
  end
end

- (Object) remove_identical_hits(prediction, hits)

Removes identical hits Params: prediction: Sequence object hits: Array of Sequence objects Output: new array of hit Sequence objects



291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
# File 'lib/genevalidator/validation.rb', line 291

def remove_identical_hits(prediction, hits)
  # remove the identical hits
  # identical hit means 100%coverage and >99% identity
  identical_hits = []
  hits.each do |hit|
    # check if all hsps have identity more than 99%
    low_identity = hit.hsp_list.select{|hsp| hsp.pidentity == nil or hsp.pidentity < 99}
    # check the coverage
    coverage = Array.new(prediction.length_protein,0)
    hit.hsp_list.each do |hsp| 
       len = hsp.match_query_to - hsp.match_query_from + 1
       coverage[hsp.match_query_from-1..hsp.match_query_to-1] = Array.new(len, 1)
    end
    if low_identity.length == 0 and coverage.uniq.length == 1
      identical_hits.push(hit) 
    end
  end

  identical_hits.each {|hit| hits.delete(hit)}
  return hits
end

- (Object) validation

Parse the blast output and run validations



151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
# File 'lib/genevalidator/validation.rb', line 151

def validation
  puts "\nDepending on your input and your computational "<<
       "resources, this may take a while. Please wait..."
  begin
    if @xml_file == nil
 
      #file seek for each query
      @query_offset_lst[0..@query_offset_lst.length-2].each_with_index do |pos, i|
    
        if (i+1) >= @start_idx
          query = IO.binread(@fasta_filepath, @query_offset_lst[i+1] - @query_offset_lst[i], @query_offset_lst[i]);

          #call blast with the default parameters
          if type == :protein
            output = BlastUtils.call_blast_from_stdin("blastp", query, 11, 1)
          else
            output = BlastUtils.call_blast_from_stdin("blastx", query, 11, 1)
          end

          #save output in a file
          #xml_file = "blast/#{@fasta_filepath}_#{i+1}.xml"
          #File.open(xml_file , "w") do |f| f.write(output) end

          #parse output
          parse_output(output)   
        else
          @idx = @idx + 1
        end
      end
    else
      file = File.open(@xml_file, "rb").read
      #check the format of the input file
      parse_output(file)      
    end
    Output.print_footer(@all_query_outputs, @html_path)

  rescue SystemCallError => error
    $stderr.print "Load error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
      "Possible cause: input file is not valid\n"      
    exit
  rescue SequenceTypeError => error
    $stderr.print "Sequence Type error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}. "<<
      "Possible cause: the blast output was not obtained against a protein database.\n"
    exit!
  rescue Exception => error
     $stderr.print "Error at #{error.backtrace[0].scan(/\/([^\/]+:\d+):.*/)[0][0]}.\n"
    exit!
  end
end

- (Object) which(cmd)

The ruby equivalent for 'which' command in unix



409
410
411
412
413
414
415
416
417
418
# File 'lib/genevalidator/validation.rb', line 409

def which(cmd)
  exts = ENV['PATHEXT'] ? ENV['PATHEXT'].split(';') : ['']
  ENV['PATH'].split(File::PATH_SEPARATOR).each do |path|
    exts.each { |ext|
      exe = File.join(path, "#{cmd}#{ext}")
      return exe if File.executable? exe
    }
  end
  return nil
end