# # = bio/db/sanger_chromatogram/scf.rb - Scf class # # Copyright:: Copyright (C) 2009 Anthony Underwood , # License:: The Ruby License # require 'bio/db/sanger_chromatogram/chromatogram' module Bio # == Description # # This class inherits from the SangerChromatogram superclass. It captures the information contained # within an scf format chromatogram file generated by DNA sequencing. See the SangerChromatogram class # for usage class Scf < SangerChromatogram # sequence attributes # The quality of each base at each position along the length of the sequence is captured # by the nqual attributes where n is one of a, c, g or t. Generally the quality will be # high for the base that is called at a particular position and low for all the other bases. # However at positions of poor sequence quality, more than one base may have similar top scores. # By analysing the nqual attributes it may be possible to determine if the base calling was # correct or not. # The quality of the A base at each sequence position attr_accessor :aqual # The quality of the C base at each sequence position attr_accessor :cqual # The quality of the G base at each sequence position attr_accessor :gqual # The quality of the T base at each sequence position attr_accessor :tqual # A hash of extra information extracted from the chromatogram file attr_accessor :comments # see SangerChromatogram class for how to create an Scf object and its usage def initialize(string) header = string.slice(0,128) # read in header info @chromatogram_type, @samples, @sample_offset, @bases, @bases_left_clip, @bases_right_clip, @bases_offset, @comment_size, @comments_offset, @version, @sample_size, @code_set, @header_spare = header.unpack("a4 NNNNNNNN a4 NN N20") get_traces(string) get_bases_peakIndices_and_qualities(string) get_comments(string) if @comments["DYEP"] @dye_mobility = @comments["DYEP"] else @dye_mobility = "Unnown" end end private def get_traces(string) if @version == "3.00" # read in trace info offset = @sample_offset length = @samples * @sample_size # determine whether the data is stored in 1 byte as an unsigned byte or 2 bytes as an unsigned short @sample_size == 2 ? byte = "n" : byte = "c" for base in ["a" , "c" , "g" , "t"] trace_read = string.slice(offset,length).unpack("#{byte}#{@samples}") # convert offsets for sample_num in (0..trace_read.size-1) if trace_read[sample_num] > 30000 trace_read[sample_num] = trace_read[sample_num] - 65536 end end # For 8-bit data we need to emulate a signed/unsigned # cast that is implicit in the C implementations..... if @sample_size == 1 for sample_num in (0..trace_read.size-1) trace_read[sample_num] += 256 if trace_read[sample_num] < 0 end end trace_read = convert_deltas_to_values(trace_read) self.instance_variable_set("@#{base}trace", trace_read) offset += length end elsif @version == "2.00" @atrace = [] @ctrace = [] @gtrace = [] @ttrace = [] # read in trace info offset = @sample_offset length = @samples * @sample_size * 4 # determine whether the data is stored in 1 byte as an unsigned byte or 2 bytes as an unsigned short @sample_size == 2 ? byte = "n" : byte = "c" trace_read = string.slice(offset,length).unpack("#{byte}#{@samples*4}") (0..(@samples-1)*4).step(4) do |offset2| @atrace << trace_read[offset2] @ctrace << trace_read[offset2+1] @gtrace << trace_read[offset2+2] @ttrace << trace_read[offset2+3] end end end def get_bases_peakIndices_and_qualities(string) if @version == "3.00" # now go and get the peak index information offset = @bases_offset length = @bases * 4 get_v3_peak_indices(string,offset,length) # now go and get the accuracy information offset += length; get_v3_accuracies(string,offset,length) # OK, now go and get the base information. offset += length; length = @bases; get_v3_sequence(string,offset,length) #combine accuracies to get quality scores @qualities= convert_accuracies_to_qualities elsif @version == "2.00" @peak_indices = [] @aqual = [] @cqual = [] @gqual = [] @tqual = [] @qualities = [] @sequence = "" # now go and get the base information offset = @bases_offset length = @bases * 12 all_bases_info = string.slice(offset,length) (0..length-1).step(12) do |offset2| base_info = all_bases_info.slice(offset2,12).unpack("N C C C C a C3") @peak_indices << base_info[0] @aqual << base_info[1] @cqual << base_info[2] @gqual << base_info[3] @tqual << base_info[4] @sequence += base_info[5].downcase case base_info[5].downcase when "a" @qualities << base_info[1] when "c" @qualities << base_info[2] when "g" @qualities << base_info[3] when "t" @qualities << base_info[4] else @qualities << 0 end end end end def get_v3_peak_indices(string,offset,length) @peak_indices = string.slice(offset,length).unpack("N#{length/4}") end def get_v3_accuracies(string,offset,length) qualities = string.slice(offset,length) qual_length = length/4; qual_offset = 0; for base in ["a" , "c" , "g" , "t"] self.instance_variable_set("@#{base}qual",qualities.slice(qual_offset,qual_length).unpack("C#{qual_length}")) qual_offset += qual_length end end def get_v3_sequence(string,offset,length) @sequence = string.slice(offset,length).unpack("a#{length}").join('').downcase end def convert_deltas_to_values(trace_read) p_sample = 0; for sample_num in (0..trace_read.size-1) trace_read[sample_num] = trace_read[sample_num] + p_sample p_sample = trace_read[sample_num]; end p_sample = 0; for sample_num in (0..trace_read.size-1) trace_read[sample_num] = trace_read[sample_num] + p_sample p_sample = trace_read[sample_num]; end return trace_read end def convert_accuracies_to_qualities qualities = Array.new for base_pos in (0..@sequence.length-1) case sequence.slice(base_pos,1) when "a" qualities << @aqual[base_pos] when "c" qualities << @cqual[base_pos] when "g" qualities << @gqual[base_pos] when "t" qualities << @tqual[base_pos] else qualities << 0 end end return qualities end def get_comments(string) @comments = Hash.new comment_string = string.slice(@comments_offset,@comment_size) comment_string.gsub!(/\0/, "") comment_array = comment_string.split("\n") comment_array.each do |comment| comment =~ /(\w+)=(.*)/ @comments[$1] = $2 end end end end