require 'strscan' module Mspire # A Digester splits a protein sequence into peptides at specified sites. # # trypsin = Mspire::Digester[:trypsin] # # trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG') # # => ['MIVIGR', 'SIVHPYITNEYEPFAAEK', 'QQILSIMAG'] # # With 1 missed cleavage: # # trypsin.digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1) # # => ['MIVIGR','MIVIGRSIVHPYITNEYEPFAAEK','SIVHPYITNEYEPFAAEK', # # 'SIVHPYITNEYEPFAAEKQQILSIMAG', 'QQILSIMAG'] # # Return the start and end sites of digestion: # # trypsin.site_digest('MIVIGRSIVHPYITNEYEPFAAEKQQILSIMAG', 1) # # => [[0,6],[0,24],[6,24],[6,33],[24,33]] class Digester # The name of the digester attr_reader :name # A string of residues at which cleavage occurs attr_reader :cleave_str # A c-terminal resitriction residue which prevents # cleavage at a potential cleavage site (optional). attr_reader :cterm_exception # True if cleavage occurs at the c-terminus of a # cleavage residue, false if cleavage occurs at # the n-terminus. attr_reader :cterm_cleavage MULTILINE_WHITESPACE = /\s*/m def initialize(name, cleave_str, cterm_exception=nil, cterm_cleavage=true) regexp = [] 0.upto(cleave_str.length - 1) {|i| regexp << cleave_str[i, 1] } @name = name @cleave_str = cleave_str @cleave_regexp = Regexp.new(regexp.join('|')) @cterm_exception = case when cterm_exception == nil || cterm_exception.empty? then nil when cterm_exception.length == 1 then cterm_exception[0] else raise ArgumentError, "cterm exceptions must be a single residue: #{cterm_exception}" end @cterm_cleavage = cterm_cleavage @scanner = StringScanner.new('') end # Returns digestion sites in sequence, as determined by the # cleave_regexp boundaries. The digestion sites correspond to the # positions where a peptide begins and ends, such that [n, (n+1) - n] # corresponds to the [index, length] for peptide n. # # d = Digester.new('Trypsin', 'KR', 'P') # seq = "AARGGR" # sites = d.cleavage_sites(seq) # => [0, 3, 6] # # seq[sites[0], sites[0+1] - sites[0]] # => "AAR" # seq[sites[1], sites[1+1] - sites[1]] # => "GGR" # # Trailing whitespace is included in the fragment. # # seq = "AAR \n GGR" # sites = d.cleavage_sites(seq) # => [0, 8, 11] # # seq[sites[0], sites[0+1] - sites[0]] # => "AAR \n " # seq[sites[1], sites[1+1] - sites[1]] # => "GGR" # # The digested section of sequence may be specified using offset # and length. def cleavage_sites(seq, offset=0, length=seq.length-offset) return [0, 1] if seq.size == 1 # adding exceptions is lame--algorithm should just work adjustment = cterm_cleavage ? 0 : 1 limit = offset + length positions = [offset] pos = scan(seq, offset, limit) do |pos| positions << (pos - adjustment) end # add the final position if (pos < limit) || (positions.length == 1) positions << limit end # adding exceptions is lame.. this code probably needs to be # refactored (corrected). if !cterm_cleavage && pos == limit positions << limit end positions end # Returns digestion sites of sequence as [start_index, end_index] pairs, # allowing for missed cleavages. Digestion sites are determined using # cleavage_sites; as in that method, the digested section of sequence # may be specified using offset and length. # # Each [start_index, end_index] pair is yielded to the block, if given, # and the collected results are returned. def site_digest(seq, max_misses=0, offset=0, length=seq.length-offset, &block) # :yields: start_index, end_index frag_sites = cleavage_sites(seq, offset, length) overlay(frag_sites.length, max_misses, 1) do |start_index, end_index| start_index = frag_sites[start_index] end_index = frag_sites[end_index] block ? block.call(start_index, end_index) : [start_index, end_index] end end # Returns an array of peptides produced by digesting sequence, allowing for # missed cleavage sites. Digestion sites are determined using cleavage_sites; # as in that method, the digested section of sequence may be specified using # offset and length. def digest(seq, max_misses=0, offset=0, length=seq.length-offset) site_digest(seq, max_misses, offset, length).map do |s, e| seq[s, e-s] end end protected # The cleavage regexp used to identify cleavage sites attr_reader :cleave_regexp # :nodoc: # The scanner used to digest strings. attr_reader :scanner # :nodoc: # Scans seq between offset and limit for the cleave_regexp, skipping whitespace # and being mindful of exception characters. The positions of the scanner at # each match are yielded to the block. def scan(seq, offset, limit, &block) # :nodoc: scanner.string = seq scanner.pos = offset while scanner.search_full(cleave_regexp, true, false) scanner.search_full(MULTILINE_WHITESPACE, true, false) pos = scanner.pos # skip if the next character is the exception character next if cterm_exception != nil && seq[pos] == cterm_exception # break if you scanned past the upper limit break if pos > limit block.call(pos) end scanner.pos end # Performs an overlap-collect algorithm providing the start and end # indicies of spans skipping up to max_misses boundaries. def overlay(n, max_misses, offset, &block) # :nodoc: results = [] 0.upto(n-1) do |start_index| 0.upto(max_misses) do |n_miss| end_index = start_index + offset + n_miss break if end_index == n results << block.call(start_index, end_index) end end results end # # Enzymes adapted from the default Mascot enzyme list. # class << self # takes the name of the enzyme in any case (symbol or string) # and accesses the constant (returns nil if none found) def [](enzyme_name) ENZYMES[ enzyme_name.to_s.downcase.gsub(/\W+/,'_').to_sym ] end # Utility method to parse a mascot enzyme configuration # string (tab separated) into a Digester. def mascot_parse(str) # :nodoc: name, sense, cleave_str, cterm_exception, independent, semi_specific = str.split(/ *\t */) cterm_cleavage = case sense when 'C-Term' then true when 'N-Term' then false else raise ArgumentError, "unknown sense: #{sense}" end new(name, cleave_str, cterm_exception, cterm_cleavage) end end # ARG_C = mascot_parse('Arg-C C-Term R P no no') # ENZYMES[:arg_c] = <'Arg-C' enzyme> MASCOT_ENZYME_CONFIG_STRINGS = { :arg_c => 'Arg-C C-Term R P no no', :asp_n => 'Asp-N N-Term BD no no', :asp_n_ambic => 'Asp-N_ambic N-Term DE no no', :chymotrypsin => 'Chymotrypsin C-Term FLWY P no no', :cnbr => 'CNBr C-Term M no no', :lys_c => 'Lys-C C-Term K P no no', :lys_c_p => 'Lys-C/P C-Term K no no', :pepsin_a => 'PepsinA C-Term FL no no', :tryp_cnbr => 'Tryp-CNBr C-Term KMR P no no', :tryp_chymo => 'TrypChymo C-Term FKLRWY P no no', :trypsin_p => 'Trypsin/P C-Term KR no no', :v8_de => 'V8-DE C-Term BDEZ P no no', :v8_e => 'V8-E C-Term EZ P no no', :trypsin => 'Trypsin C-Term KR P no no', :v8_e_trypsin => 'V8-E+Trypsin C-Term EKRZ P no no', :v8_de_trypsin => 'V8-DE+Trypsin C-Term BDEKRZ P no no', :arg_c => 'Arg-C C-Term R P no no', :asp_n => 'Asp-N N-Term BD no no', :asp_n_ambic => 'Asp-N_ambic N-Term DE no no', :chymotrypsin => 'Chymotrypsin C-Term FLWY P no no', :cnbr => 'CNBr C-Term M no no', :lys_c => 'Lys-C C-Term K P no no', :lys_c_p => 'Lys-C/P C-Term K no no', :pepsin_a => 'PepsinA C-Term FL no no', :tryp_cnbr => 'Tryp-CNBr C-Term KMR P no no', :tryp_chymo => 'TrypChymo C-Term FKLRWY P no no', :trypsin_p => 'Trypsin/P C-Term KR no no', :v8_de => 'V8-DE C-Term BDEZ P no no', :v8_e => 'V8-E C-Term EZ P no no', :trypsin => 'Trypsin C-Term KR P no no', :v8_e_trypsin => 'V8-E+Trypsin C-Term EKRZ P no no', :v8_de_trypsin => 'V8-DE+Trypsin C-Term BDEKRZ P no no', } ENZYMES = MASCOT_ENZYME_CONFIG_STRINGS.inject(Hash.new) do |hash,(k,v)| hash[k] = mascot_parse(v) hash end end end