# frozen_string_literal: true

# File: srl_tokenizer.rb
# Tokenizer for SRL (Simple Regex Language)
require 'strscan'
require 'rley' # Load the gem

module SRL
  # The tokenizer should recognize:
  # Keywords: every word listed in @@keywords (matched case-insensitively)
  # Integer literals (two or more digits)
  # Single digit literals
  # String literals (single or double quote delimited)
  # Single character (letter) literals
  # Delimiters: parentheses '(' and ')'
  # Separators: comma (optional)
  class Tokenizer
    # The StringScanner that walks over the source text.
    attr_reader(:scanner)

    # One-based number of the line the scanner is currently on.
    attr_reader(:lineno)

    # Scanner position (a byte offset, as reported by StringScanner#pos)
    # at which the current line begins.
    attr_reader(:line_start)

    # Mapping from a symbol name (String) to the grammar symbol used
    # as token type.
    attr_reader(:name2symbol)

    @@lexeme2name = {
      '(' => 'LPAREN',
      ')' => 'RPAREN',
      ',' => 'COMMA'
    }.freeze

    # Here are all the SRL keywords (in uppercase)
    @@keywords = %w[
      AND
      ANY
      ANYTHING
      AT
      BACKSLASH
      BETWEEN
      CHARACTER
      DIGIT
      EXACTLY
      FROM
      LEAST
      LETTER
      LINE
      LITERALLY
      MORE
      NEVER
      NEW
      NO
      NUMBER
      OF
      ONCE
      ONE
      OPTIONAL
      OR
      TAB
      TIMES
      TO
      TWICE
      UPPERCASE
      WHITESPACE
    ].map { |x| [x, x] }.to_h.freeze

    # Raised when the source text contains something that cannot be
    # turned into a token.
    class ScanError < StandardError; end

    # source   - the String holding the SRL text to tokenize.
    # aGrammar - an object responding to #name2symbol; the returned mapping
    #            is indexed with symbol names such as 'INTEGER' or 'LPAREN'.
    def initialize(source, aGrammar)
      @scanner = StringScanner.new(source)
      @name2symbol = aGrammar.name2symbol
      @lineno = 1
      @line_start = 0
    end

    # Scan the remaining source text.
    # Returns the Array of tokens found, in order of occurrence.
    # Raises ScanError when an unknown keyword or character sequence is met.
    def tokens
      tok_sequence = []
      until @scanner.eos?
        token = _next_token
        # nil means that only trailing whitespace was left
        tok_sequence << token unless token.nil?
      end

      tok_sequence
    end

    private

    # Skip leading whitespace, then recognize one token at the scanner
    # position. Returns the token, or nil when the end of input is reached.
    # The order of the branches matters: longer patterns are tried before
    # the shorter ones that they start with.
    def _next_token
      skip_whitespaces
      curr_ch = scanner.peek(1)
      return nil if curr_ch.nil? || curr_ch.empty?

      if '(),'.include? curr_ch
        # Delimiters, separators => single character token
        build_token(@@lexeme2name[curr_ch], scanner.getch)
      elsif (lexeme = scanner.scan(/[0-9]{2,}/))
        build_token('INTEGER', lexeme) # An integer has two or more digits
      elsif (lexeme = scanner.scan(/[0-9]/))
        build_token('DIGIT_LIT', lexeme)
      elsif (lexeme = scanner.scan(/[a-zA-Z]{2,}/))
        keyword_token(lexeme)
      elsif (lexeme = scanner.scan(/[a-zA-Z]((?=\s)|$)/))
        build_token('LETTER_LIT', lexeme)
      elsif (lexeme = scanner.scan(/"(?:\\"|[^"])*"/)) # Double quotes literal?
        string_token(lexeme)
      elsif (lexeme = scanner.scan(/'(?:\\'|[^']')*'/.source.then { |_| /'(?:\\'|[^'])*'/ })) # Single quotes literal?
        string_token(lexeme)
      else # Unknown token
        # peek(1) did not advance the scanner, so the scanned excerpt
        # already begins with curr_ch.
        erroneous = scanner.scan(/.{1,20}/) || curr_ch
        raise ScanError, "Unknown token #{erroneous}"
      end
    end

    # Build the token for a word of two or more letters.
    # Keywords are matched regardless of letter case.
    # Raises ScanError when the word is not a SRL keyword.
    def keyword_token(aLexeme)
      keyword = @@keywords[aLexeme.upcase]
      raise ScanError, "Unknown keyword #{aLexeme}" if keyword.nil?

      build_token(keyword, aLexeme)
    end

    # Build a STRING_LIT token from a quote-delimited lexeme.
    # Only the delimiting quotes are removed; a backslash that escapes
    # an inner quote is kept as is in the token text.
    def string_token(aLexeme)
      update_line_info(aLexeme) # A literal may span several lines
      build_token('STRING_LIT', aLexeme[1...-1])
    end

    # Create a token with the given text and the grammar symbol registered
    # under aSymbolName. When the token creation fails, a diagnostic is
    # printed and the error is re-raised unchanged.
    def build_token(aSymbolName, aLexeme)
      token_type = name2symbol[aSymbolName]
      begin
        Rley::Lexical::Token.new(aLexeme, token_type)
      rescue StandardError
        puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
        raise
      end
    end

    # Advance the scanner past blanks, tabs, form feeds and line terminators.
    # Returns the skipped text, or nil when there was nothing to skip.
    def skip_whitespaces
      skipped = scanner.scan(/[ \t\f\n\r]+/)
      update_line_info(skipped) if skipped
      skipped
    end

    # Bring lineno and line_start up to date after the scanner consumed
    # the given text. A "\r\n" pair counts as one line terminator.
    def update_line_info(consumed)
      breaks = consumed.scan(/\r\n|\r|\n/).size
      return if breaks.zero?

      @lineno += breaks
      after_last_break = consumed.rindex(/[\r\n]/) + 1
      # scanner.pos is a byte offset, hence the use of bytesize
      @line_start = scanner.pos - consumed[after_last_break..-1].bytesize
    end
  end # class
end # module