# frozen_string_literal: true

require 'strscan'

require_relative '../lexical/token'

module Rley
  module RGN
    # A tokenizer for the Rley notation language.
    # Responsibility: break input into a sequence of token objects.
    # The tokenizer should recognize:
    # Identifiers,
    # Number literals, including single digits,
    # String literals (quote delimited),
    # Delimiters: e.g. parentheses '(', ')',
    # Separators: e.g. comma.
    class Tokenizer
      # @return [StringScanner] Low-level input scanner
      attr_reader(:scanner)

      # @return [Integer] The current line number
      attr_reader(:lineno)

      # @return [Integer] Position of the last start of line in the input
      attr_reader(:line_start)

      # One or two special character tokens.
      @@lexeme2name = {
        '(' => 'LEFT_PAREN',
        ')' => 'RIGHT_PAREN',
        '{' => 'LEFT_BRACE',
        '}' => 'RIGHT_BRACE',
        ',' => 'COMMA',
        '+' => 'PLUS',
        '?' => 'QUESTION_MARK',
        '*' => 'STAR',
        '..' => 'ELLIPSIS'
      }.freeze

      # All the implemented Rley notation keywords.
      @@keywords = %w[
        match_closest
        repeat
      ].map { |x| [x, x] }.to_h

      # Constructor. Initialize a tokenizer for Rley notation input.
      # @param source [String] Rley notation text to tokenize.
      def initialize(source = nil)
        @scanner = StringScanner.new('')
        start_with(source) if source
      end

      # Reset the tokenizer and make the given text the current input.
      # @param source [String] Rley notation text to tokenize.
      def start_with(source)
        @scanner.string = source
        @lineno = 1
        @line_start = 0
      end

      # Scan the source and return an array of tokens.
      # @return [Array<Rley::Lexical::Token>] the sequence of tokens
      def tokens
        tok_sequence = []
        until @scanner.eos?
          token = _next_token
          tok_sequence << token unless token.nil?
        end

        return tok_sequence
      end

      private

      def _next_token
        pos_before = scanner.pos
        skip_intertoken_spaces
        ws_found = true if scanner.pos > pos_before
        curr_ch = scanner.peek(1)
        return nil if curr_ch.nil? || curr_ch.empty?

        token = nil

        if '(){},'.include? curr_ch
          # Single delimiter or separator character
          token = build_token(@@lexeme2name[curr_ch], scanner.getch)
        elsif '?*+,'.include? curr_ch # Modifier character
          # A modifier that isn't glued to a preceding symbol is a plain SYMBOL.
          symb = ws_found ? 'SYMBOL' : @@lexeme2name[curr_ch]
          token = build_token(symb, scanner.getch)
        elsif (lexeme = scanner.scan(/\.\./))
          # Two-character special token
          token = build_token(@@lexeme2name[lexeme], lexeme)
        elsif scanner.check(/"|'/) # Start of string detected...
          token = build_string_token
        elsif (lexeme = scanner.scan(/\d+/))
          token = build_token('INT_LIT', lexeme)
        elsif (lexeme = scanner.scan(/[a-zA-Z_][a-zA-Z_0-9]*:/))
          keyw = @@keywords[lexeme.chop!]
          token = build_token('KEY', lexeme) if keyw
          # Unknown keyword: the lexeme is consumed but no token is emitted.
        elsif (lexeme = scanner.scan(/[^?*+,:(){}\s]+/))
          token = build_token('SYMBOL', lexeme)
        else # Unknown token
          col = scanner.pos - @line_start + 1
          _erroneous = curr_ch.nil? ? '' : scanner.scan(/./)
          raise ScanError, "Error: [line #{lineno}:#{col}]: Unexpected character."
        end

        return token
      end

      def build_token(aSymbolName, aLexeme)
        begin
          lex_length = aLexeme ? aLexeme.size : 0
          col = scanner.pos - lex_length - @line_start + 1
          pos = Rley::Lexical::Position.new(@lineno, col)
          token = Rley::Lexical::Token.new(aLexeme.dup, aSymbolName, pos)
        rescue StandardError => e
          puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
          raise e
        end

        return token
      end

      # Precondition: current position at the leading quote.
      def build_string_token
        delimiter = scanner.scan(/./)
        scan_pos = scanner.pos
        line = @lineno
        column_start = scan_pos - @line_start
        literal = +''
        loop do
          substr = scanner.scan(/[^"'\\\r\n]*/)
          if scanner.eos?
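            # End of input reached before the closing quote was found.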
pos_start = "line #{line}:#{column_start}" raise ScanError, "Error: [#{pos_start}]: Unterminated string." else literal << substr special = scanner.scan(/["'\\\r\n]/) case special when delimiter # Terminating quote found break when "\r" next_line special << scanner.scan(/./) if scanner.match?(/\n/) literal << special when "\n" next_line literal << special end end end pos = Rley::Lexical::Position.new(line, column_start) Rley::Lexical::Token.new(literal, 'STR_LIT', pos) end # Skip non-significant whitespaces and comments. # Advance the scanner until something significant is found. def skip_intertoken_spaces loop do ws_found = scanner.skip(/[ \t\f]+/) ? true : false nl_found = scanner.skip(/(?:\r\n)|\r|\n/) if nl_found ws_found = true next_line end break unless ws_found end scanner.pos end def next_line @lineno += 1 @line_start = scanner.pos end end # class end # module end # module