# frozen_string_literal: true

require 'strscan'
require 'rley'
require_relative '../error'
require_relative '../datatype/all_datatypes'
require_relative 'literal'

module Loxxy
  module FrontEnd
    # A scanner (tokenizer) for the Lox language.
    # Reference material:
    #   https://craftinginterpreters.com/the-lox-language.html
    #   Section 4.2.1 Token types
    #   Appendix A1.2 Lexical Grammar
    # Responsibility: break input into a sequence of token objects.
    # The tokenizer should recognize:
    #   Identifiers,
    #   Number literals including single digit
    #   String literals (quote delimited)
    #   Delimiters: e.g. parentheses '(', ')'
    #   Separators: e.g. comma
    class Scanner
      # @return [StringScanner] Low-level input scanner
      attr_reader(:scanner)

      # @return [Integer] The current line number
      attr_reader(:lineno)

      # @return [Integer] Position of last start of line in the input
      attr_reader(:line_start)

      # One or two special character tokens.
      # These are enumerated in section 4.2.1 Token type
      @@lexeme2name = {
        '(' => 'LEFT_PAREN',
        ')' => 'RIGHT_PAREN',
        '{' => 'LEFT_BRACE',
        '}' => 'RIGHT_BRACE',
        ',' => 'COMMA',
        '.' => 'DOT',
        '-' => 'MINUS',
        '+' => 'PLUS',
        ';' => 'SEMICOLON',
        '/' => 'SLASH',
        '*' => 'STAR',
        '!' => 'BANG',
        '!=' => 'BANG_EQUAL',
        '=' => 'EQUAL',
        '==' => 'EQUAL_EQUAL',
        '>' => 'GREATER',
        '>=' => 'GREATER_EQUAL',
        '<' => 'LESS',
        '<=' => 'LESS_EQUAL'
      }.freeze

      # Here are all the implemented Lox keywords
      # These are enumerated in section 4.2.1 Token type
      @@keywords = %w[
        and class else false fun for if nil or
        print return super this true var while
      ].map { |x| [x, x] }.to_h.freeze

      # Single characters that have a special meaning when escaped
      # @return [{Char => String}]
      @@escape_chars = {
        'a' => "\a",
        'b' => "\b",
        'e' => "\e",
        'f' => "\f",
        'n' => "\n",
        'r' => "\r",
        's' => "\s",
        't' => "\t",
        'v' => "\v"
      }.freeze

      # Constructor. Initialize a tokenizer for Lox input.
      # @param source [String, nil] Lox text to tokenize.
      def initialize(source = nil)
        @scanner = StringScanner.new('')
        # Position bookkeeping must be valid even when no source is given,
        # otherwise building the EOF token fails on a nil line start.
        @lineno = 1
        @line_start = 0
        start_with(source) if source
      end

      # Reset the tokenizer and make the given text, the current input.
      # @param source [String] Lox text to tokenize.
      def start_with(source)
        @scanner.string = source
        @lineno = 1
        @line_start = 0
      end

      # Scan the source and return an array of tokens.
      # @return [Array] Sequence of token objects; the last one is always
      #   the 'EOF' token.
      # @raise [ScanError] on an unexpected character, an unterminated string
      #   or an unterminated block comment.
      def tokens
        tok_sequence = []
        until @scanner.eos?
          token = _next_token
          tok_sequence << token unless token.nil?
        end
        tok_sequence << build_token('EOF', nil)

        tok_sequence
      end

      private

      # Recognize the next token at the current scanning position.
      # @return [Object, nil] the token found; nil when only whitespace
      #   and/or comments remained in the input.
      # @raise [ScanError] when the next character cannot start any token.
      def _next_token
        skip_intertoken_spaces
        curr_ch = scanner.peek(1)
        return nil if curr_ch.nil? || curr_ch.empty?

        if '(){},.;+-/*'.include? curr_ch
          # Single delimiter or separator character
          build_token(@@lexeme2name[curr_ch], scanner.getch)
        elsif (lexeme = scanner.scan(/[!=><]=?/))
          # One or two special character tokens
          build_token(@@lexeme2name[lexeme], lexeme)
        elsif scanner.scan(/"/) # Start of string detected...
          build_string_token
        elsif (lexeme = scanner.scan(/\d+(?:\.\d+)?/))
          build_token('NUMBER', lexeme)
        elsif (lexeme = scanner.scan(/[a-zA-Z_][a-zA-Z_0-9]*/))
          keyw = @@keywords[lexeme]
          tok_type = keyw ? keyw.upcase : 'IDENTIFIER'
          build_token(tok_type, lexeme)
        else # Unknown token
          # Column is computed before consuming the offending character.
          col = scanner.pos - @line_start + 1
          scanner.scan(/./)
          raise ScanError, "Error: [line #{lineno}:#{col}]: Unexpected character."
        end
      end

      # Create a token object for a lexeme that was just consumed.
      # The column is derived from the current scanner position minus the
      # lexeme length, so this must be called right after scanning the lexeme.
      # @param aSymbolName [String] name of the token type (terminal symbol)
      # @param aLexeme [String, nil] the scanned text; nil for 'EOF'
      # @return [Object] a Literal when the lexeme carries a value,
      #   otherwise a Rley::Lexical::Token
      def build_token(aSymbolName, aLexeme)
        begin
          (value, symb) = convert_to(aLexeme, aSymbolName)
          lex_length = aLexeme ? aLexeme.size : 0
          col = scanner.pos - lex_length - @line_start + 1
          pos = Rley::Lexical::Position.new(@lineno, col)
          if value
            token = Literal.new(value, aLexeme.dup, symb, pos)
          else
            token = Rley::Lexical::Token.new(aLexeme.dup, symb, pos)
          end
        rescue StandardError
          puts "Failing with '#{aSymbolName}' and '#{aLexeme}'"
          raise
        end

        token
      end

      # Map a lexeme to the datatype value it denotes, if any.
      # @param aLexeme [String, nil] the scanned text
      # @param aSymbolName [String] name of the token type
      # @return [Array] a couple [value, symbol name]; value is nil for
      #   tokens that aren't literals.
      def convert_to(aLexeme, aSymbolName)
        value =
          case aSymbolName
          when 'FALSE' then Datatype::False.instance
          when 'NIL' then Datatype::Nil.instance
          when 'NUMBER' then Datatype::Number.new(aLexeme)
          when 'TRUE' then Datatype::True.instance
          end

        [value, aSymbolName]
      end

      # Scan the remainder of a string literal and build its token.
      # precondition: the leading quote was just consumed.
      # @return [Literal] a 'STRING' literal token
      # @raise [ScanError] when the input ends before the closing quote
      def build_string_token
        scan_pos = scanner.pos
        line = @lineno
        column_start = scan_pos - @line_start
        literal = +''
        loop do
          substr = scanner.scan(/[^"\\\r\n]*/)
          if scanner.eos?
            pos_start = "line #{line}:#{column_start}"
            raise ScanError, "Error: [#{pos_start}]: Unterminated string."
          end

          literal << substr
          special = scanner.scan(/["\\\r\n]/)
          case special
          when '"' # Terminating quote found
            break
          when "\r"
            literal << special
            # "\r\n" counts as a single line break. The "\n" must be consumed
            # with an explicit pattern ('.' never matches a newline) and
            # before the start of the next line is recorded.
            line_feed = scanner.scan(/\n/)
            literal << line_feed if line_feed
            next_line
          when "\n"
            next_line
            literal << special
          when '\\'
            ch = scanner.scan(/./)
            next unless ch

            # Unknown escape sequences stand for the escaped character itself
            literal << (@@escape_chars[ch] || ch)
          end
        end
        pos = Rley::Lexical::Position.new(line, column_start)
        lox_string = Datatype::LXString.new(literal)
        # Scanner positions are byte offsets, so slice by bytes: character
        # indexing would overrun when the string holds multibyte characters.
        lexeme = scanner.string.byteslice(scan_pos - 1, scanner.pos - scan_pos + 1)
        Literal.new(lox_string, lexeme, 'STRING', pos)
      end

      # Skip non-significant whitespaces and comments.
      # Advance the scanner until something significant is found.
      # @return [Integer] the scanner position after skipping
      # @raise [ScanError] when a block comment isn't terminated
      def skip_intertoken_spaces
        loop do
          skipped = scanner.skip(/[ \t\f]+/) ? true : false

          if scanner.skip(/(?:\r\n)|\r|\n/)
            skipped = true
            next_line
          end

          case scanner.scan(%r{/[/*]})
          when '//'
            skipped = true
            scanner.skip(/[^\r\n]*/)
            # Only count a new line when the comment is really followed by
            # a line break; a comment may also end at the end of input.
            next_line if scanner.skip(/(?:\r\n)|\r|\n/)
          when '/*'
            skipped = true
            skip_block_comment
          end

          break unless skipped
        end

        scanner.pos
      end

      # Skip the remainder of a block comment, including nested ones.
      # precondition: the opening '/*' was just consumed.
      # @raise [ScanError] when the input ends inside the comment
      def skip_block_comment
        nesting_level = 1
        loop do
          comment_part = scanner.scan_until(/(?:\/\*)|(?:\*\/)|(?:(?:\r\n)|\r|\n)/)
          unless comment_part
            msg = "Unterminated '/* ... */' block comment on line #{lineno}"
            raise ScanError, msg
          end

          case scanner.matched
          when '*/'
            nesting_level -= 1
            break if nesting_level.zero?
          when '/*'
            nesting_level += 1
          else
            # Remaining alternative of the pattern: a line break
            next_line
          end
        end
      end

      # Register that a line break was just consumed: bump the line number
      # and remember where the new line starts.
      def next_line
        @lineno += 1
        @line_start = scanner.pos
      end
    end # class
  end # module
end # module
# End of file