# frozen_string_literal: true
require "strscan"

module GraphQL
  module Language
    class Lexer
      def initialize(graphql_str, filename: nil, max_tokens: nil)
        if !(graphql_str.encoding == Encoding::UTF_8 || graphql_str.ascii_only?)
          graphql_str = graphql_str.dup.force_encoding(Encoding::UTF_8)
        end
        @string = graphql_str
        @filename = filename
        @scanner = StringScanner.new(graphql_str)
        @pos = nil
        @max_tokens = max_tokens || Float::INFINITY
        @tokens_count = 0
      end

      def eos?
        @scanner.eos?
      end

      attr_reader :pos

      def advance
        @scanner.skip(IGNORE_REGEXP)
        return false if @scanner.eos?
        @tokens_count += 1
        if @tokens_count > @max_tokens
          raise_parse_error("This query is too large to execute.")
        end
        @pos = @scanner.pos
        next_byte = @string.getbyte(@pos)
        next_byte_is_for = FIRST_BYTES[next_byte]
        case next_byte_is_for
        when ByteFor::PUNCTUATION
          @scanner.pos += 1
          PUNCTUATION_NAME_FOR_BYTE[next_byte]
        when ByteFor::NAME
          if len = @scanner.skip(KEYWORD_REGEXP)
            case len
            when 2
              :ON
            when 12
              :SUBSCRIPTION
            else
              pos = @pos
              # Use bytes 2 and 3 as a unique identifier for this keyword
              bytes = (@string.getbyte(pos + 2) << 8) | @string.getbyte(pos + 1)
              KEYWORD_BY_TWO_BYTES[_hash(bytes)]
            end
          else
            @scanner.skip(IDENTIFIER_REGEXP)
            :IDENTIFIER
          end
        when ByteFor::IDENTIFIER
          @scanner.skip(IDENTIFIER_REGEXP)
          :IDENTIFIER
        when ByteFor::NUMBER
          @scanner.skip(NUMERIC_REGEXP)

          if GraphQL.reject_numbers_followed_by_names
            new_pos = @scanner.pos
            peek_byte = @string.getbyte(new_pos)
            next_first_byte = FIRST_BYTES[peek_byte]
            if next_first_byte == ByteFor::NAME || next_first_byte == ByteFor::IDENTIFIER
              number_part = token_value
              name_part = @scanner.scan(IDENTIFIER_REGEXP)
              raise_parse_error("Name after number is not allowed (in `#{number_part}#{name_part}`)")
            end
          end

          # Check for a matched decimal:
          @scanner[1] ? :FLOAT : :INT
        when ByteFor::ELLIPSIS
          if @string.getbyte(@pos + 1) != 46 || @string.getbyte(@pos + 2) != 46
            raise_parse_error("Expected `...`, actual: #{@string[@pos..@pos + 2].inspect}")
          end
          @scanner.pos += 3
          :ELLIPSIS
        when ByteFor::STRING
          if @scanner.skip(BLOCK_STRING_REGEXP) || @scanner.skip(QUOTED_STRING_REGEXP)
            :STRING
          else
            raise_parse_error("Expected string or block string, but it was malformed")
          end
        else
          @scanner.pos += 1
          :UNKNOWN_CHAR
        end
      rescue ArgumentError => err
        if err.message == "invalid byte sequence in UTF-8"
          raise_parse_error("Parse error on bad Unicode escape sequence", nil, nil)
        end
      end

      def token_value
        @string.byteslice(@scanner.pos - @scanner.matched_size, @scanner.matched_size)
      rescue StandardError => err
        raise GraphQL::Error, "(token_value failed: #{err.class}: #{err.message})"
      end

      def debug_token_value(token_name)
        if token_name && Lexer::Punctuation.const_defined?(token_name)
          Lexer::Punctuation.const_get(token_name)
        elsif token_name == :ELLIPSIS
          "..."
        elsif token_name == :STRING
          string_value
        elsif @scanner.matched_size.nil?
          @scanner.peek(1)
        else
          token_value
        end
      end

      ESCAPES = /\\["\\\/bfnrt]/
      ESCAPES_REPLACE = {
        '\\"' => '"',
        "\\\\" => "\\",
        "\\/" => '/',
        "\\b" => "\b",
        "\\f" => "\f",
        "\\n" => "\n",
        "\\r" => "\r",
        "\\t" => "\t",
      }
      UTF_8 = /\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
      VALID_STRING = /\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
      ESCAPED = /(?:#{ESCAPES}|#{UTF_8})/o

      def string_value
        str = token_value
        is_block = str.start_with?('"""')
        if is_block
          str.gsub!(/\A"""|"""\z/, '')
          return Language::BlockString.trim_whitespace(str)
        else
          str.gsub!(/\A"|"\z/, '')
          if !str.valid_encoding? || !str.match?(VALID_STRING)
            raise_parse_error("Bad unicode escape in #{str.inspect}")
          else
            Lexer.replace_escaped_characters_in_place(str)

            if !str.valid_encoding?
              raise_parse_error("Bad unicode escape in #{str.inspect}")
            else
              str
            end
          end
        end
      end
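
      # A rough usage sketch of `string_value` (not executed here), assuming the
      # scanner has just matched a quoted-string token containing a Unicode
      # surrogate-pair escape:
      #
      #   lexer = GraphQL::Language::Lexer.new('"\ud83d\ude00"')
      #   lexer.advance        # => :STRING
      #   lexer.string_value   # => "😀" (the pair combines into U+1F600)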

      def line_number
        @scanner.string[0..@pos].count("\n") + 1
      end

      def column_number
        @scanner.string[0..@pos].split("\n").last.length
      end

      def raise_parse_error(message, line = line_number, col = column_number)
        raise GraphQL::ParseError.new(message, line, col, @string, filename: @filename)
      end

      IGNORE_REGEXP = %r{
        (?:
          [, \c\r\n\t]+ |
          \#.*$
        )*
      }x
      IDENTIFIER_REGEXP = /[_A-Za-z][_0-9A-Za-z]*/
      INT_REGEXP = /-?(?:[0]|[1-9][0-9]*)/
      FLOAT_DECIMAL_REGEXP = /[.][0-9]+/
      FLOAT_EXP_REGEXP = /[eE][+-]?[0-9]+/
      # A number with a fractional part and/or an exponent part lexes as a FLOAT;
      # a bare integer part lexes as an INT. (Per the GraphQL spec, `1e5` is a
      # valid FloatValue: an IntegerPart followed by an ExponentPart.)
      NUMERIC_REGEXP = /#{INT_REGEXP}(#{FLOAT_DECIMAL_REGEXP}#{FLOAT_EXP_REGEXP}|#{FLOAT_DECIMAL_REGEXP}|#{FLOAT_EXP_REGEXP})?/

      KEYWORDS = [
        "on",
        "fragment",
        "true",
        "false",
        "null",
        "query",
        "mutation",
        "subscription",
        "schema",
        "scalar",
        "type",
        "extend",
        "implements",
        "interface",
        "union",
        "enum",
        "input",
        "directive",
        "repeatable"
      ].freeze

      KEYWORD_REGEXP = /#{Regexp.union(KEYWORDS.sort)}\b/

      KEYWORD_BY_TWO_BYTES = [
        :INTERFACE, :MUTATION, :EXTEND, :FALSE, :ENUM, :TRUE, :NULL,
        nil, nil, nil, nil, nil, nil, nil,
        :QUERY, nil, nil,
        :REPEATABLE, :IMPLEMENTS, :INPUT, :TYPE, :SCHEMA,
        nil, nil, nil,
        :DIRECTIVE, :UNION, nil, nil,
        :SCALAR, nil, :FRAGMENT
      ]

      # This produces a unique integer for bytes 2 and 3 of each keyword string
      # See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html
      def _hash(key)
        (key * 18592990) >> 27 & 0x1f
      end

      module Punctuation
        LCURLY =   '{'
        RCURLY =   '}'
        LPAREN =   '('
        RPAREN =   ')'
        LBRACKET = '['
        RBRACKET = ']'
        COLON =    ':'
        VAR_SIGN = '$'
        DIR_SIGN = '@'
        EQUALS =   '='
        BANG =     '!'
        PIPE =     '|'
        AMP =      '&'
      end

      # A sparse array mapping the bytes for each punctuation
      # to a symbol name for that punctuation
      PUNCTUATION_NAME_FOR_BYTE = Punctuation.constants.each_with_object([]) { |name, arr|
        punct = Punctuation.const_get(name)
        arr[punct.ord] = name
      }

      QUOTE =         '"'
      UNICODE_DIGIT = /[0-9A-Za-z]/
      FOUR_DIGIT_UNICODE = /#{UNICODE_DIGIT}{4}/
      N_DIGIT_UNICODE = %r{#{Punctuation::LCURLY}#{UNICODE_DIGIT}{4,}#{Punctuation::RCURLY}}x
      UNICODE_ESCAPE = %r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}
      STRING_ESCAPE = %r{[\\][\\/bfnrt]}
      BLOCK_QUOTE =   '"""'
      ESCAPED_QUOTE = /\\"/
      STRING_CHAR = /#{ESCAPED_QUOTE}|[^"\\\n\r]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/
      QUOTED_STRING_REGEXP = %r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x
      BLOCK_STRING_REGEXP = %r{
        #{BLOCK_QUOTE}
        (?: [^"\\]               |  # Any characters that aren't a quote or slash
           (?<!") ["]{1,2} (?!") |  # Any quotes that don't have quotes next to them
           \\"{0,3}(?!")         |  # A slash followed by <= 3 quotes that aren't followed by a quote
           \\                       # A slash
        )*
        "{0,2}
        #{BLOCK_QUOTE}
      }xm

      # The lexer dispatches on the first byte of each token; these are the
      # possible categories for a byte, and FIRST_BYTES maps each byte to one.
      module ByteFor
        NUMBER = 0      # int or float
        NAME = 1        # identifier or keyword
        STRING = 2
        ELLIPSIS = 3
        IDENTIFIER = 4  # identifier, *not* a keyword
        PUNCTUATION = 5
      end

      FIRST_BYTES = Array.new(255)
      (0..9).each { |i| FIRST_BYTES[i.to_s.ord] = ByteFor::NUMBER }
      FIRST_BYTES["-".ord] = ByteFor::NUMBER
      ("A".ord.."Z".ord).each { |byte| FIRST_BYTES[byte] = ByteFor::IDENTIFIER }
      ("a".ord.."z".ord).each { |byte| FIRST_BYTES[byte] = ByteFor::IDENTIFIER }
      FIRST_BYTES["_".ord] = ByteFor::IDENTIFIER
      FIRST_BYTES[".".ord] = ByteFor::ELLIPSIS
      FIRST_BYTES['"'.ord] = ByteFor::STRING
      # Keyword first-bytes overwrite the plain-identifier entries above
      KEYWORDS.each { |kw| FIRST_BYTES[kw.getbyte(0)] = ByteFor::NAME }
      Punctuation.constants.each do |punct_name|
        punct = Punctuation.const_get(punct_name)
        FIRST_BYTES[punct.ord] = ByteFor::PUNCTUATION
      end

      # Replace any escaped unicode or whitespace with the _actual_ characters.
      # To avoid allocating more strings, this modifies the string passed into it.
      def self.replace_escaped_characters_in_place(raw_string)
        raw_string.gsub!(ESCAPED) do |matched_str|
          if (point_str_1 = $1 || $2)
            codepoint_1 = point_str_1.to_i(16)
            if (codepoint_2 = $3)
              codepoint_2 = codepoint_2.to_i(16)
              if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
                  (codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF)   # trailing surrogate
                # A surrogate pair
                combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
                [combined].pack('U'.freeze)
              else
                # Two separate code points
                [codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
              end
            else
              [codepoint_1].pack('U'.freeze)
            end
          else
            ESCAPES_REPLACE[matched_str]
          end
        end
        nil
      end

      # This is not used during parsing because the parser
      # doesn't actually need tokens.
      def self.tokenize(string)
        lexer = GraphQL::Language::Lexer.new(string)
        tokens = []
        while (token_name = lexer.advance)
          new_token = [
            token_name,
            lexer.line_number,
            lexer.column_number,
            lexer.debug_token_value(token_name),
          ]
          tokens << new_token
        end
        tokens
      end
    end
  end
end
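
# A minimal usage sketch of the debugging helper above (the parser itself
# drives the lexer through `advance` and never builds this array). For a
# one-line document, the result is roughly:
#
#   GraphQL::Language::Lexer.tokenize("query { viewer }")
#   # => [
#   #      [:QUERY,      1, 1,  "query"],
#   #      [:LCURLY,     1, 7,  "{"],
#   #      [:IDENTIFIER, 1, 9,  "viewer"],
#   #      [:RCURLY,     1, 16, "}"],
#   #    ]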