%%machine lex; # % fix highlighting # # === BEFORE YOU START === # # Read the Ruby Hacking Guide chapter 11, available in English at # http://whitequark.org/blog/2013/04/01/ruby-hacking-guide-ch-11-finite-state-lexer/ # # Remember two things about Ragel scanners: # # 1) Longest match wins. # # 2) If two matches have the same length, the first # in source code wins. # # General rules of making Ragel and Bison happy: # # * `p` (position) and `@te` contain the index of the character # they're pointing to ("current"), plus one. `@ts` contains the index # of the corresponding character. The code for extracting matched token is: # # @source_buffer.slice(@ts...@te) # # * If your input is `foooooooobar` and the rule is: # # 'f' 'o'+ # # the result will be: # # foooooooobar # ^ ts=0 ^ p=te=9 # # * A Ragel lexer action should not emit more than one token, unless # you know what you are doing. # # * All Ragel commands (fnext, fgoto, ...) end with a semicolon. # # * If an action emits the token and transitions to another state, use # these Ragel commands: # # emit($whatever) # fnext $next_state; fbreak; # # If you perform `fgoto` in an action which does not emit a token nor # rewinds the stream pointer, the parser's side-effectful, # context-sensitive lookahead actions will break in a hard to detect # and debug way. # # * If an action does not emit a token: # # fgoto $next_state; # # * If an action features lookbehind, i.e. matches characters with the # intent of passing them to another action: # # p = @ts - 1 # fgoto $next_state; # # or, if the lookbehind consists of a single character: # # fhold; fgoto $next_state; # # * Ragel merges actions. So, if you have `e_lparen = '(' %act` and # `c_lparen = '('` and a lexer action `e_lparen | c_lparen`, the result # _will_ invoke the action `act`. # # e_something stands for "something with **e**mbedded action". # # * EOF is explicit and is matched by `c_eof`. If you want to introspect # the state of the lexer, add this rule to the state: # # c_eof => do_eof; # # * If you proceed past EOF, the lexer will complain: # # NoMethodError: undefined method `ord' for nil:NilClass # class Parser::Lexer %% write data nofinal; # % ESCAPES = { ?a.ord => "\a", ?b.ord => "\b", ?e.ord => "\e", ?f.ord => "\f", ?n.ord => "\n", ?r.ord => "\r", ?s.ord => "\s", ?t.ord => "\t", ?v.ord => "\v", ?\\.ord => "\\" }.freeze REGEXP_META_CHARACTERS = Regexp.union(*"\\$()*+.<>?[]^{|}".chars).freeze attr_reader :source_buffer attr_accessor :diagnostics attr_accessor :static_env attr_accessor :force_utf32 attr_accessor :cond, :cmdarg, :in_kwarg, :context, :command_start attr_accessor :tokens, :comments def initialize(version) @version = version @static_env = nil @context = nil @tokens = nil @comments = nil reset end def reset(reset_state=true) # Ragel state: if reset_state # Unit tests set state prior to resetting lexer. @cs = self.class.lex_en_line_begin @cond = StackState.new('cond') @cmdarg = StackState.new('cmdarg') @cond_stack = [] @cmdarg_stack = [] end @force_utf32 = false # Set to true by some tests @source_pts = nil # @source as a codepoint array @p = 0 # stream position (saved manually in #advance) @ts = nil # token start @te = nil # token end @act = 0 # next action @stack = [] # state stack @top = 0 # state stack top pointer # Lexer state: @token_queue = [] @literal_stack = [] @eq_begin_s = nil # location of last encountered =begin @sharp_s = nil # location of last encountered # @newline_s = nil # location of last encountered newline @num_base = nil # last numeric base @num_digits_s = nil # starting position of numeric digits @num_suffix_s = nil # starting position of numeric suffix @num_xfrm = nil # numeric suffix-induced transformation @escape_s = nil # starting position of current sequence @escape = nil # last escaped sequence, as string @herebody_s = nil # starting position of current heredoc line # Ruby 1.9 ->() lambdas emit a distinct token if do/{ is # encountered after a matching closing parenthesis. @paren_nest = 0 @lambda_stack = [] # After encountering the closing line of <<~SQUIGGLY_HEREDOC, # we store the indentation level and give it out to the parser # on request. It is not possible to infer indentation level just # from the AST because escape sequences such as `\ ` or `\t` are # expanded inside the lexer, but count as non-whitespace for # indentation purposes. @dedent_level = nil # If the lexer is in `command state' (aka expr_value) # at the entry to #advance, it will transition to expr_cmdarg # instead of expr_arg at certain points. @command_start = true # True at the end of "def foo a:" @in_kwarg = false # State before =begin / =end block comment @cs_before_block_comment = self.class.lex_en_line_begin end def source_buffer=(source_buffer) @source_buffer = source_buffer if @source_buffer source = @source_buffer.source if source.encoding == Encoding::UTF_8 @source_pts = source.unpack('U*') else @source_pts = source.unpack('C*') end if @source_pts[0] == 0xfeff # Skip byte order mark. @p = 1 end else @source_pts = nil end end def encoding @source_buffer.source.encoding end LEX_STATES = { :line_begin => lex_en_line_begin, :expr_dot => lex_en_expr_dot, :expr_fname => lex_en_expr_fname, :expr_value => lex_en_expr_value, :expr_beg => lex_en_expr_beg, :expr_mid => lex_en_expr_mid, :expr_arg => lex_en_expr_arg, :expr_cmdarg => lex_en_expr_cmdarg, :expr_end => lex_en_expr_end, :expr_endarg => lex_en_expr_endarg, :expr_endfn => lex_en_expr_endfn, :expr_labelarg => lex_en_expr_labelarg, :interp_string => lex_en_interp_string, :interp_words => lex_en_interp_words, :plain_string => lex_en_plain_string, :plain_words => lex_en_plain_string, } def state LEX_STATES.invert.fetch(@cs, @cs) end def state=(state) @cs = LEX_STATES.fetch(state) end def push_cmdarg @cmdarg_stack.push(@cmdarg) @cmdarg = StackState.new("cmdarg.#{@cmdarg_stack.count}") end def pop_cmdarg @cmdarg = @cmdarg_stack.pop end def push_cond @cond_stack.push(@cond) @cond = StackState.new("cond.#{@cond_stack.count}") end def pop_cond @cond = @cond_stack.pop end def dedent_level # We erase @dedent_level as a precaution to avoid accidentally # using a stale value. dedent_level, @dedent_level = @dedent_level, nil dedent_level end # Return next token: [type, value]. def advance if @token_queue.any? return @token_queue.shift end # Ugly, but dependent on Ragel output. Consider refactoring it somehow. klass = self.class _lex_trans_keys = klass.send :_lex_trans_keys _lex_key_spans = klass.send :_lex_key_spans _lex_index_offsets = klass.send :_lex_index_offsets _lex_indicies = klass.send :_lex_indicies _lex_trans_targs = klass.send :_lex_trans_targs _lex_trans_actions = klass.send :_lex_trans_actions _lex_to_state_actions = klass.send :_lex_to_state_actions _lex_from_state_actions = klass.send :_lex_from_state_actions _lex_eof_trans = klass.send :_lex_eof_trans pe = @source_pts.size + 2 p, eof = @p, pe cmd_state = @command_start @command_start = false %% write exec; # % # Ragel creates a local variable called `testEof` but it doesn't use # it in any assignment. This dead code is here to swallow the warning. # It has no runtime cost because Ruby doesn't produce any instructions from it. if false testEof end @p = p if @token_queue.any? @token_queue.shift elsif @cs == klass.lex_error [ false, [ '$error'.freeze, range(p - 1, p) ] ] else eof = @source_pts.size [ false, [ '$eof'.freeze, range(eof, eof) ] ] end end protected def eof_codepoint?(point) [0x04, 0x1a, 0x00].include? point end def version?(*versions) versions.include?(@version) end def stack_pop @top -= 1 @stack[@top] end def encode_escape(ord) ord.chr.force_encoding(@source_buffer.source.encoding) end def tok(s = @ts, e = @te) @source_buffer.slice(s...e) end def range(s = @ts, e = @te) Parser::Source::Range.new(@source_buffer, s, e) end def emit(type, value = tok, s = @ts, e = @te) token = [ type, [ value, range(s, e) ] ] @token_queue.push(token) @tokens.push(token) if @tokens token end def emit_table(table, s = @ts, e = @te) value = tok(s, e) emit(table[value], value, s, e) end def emit_do(do_block=false) if @cond.active? emit(:kDO_COND, 'do'.freeze) elsif @cmdarg.active? || do_block emit(:kDO_BLOCK, 'do'.freeze) else emit(:kDO, 'do'.freeze) end end def arg_or_cmdarg(cmd_state) if cmd_state self.class.lex_en_expr_cmdarg else self.class.lex_en_expr_arg end end def emit_comment(s = @ts, e = @te) if @comments @comments.push(Parser::Source::Comment.new(range(s, e))) end if @tokens @tokens.push([ :tCOMMENT, [ tok(s, e), range(s, e) ] ]) end nil end def diagnostic(type, reason, arguments=nil, location=range, highlights=[]) @diagnostics.process( Parser::Diagnostic.new(type, reason, arguments, location, highlights)) end # # === LITERAL STACK === # def push_literal(*args) new_literal = Literal.new(self, *args) @literal_stack.push(new_literal) next_state_for_literal(new_literal) end def next_state_for_literal(literal) if literal.words? && literal.backslash_delimited? if literal.interpolate? self.class.lex_en_interp_backslash_delimited_words else self.class.lex_en_plain_backslash_delimited_words end elsif literal.words? && !literal.backslash_delimited? if literal.interpolate? self.class.lex_en_interp_words else self.class.lex_en_plain_words end elsif !literal.words? && literal.backslash_delimited? if literal.interpolate? self.class.lex_en_interp_backslash_delimited else self.class.lex_en_plain_backslash_delimited end else if literal.interpolate? self.class.lex_en_interp_string else self.class.lex_en_plain_string end end end def literal @literal_stack.last end def pop_literal old_literal = @literal_stack.pop @dedent_level = old_literal.dedent_level if old_literal.type == :tREGEXP_BEG # Fetch modifiers. self.class.lex_en_regexp_modifiers else self.class.lex_en_expr_end end end # Mapping of strings to parser tokens. PUNCTUATION = { '=' => :tEQL, '&' => :tAMPER2, '|' => :tPIPE, '!' => :tBANG, '^' => :tCARET, '+' => :tPLUS, '-' => :tMINUS, '*' => :tSTAR2, '/' => :tDIVIDE, '%' => :tPERCENT, '~' => :tTILDE, ',' => :tCOMMA, ';' => :tSEMI, '.' => :tDOT, '..' => :tDOT2, '...' => :tDOT3, '[' => :tLBRACK2, ']' => :tRBRACK, '(' => :tLPAREN2, ')' => :tRPAREN, '?' => :tEH, ':' => :tCOLON, '&&' => :tANDOP, '||' => :tOROP, '-@' => :tUMINUS, '+@' => :tUPLUS, '~@' => :tTILDE, '**' => :tPOW, '->' => :tLAMBDA, '=~' => :tMATCH, '!~' => :tNMATCH, '==' => :tEQ, '!=' => :tNEQ, '>' => :tGT, '>>' => :tRSHFT, '>=' => :tGEQ, '<' => :tLT, '<<' => :tLSHFT, '<=' => :tLEQ, '=>' => :tASSOC, '::' => :tCOLON2, '===' => :tEQQ, '<=>' => :tCMP, '[]' => :tAREF, '[]=' => :tASET, '{' => :tLCURLY, '}' => :tRCURLY, '`' => :tBACK_REF2, '!@' => :tBANG, '&.' => :tANDDOT, } PUNCTUATION_BEGIN = { '&' => :tAMPER, '*' => :tSTAR, '**' => :tDSTAR, '+' => :tUPLUS, '-' => :tUMINUS, '::' => :tCOLON3, '(' => :tLPAREN, '{' => :tLBRACE, '[' => :tLBRACK, } KEYWORDS = { 'if' => :kIF_MOD, 'unless' => :kUNLESS_MOD, 'while' => :kWHILE_MOD, 'until' => :kUNTIL_MOD, 'rescue' => :kRESCUE_MOD, 'defined?' => :kDEFINED, 'BEGIN' => :klBEGIN, 'END' => :klEND, } KEYWORDS_BEGIN = { 'if' => :kIF, 'unless' => :kUNLESS, 'while' => :kWHILE, 'until' => :kUNTIL, 'rescue' => :kRESCUE, 'defined?' => :kDEFINED, 'BEGIN' => :klBEGIN, 'END' => :klEND, } %w(class module def undef begin end then elsif else ensure case when for break next redo retry in do return yield super self nil true false and or not alias __FILE__ __LINE__ __ENCODING__).each do |keyword| KEYWORDS_BEGIN[keyword] = KEYWORDS[keyword] = :"k#{keyword.upcase}" end %%{ # % access @; getkey (@source_pts[p] || 0); # === CHARACTER CLASSES === # # Pay close attention to the differences between c_any and any. # c_any does not include EOF and so will cause incorrect behavior # for machine subtraction (any-except rules) and default transitions # for scanners. action do_nl { # Record position of a newline for precise location reporting on tNL # tokens. # # This action is embedded directly into c_nl, as it is idempotent and # there are no cases when we need to skip it. @newline_s = p } c_nl = '\n' $ do_nl; c_space = [ \t\r\f\v]; c_space_nl = c_space | c_nl; c_eof = 0x04 | 0x1a | 0 | zlen; # ^D, ^Z, \0, EOF c_eol = c_nl | c_eof; c_any = any - c_eof; c_nl_zlen = c_nl | zlen; c_line = any - c_nl_zlen; c_unicode = c_any - 0x00..0x7f; c_upper = [A-Z]; c_lower = [a-z_] | c_unicode; c_alpha = c_lower | c_upper; c_alnum = c_alpha | [0-9]; action do_eof { # Sit at EOF indefinitely. #advance would return $eof each time. # This allows to feed the lexer more data if needed; this is only used # in tests. # # Note that this action is not embedded into e_eof like e_heredoc_nl and e_bs # below. This is due to the fact that scanner state at EOF is observed # by tests, and encapsulating it in a rule would break the introspection. fhold; fbreak; } # # === TOKEN DEFINITIONS === # # All operators are punctuation. There is more to punctuation # than just operators. Operators can be overridden by user; # punctuation can not. # A list of operators which are valid in the function name context, but # have different semantics in others. operator_fname = '[]' | '[]=' | '`' | '-@' | '+@' | '~@' | '!@' ; # A list of operators which can occur within an assignment shortcut (+ → +=). operator_arithmetic = '&' | '|' | '&&' | '||' | '^' | '+' | '-' | '*' | '/' | '**' | '~' | '<<' | '>>' | '%' ; # A list of all user-definable operators not covered by groups above. operator_rest = '=~' | '!~' | '==' | '!=' | '!' | '===' | '<' | '<=' | '>' | '>=' | '<=>' | '=>' ; # Note that `{` and `}` need to be referred to as e_lbrace and e_rbrace, # as they are ambiguous with interpolation `#{}` and should be counted. # These braces are not present in punctuation lists. # A list of punctuation which has different meaning when used at the # beginning of expression. punctuation_begin = '-' | '+' | '::' | '(' | '[' | '*' | '**' | '&' ; # A list of all punctuation except punctuation_begin. punctuation_end = ',' | '=' | '->' | '(' | '[' | ']' | '::' | '?' | ':' | '.' | '..' | '...' ; # A list of keywords which have different meaning at the beginning of expression. keyword_modifier = 'if' | 'unless' | 'while' | 'until' | 'rescue' ; # A list of keywords which accept an argument-like expression, i.e. have the # same post-processing as method calls or commands. Example: `yield 1`, # `yield (1)`, `yield(1)`, are interpreted as if `yield` was a function. keyword_with_arg = 'yield' | 'super' | 'not' | 'defined?' ; # A list of keywords which accept a literal function name as an argument. keyword_with_fname = 'def' | 'undef' | 'alias' ; # A list of keywords which accept an expression after them. keyword_with_value = 'else' | 'case' | 'ensure' | 'module' | 'elsif' | 'then' | 'for' | 'in' | 'do' | 'when' | 'begin' | 'class' | 'and' | 'or' ; # A list of keywords which accept a value, and treat the keywords from # `keyword_modifier` list as modifiers. keyword_with_mid = 'rescue' | 'return' | 'break' | 'next' ; # A list of keywords which do not accept an expression after them. keyword_with_end = 'end' | 'self' | 'true' | 'false' | 'retry' | 'redo' | 'nil' | 'BEGIN' | 'END' | '__FILE__' | '__LINE__' | '__ENCODING__'; # All keywords. keyword = keyword_with_value | keyword_with_mid | keyword_with_end | keyword_with_arg | keyword_with_fname | keyword_modifier ; constant = c_upper c_alnum*; bareword = c_alpha c_alnum*; call_or_var = c_lower c_alnum*; class_var = '@@' bareword; instance_var = '@' bareword; global_var = '$' ( bareword | digit+ | [`'+~*$&?!@/\\;,.=:<>"] # ` | '-' c_alnum ) ; # Ruby accepts (and fails on) variables with leading digit # in literal context, but not in unquoted symbol body. class_var_v = '@@' c_alnum+; instance_var_v = '@' c_alnum+; label = bareword [?!]? ':'; # # === NUMERIC PARSING === # int_hex = ( xdigit+ '_' )* xdigit* '_'? ; int_dec = ( digit+ '_' )* digit* '_'? ; int_bin = ( [01]+ '_' )* [01]* '_'? ; flo_int = [1-9] [0-9]* ( '_' digit+ )* | '0'; flo_frac = '.' ( digit+ '_' )* digit+; flo_pow = [eE] [+\-]? ( digit+ '_' )* digit+; int_suffix = '' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars) } } | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } } | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, chars)) } } | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } } | 're' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } } | 'if' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 2); p -= 2 } } | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tINTEGER, chars, @ts, @te - 6); p -= 6 } }; flo_pow_suffix = '' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars)) } } | 'i' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Float(chars))) } } | 'if' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 2); p -= 2 } }; flo_suffix = flo_pow_suffix | 'r' % { @num_xfrm = lambda { |chars| emit(:tRATIONAL, Rational(chars)) } } | 'ri' % { @num_xfrm = lambda { |chars| emit(:tIMAGINARY, Complex(0, Rational(chars))) } } | 'rescue' % { @num_xfrm = lambda { |chars| emit(:tFLOAT, Float(chars), @ts, @te - 6); p -= 6 } }; # # === ESCAPE SEQUENCE PARSING === # # Escape parsing code is a Ragel pattern, not a scanner, and therefore # it shouldn't directly raise errors or perform other actions with side effects. # In reality this would probably just mess up error reporting in pathological # cases, through. # The amount of code required to parse \M\C stuff correctly is ridiculous. escaped_nl = "\\" c_nl; action unicode_points { @escape = "" codepoints = tok(@escape_s + 2, p - 1) codepoint_s = @escape_s + 2 if @version < 24 if codepoints.start_with?(" ") || codepoints.start_with?("\t") diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s + 2, @escape_s + 3) end if spaces_p = codepoints.index(/[ \t]{2}/) diagnostic :fatal, :invalid_unicode_escape, nil, range(codepoint_s + spaces_p + 1, codepoint_s + spaces_p + 2) end if codepoints.end_with?(" ") || codepoints.end_with?("\t") diagnostic :fatal, :invalid_unicode_escape, nil, range(p - 1, p) end end codepoints.scan(/([0-9a-fA-F]+)|([ \t]+)/).each do |(codepoint_str, spaces)| if spaces codepoint_s += spaces.length else codepoint = codepoint_str.to_i(16) if codepoint >= 0x110000 diagnostic :error, :unicode_point_too_large, nil, range(codepoint_s, codepoint_s + codepoint_str.length) break end @escape += codepoint.chr(Encoding::UTF_8) codepoint_s += codepoint_str.length end end } action unescape_char { codepoint = @source_pts[p - 1] if (@escape = ESCAPES[codepoint]).nil? @escape = encode_escape(@source_buffer.slice(p - 1)) end } action invalid_complex_escape { diagnostic :fatal, :invalid_escape } action read_post_meta_or_ctrl_char { @escape = @source_buffer.slice(p - 1).chr if @version >= 27 && ((0..8).include?(@escape.ord) || (14..31).include?(@escape.ord)) diagnostic :fatal, :invalid_escape end } action slash_c_char { @escape = encode_escape(@escape[0].ord & 0x9f) } action slash_m_char { @escape = encode_escape(@escape[0].ord | 0x80) } maybe_escaped_char = ( '\\' c_any %unescape_char | ( c_any - [\\] ) %read_post_meta_or_ctrl_char ); maybe_escaped_ctrl_char = ( # why?! '\\' c_any %unescape_char %slash_c_char | '?' % { @escape = "\x7f" } | ( c_any - [\\?] ) %read_post_meta_or_ctrl_char %slash_c_char ); escape = ( # \377 [0-7]{1,3} % { @escape = encode_escape(tok(@escape_s, p).to_i(8) % 0x100) } # \xff | 'x' xdigit{1,2} % { @escape = encode_escape(tok(@escape_s + 1, p).to_i(16)) } # %q[\x] | 'x' ( c_any - xdigit ) % { diagnostic :fatal, :invalid_hex_escape, nil, range(@escape_s - 1, p + 2) } # \u263a | 'u' xdigit{4} % { @escape = tok(@escape_s + 1, p).to_i(16).chr(Encoding::UTF_8) } # \u123 | 'u' xdigit{0,3} % { diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p) } # u{not hex} or u{} | 'u{' ( c_any - xdigit - [ \t}] )* '}' % { diagnostic :fatal, :invalid_unicode_escape, nil, range(@escape_s - 1, p) } # \u{ \t 123 \t 456 \t\t } | 'u{' [ \t]* ( xdigit{1,6} [ \t]+ )* ( ( xdigit{1,6} [ \t]* '}' %unicode_points ) | ( xdigit* ( c_any - xdigit - [ \t}] )+ '}' | ( c_any - [ \t}] )* c_eof | xdigit{7,} ) % { diagnostic :fatal, :unterminated_unicode, nil, range(p - 1, p) } ) # \C-\a \cx | ( 'C-' | 'c' ) escaped_nl? maybe_escaped_ctrl_char # \M-a | 'M-' escaped_nl? maybe_escaped_char %slash_m_char # \C-\M-f \M-\cf \c\M-f | ( ( 'C-' | 'c' ) escaped_nl? '\\M-' | 'M-\\' escaped_nl? ( 'C-' | 'c' ) ) escaped_nl? maybe_escaped_ctrl_char %slash_m_char | 'C' c_any %invalid_complex_escape | 'M' c_any %invalid_complex_escape | ( 'M-\\C' | 'C-\\M' ) c_any %invalid_complex_escape | ( c_any - [0-7xuCMc] ) %unescape_char | c_eof % { diagnostic :fatal, :escape_eof, nil, range(p - 1, p) } ); # Use rules in form of `e_bs escape' when you need to parse a sequence. e_bs = '\\' % { @escape_s = p @escape = nil }; # # === STRING AND HEREDOC PARSING === # # Heredoc parsing is quite a complex topic. First, consider that heredocs # can be arbitrarily nested. For example: # # puts <= 2.2 if @version >= 22 && !@cond.active? lookahead = @source_buffer.slice(@te...@te+2) end current_literal = literal if !current_literal.heredoc? && (token = current_literal.nest_and_try_closing(string, @ts, @te, lookahead)) if token[0] == :tLABEL_END p += 1 pop_literal fnext expr_labelarg; else fnext *pop_literal; end fbreak; else current_literal.extend_string(string, @ts, @te) end } action extend_string_escaped { current_literal = literal # Get the first character after the backslash. escaped_char = @source_buffer.slice(@escape_s).chr if current_literal.munge_escape? escaped_char # If this particular literal uses this character as an opening # or closing delimiter, it is an escape sequence for that # particular character. Write it without the backslash. if current_literal.regexp? && REGEXP_META_CHARACTERS.match(escaped_char) # Regular expressions should include escaped delimiters in their # escaped form, except when the escaped character is # a closing delimiter but not a regexp metacharacter. # # The backslash itself cannot be used as a closing delimiter # at the same time as an escape symbol, but it is always munged, # so this branch also executes for the non-closing-delimiter case # for the backslash. current_literal.extend_string(tok, @ts, @te) else current_literal.extend_string(escaped_char, @ts, @te) end else # It does not. So this is an actual escape sequence, yay! if current_literal.squiggly_heredoc? && escaped_char == "\n".freeze # Squiggly heredocs like # <<~-HERE # 1\ # 2 # HERE # treat '\' as a line continuation, but still dedent the body, so the heredoc above becomes "12\n". # This information is emitted as is, without escaping, # later this escape sequence (\\\n) gets handled manually in the Lexer::Dedenter current_literal.extend_string(tok, @ts, @te) elsif current_literal.supports_line_continuation_via_slash? && escaped_char == "\n".freeze # Heredocs, regexp and a few other types of literals support line # continuation via \\\n sequence. The code like # "a\ # b" # must be parsed as "ab" current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te) elsif current_literal.regexp? # Regular expressions should include escape sequences in their # escaped form. On the other hand, escaped newlines are removed (in cases like "\\C-\\\n\\M-x") current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te) else current_literal.extend_string(@escape || tok, @ts, @te) end end } # Extend a string with a newline or a EOF character. # As heredoc closing line can immediately precede EOF, this action # has to handle such case specially. action extend_string_eol { current_literal = literal if @te == pe diagnostic :fatal, :string_eof, nil, range(current_literal.str_s, current_literal.str_s + 1) end if current_literal.heredoc? line = tok(@herebody_s, @ts).gsub(/\r+$/, ''.freeze) if version?(18, 19, 20) # See ruby:c48b4209c line = line.gsub(/\r.*$/, ''.freeze) end # Try ending the heredoc with the complete most recently # scanned line. @herebody_s always refers to the start of such line. if current_literal.nest_and_try_closing(line, @herebody_s, @ts) # Adjust @herebody_s to point to the next line. @herebody_s = @te # Continue regular lexing after the heredoc reference (< 2.7 # If interpolated instance/class variable starts with a digit we parse it as a plain substring # However, "#$1" is still a regular interpolation interp_digit_var = '#' ('@' | '@@') digit c_alpha*; action extend_interp_digit_var { if @version >= 27 literal.extend_string(tok, @ts, @te) else message = tok.start_with?('#@@') ? :cvar_name : :ivar_name diagnostic :error, message, { :name => tok(@ts + 1, @te) }, range(@ts + 1, @te) end } # Interpolations with code blocks must match nested curly braces, as # interpolation ending is ambiguous with a block ending. So, every # opening and closing brace should be matched with e_[lr]brace rules, # which automatically perform the counting. # # Note that interpolations can themselves be nested, so brace balance # is tied to the innermost literal. # # Also note that literals themselves should not use e_[lr]brace rules # when matching their opening and closing delimiters, as the amount of # braces inside the characters of a string literal is independent. interp_code = '#{'; e_lbrace = '{' % { @cond.push(false); @cmdarg.push(false) current_literal = literal if current_literal current_literal.start_interp_brace end }; e_rbrace = '}' % { current_literal = literal if current_literal if current_literal.end_interp_brace_and_try_closing if version?(18, 19) emit(:tRCURLY, '}'.freeze, p - 1, p) @cond.lexpop @cmdarg.lexpop else emit(:tSTRING_DEND, '}'.freeze, p - 1, p) end if current_literal.saved_herebody_s @herebody_s = current_literal.saved_herebody_s end fhold; fnext *next_state_for_literal(current_literal); fbreak; end end @paren_nest -= 1 }; action extend_interp_code { current_literal = literal current_literal.flush_string current_literal.extend_content emit(:tSTRING_DBEG, '#{'.freeze) if current_literal.heredoc? current_literal.saved_herebody_s = @herebody_s @herebody_s = nil end current_literal.start_interp_brace @command_start = true fnext expr_value; fbreak; } # Actual string parsers are simply combined from the primitives defined # above. interp_words := |* interp_code => extend_interp_code; interp_digit_var => extend_interp_digit_var; interp_var => extend_interp_var; e_bs escape => extend_string_escaped; c_space+ => extend_string_space; c_eol => extend_string_eol; c_any => extend_string; *|; interp_string := |* interp_code => extend_interp_code; interp_digit_var => extend_interp_digit_var; interp_var => extend_interp_var; e_bs escape => extend_string_escaped; c_eol => extend_string_eol; c_any => extend_string; *|; plain_words := |* e_bs c_any => extend_string_escaped; c_space+ => extend_string_space; c_eol => extend_string_eol; c_any => extend_string; *|; plain_string := |* '\\' c_nl => extend_string_eol; e_bs c_any => extend_string_escaped; c_eol => extend_string_eol; c_any => extend_string; *|; interp_backslash_delimited := |* interp_code => extend_interp_code; interp_digit_var => extend_interp_digit_var; interp_var => extend_interp_var; c_eol => extend_string_eol; c_any => extend_string; *|; plain_backslash_delimited := |* c_eol => extend_string_eol; c_any => extend_string; *|; interp_backslash_delimited_words := |* interp_code => extend_interp_code; interp_digit_var => extend_interp_digit_var; interp_var => extend_interp_var; c_space+ => extend_string_space; c_eol => extend_string_eol; c_any => extend_string; *|; plain_backslash_delimited_words := |* c_space+ => extend_string_space; c_eol => extend_string_eol; c_any => extend_string; *|; regexp_modifiers := |* [A-Za-z]+ => { unknown_options = tok.scan(/[^imxouesn]/) if unknown_options.any? diagnostic :error, :regexp_options, { :options => unknown_options.join } end emit(:tREGEXP_OPT) fnext expr_end; fbreak; }; any => { emit(:tREGEXP_OPT, tok(@ts, @te - 1), @ts, @te - 1) fhold; fgoto expr_end; }; *|; # # === WHITESPACE HANDLING === # # Various contexts in Ruby allow various kinds of whitespace # to be used. They are grouped to clarify the lexing machines # and ease collection of comments. # A line of code with inline #comment at end is always equivalent # to a line of code ending with just a newline, so an inline # comment is deemed equivalent to non-newline whitespace # (c_space character class). w_space = c_space+ | '\\' e_heredoc_nl ; w_comment = '#' %{ @sharp_s = p - 1 } # The (p == pe) condition compensates for added "\0" and # the way Ragel handles EOF. c_line* %{ emit_comment(@sharp_s, p == pe ? p - 2 : p) } ; w_space_comment = w_space | w_comment ; # A newline in non-literal context always interoperates with # here document logic and can always be escaped by a backslash, # still interoperating with here document logic in the same way, # yet being invisible to anything else. # # To demonstrate: # # foo = <' %{ tm = p - 2 } | # a=>b a => b '===' %{ tm = p - 3 } # a===b a === b ; ambiguous_symbol_suffix = # actual parsed ambiguous_ident_suffix | '==>' %{ tm = p - 2 } # :a==>b :a= => b ; # Ambiguous with 1.9 hash labels. ambiguous_const_suffix = # actual parsed '::' %{ tm = p - 2 } # A::B A :: B ; # Resolving kDO/kDO_COND/kDO_BLOCK ambiguity requires embedding # @cond/@cmdarg-related code to e_lbrack, e_lparen and e_lbrace. e_lbrack = '[' % { @cond.push(false); @cmdarg.push(false) @paren_nest += 1 }; e_rbrack = ']' % { @paren_nest -= 1 }; # Ruby 1.9 lambdas require parentheses counting in order to # emit correct opening kDO/tLBRACE. e_lparen = '(' % { @cond.push(false); @cmdarg.push(false) @paren_nest += 1 if version?(18) @command_start = true end }; e_rparen = ')' % { @paren_nest -= 1 }; # Ruby is context-sensitive wrt/ local identifiers. action local_ident { emit(:tIDENTIFIER) if !@static_env.nil? && @static_env.declared?(tok) fnext expr_endfn; fbreak; else fnext *arg_or_cmdarg(cmd_state); fbreak; end } # Variable lexing code is accessed from both expressions and # string interpolation related code. # expr_variable := |* global_var => { if tok =~ /^\$([1-9][0-9]*)$/ emit(:tNTH_REF, tok(@ts + 1).to_i) elsif tok =~ /^\$([&`'+])$/ emit(:tBACK_REF) else emit(:tGVAR) end fnext *stack_pop; fbreak; }; class_var_v => { if tok =~ /^@@[0-9]/ diagnostic :error, :cvar_name, { :name => tok } end emit(:tCVAR) fnext *stack_pop; fbreak; }; instance_var_v => { if tok =~ /^@[0-9]/ diagnostic :error, :ivar_name, { :name => tok } end emit(:tIVAR) fnext *stack_pop; fbreak; }; *|; # Literal function name in definition (e.g. `def class`). # Keywords are returned as their respective tokens; this is used # to support singleton def `def self.foo`. Global variables are # returned as `tGVAR`; this is used in global variable alias # statements `alias $a $b`. Symbols are returned verbatim; this # is used in `alias :a :"b#{foo}"` and `undef :a`. # # Transitions to `expr_endfn` afterwards. # expr_fname := |* keyword => { emit_table(KEYWORDS_BEGIN); fnext expr_endfn; fbreak; }; constant => { emit(:tCONSTANT) fnext expr_endfn; fbreak; }; bareword [?=!]? => { emit(:tIDENTIFIER) fnext expr_endfn; fbreak; }; global_var => { p = @ts - 1 fnext expr_end; fcall expr_variable; }; # If the handling was to be delegated to expr_end, # these cases would transition to something else than # expr_endfn, which is incorrect. operator_fname | operator_arithmetic | operator_rest => { emit_table(PUNCTUATION) fnext expr_endfn; fbreak; }; '::' => { fhold; fhold; fgoto expr_end; }; ':' => { fhold; fgoto expr_beg; }; '%s' c_any => { if version?(23) type, delimiter = tok[0..-2], tok[-1].chr fgoto *push_literal(type, delimiter, @ts); else p = @ts - 1 fgoto expr_end; end }; w_any; c_any => { fhold; fgoto expr_end; }; c_eof => do_eof; *|; # After literal function name in definition. Behaves like `expr_end`, # but allows a tLABEL. # # Transitions to `expr_end` afterwards. # expr_endfn := |* label ( any - ':' ) => { emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1) fhold; fnext expr_labelarg; fbreak; }; w_space_comment; c_any => { fhold; fgoto expr_end; }; c_eof => do_eof; *|; # Literal function name in method call (e.g. `a.class`). # # Transitions to `expr_arg` afterwards. # expr_dot := |* constant => { emit(:tCONSTANT) fnext *arg_or_cmdarg(cmd_state); fbreak; }; call_or_var => { emit(:tIDENTIFIER) fnext *arg_or_cmdarg(cmd_state); fbreak; }; bareword ambiguous_fid_suffix => { emit(:tFID, tok(@ts, tm), @ts, tm) fnext *arg_or_cmdarg(cmd_state); p = tm - 1; fbreak; }; # See the comment in `expr_fname`. operator_fname | operator_arithmetic | operator_rest => { emit_table(PUNCTUATION) fnext expr_arg; fbreak; }; w_any; c_any => { fhold; fgoto expr_end; }; c_eof => do_eof; *|; # The previous token emitted was a `tIDENTIFIER` or `tFID`; no space # is consumed; the current expression is a command or method call. # expr_arg := |* # # COMMAND MODE SPECIFIC TOKENS # # cmd (1 + 2) # See below the rationale about expr_endarg. w_space+ e_lparen => { if version?(18) emit(:tLPAREN2, '('.freeze, @te - 1, @te) fnext expr_value; fbreak; else emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te) fnext expr_beg; fbreak; end }; # meth(1 + 2) # Regular method call. e_lparen => { emit(:tLPAREN2, '('.freeze) fnext expr_beg; fbreak; }; # meth [...] # Array argument. Compare with indexing `meth[...]`. w_space+ e_lbrack => { emit(:tLBRACK, '['.freeze, @te - 1, @te) fnext expr_beg; fbreak; }; # cmd {} # Command: method call without parentheses. w_space* e_lbrace => { if @lambda_stack.last == @paren_nest @lambda_stack.pop emit(:tLAMBEG, '{'.freeze, @te - 1, @te) else emit(:tLCURLY, '{'.freeze, @te - 1, @te) end @command_start = true @paren_nest += 1 fnext expr_value; fbreak; }; # # AMBIGUOUS TOKENS RESOLVED VIA EXPR_BEG # # a?? # Ternary operator '?' c_space_nl => { # Unlike expr_beg as invoked in the next rule, do not warn p = @ts - 1 fgoto expr_end; }; # a ?b, a? ? # Character literal or ternary operator w_space* '?' => { fhold; fgoto expr_beg; }; # a %{1}, a %[1] (but not "a %=1=" or "a % foo") # a /foo/ (but not "a / foo" or "a /=foo") # a < { if tok(tm, tm + 1) == '/'.freeze # Ambiguous regexp literal. diagnostic :warning, :ambiguous_literal, nil, range(tm, tm + 1) end p = tm - 1 fgoto expr_beg; }; # x *1 # Ambiguous splat, kwsplat or block-pass. w_space+ %{ tm = p } ( '+' | '-' | '*' | '&' | '**' ) => { diagnostic :warning, :ambiguous_prefix, { :prefix => tok(tm, @te) }, range(tm, @te) p = tm - 1 fgoto expr_beg; }; # x ::Foo # Ambiguous toplevel constant access. w_space+ '::' => { fhold; fhold; fgoto expr_beg; }; # x:b # Symbol. w_space* ':' => { fhold; fgoto expr_beg; }; w_space+ label => { p = @ts - 1; fgoto expr_beg; }; # # AMBIGUOUS TOKENS RESOLVED VIA EXPR_END # # a ? b # Ternary operator. w_space+ %{ tm = p } '?' c_space_nl => { p = tm - 1; fgoto expr_end; }; # x + 1: Binary operator or operator-assignment. w_space* operator_arithmetic ( '=' | c_space_nl )? | # x rescue y: Modifier keyword. w_space* keyword_modifier | # a &. b: Safe navigation operator. w_space* '&.' | # Miscellanea. w_space* punctuation_end => { p = @ts - 1 fgoto expr_end; }; w_space; w_comment => { fgoto expr_end; }; w_newline => { fhold; fgoto expr_end; }; c_any => { fhold; fgoto expr_beg; }; c_eof => do_eof; *|; # The previous token was an identifier which was seen while in the # command mode (that is, the state at the beginning of #advance was # expr_value). This state is very similar to expr_arg, but disambiguates # two very rare and specific condition: # * In 1.8 mode, "foo (lambda do end)". # * In 1.9+ mode, "f x: -> do foo do end end". expr_cmdarg := |* w_space+ e_lparen => { emit(:tLPAREN_ARG, '('.freeze, @te - 1, @te) if version?(18) fnext expr_value; fbreak; else fnext expr_beg; fbreak; end }; w_space* 'do' => { if @cond.active? emit(:kDO_COND, 'do'.freeze, @te - 2, @te) else emit(:kDO, 'do'.freeze, @te - 2, @te) end fnext expr_value; fbreak; }; c_any | # Disambiguate with the `do' rule above. w_space* bareword | w_space* label => { p = @ts - 1 fgoto expr_arg; }; c_eof => do_eof; *|; # The rationale for this state is pretty complex. Normally, if an argument # is passed to a command and then there is a block (tLCURLY...tRCURLY), # the block is attached to the innermost argument (`f` in `m f {}`), or it # is a parse error (`m 1 {}`). But there is a special case for passing a single # primary expression grouped with parentheses: if you write `m (1) {}` or # (2.0 only) `m () {}`, then the block is attached to `m`. # # Thus, we recognize the opening `(` of a command (remember, a command is # a method call without parens) as a tLPAREN_ARG; then, in parser, we recognize # `tLPAREN_ARG expr rparen` as a `primary_expr` and before rparen, set the # lexer's state to `expr_endarg`, which makes it emit the possibly following # `{` as `tLBRACE_ARG`. # # The default post-`expr_endarg` state is `expr_end`, so this state also handles # `do` (as `kDO_BLOCK` in `expr_beg`). expr_endarg := |* e_lbrace => { if @lambda_stack.last == @paren_nest @lambda_stack.pop emit(:tLAMBEG, '{'.freeze) else emit(:tLBRACE_ARG, '{'.freeze) end @paren_nest += 1 @command_start = true fnext expr_value; fbreak; }; 'do' => { emit_do(true) fnext expr_value; fbreak; }; w_space_comment; c_any => { fhold; fgoto expr_end; }; c_eof => do_eof; *|; # The rationale for this state is that several keywords accept value # (i.e. should transition to `expr_beg`), do not accept it like a command # (i.e. not an `expr_arg`), and must behave like a statement, that is, # accept a modifier if/while/etc. # expr_mid := |* keyword_modifier => { emit_table(KEYWORDS) fnext expr_beg; fbreak; }; bareword => { p = @ts - 1; fgoto expr_beg; }; w_space_comment; w_newline => { fhold; fgoto expr_end; }; c_any => { fhold; fgoto expr_beg; }; c_eof => do_eof; *|; # Beginning of an expression. # # Don't fallthrough to this state from `c_any`; make sure to handle # `c_space* c_nl` and let `expr_end` handle the newline. # Otherwise code like `f\ndef x` gets glued together and the parser # explodes. # expr_beg := |* # +5, -5, - 5 [+\-] w_any* [0-9] => { emit(:tUNARY_NUM, tok(@ts, @ts + 1), @ts, @ts + 1) fhold; fnext expr_end; fbreak; }; # splat *a '*' => { emit(:tSTAR, '*'.freeze) fbreak; }; # # STRING AND REGEXP LITERALS # # /regexp/oui # /=/ (disambiguation with /=) '/' c_any => { type = delimiter = tok[0].chr fhold; fgoto *push_literal(type, delimiter, @ts); }; # % '%' ( any - [A-Za-z] ) => { type, delimiter = @source_buffer.slice(@ts).chr, tok[-1].chr fgoto *push_literal(type, delimiter, @ts); }; # %w(we are the people) '%' [A-Za-z]+ c_any => { type, delimiter = tok[0..-2], tok[-1].chr fgoto *push_literal(type, delimiter, @ts); }; '%' c_eof => { diagnostic :fatal, :string_eof, nil, range(@ts, @ts + 1) }; # Heredoc start. # < { tok(@ts, heredoc_e) =~ /^<<(-?)(~?)(["'`]?)(.*)\3$/m indent = !$1.empty? || !$2.empty? dedent_body = !$2.empty? type = $3.empty? ? '<<"'.freeze : ('<<'.freeze + $3) delimiter = $4 if @version >= 27 if delimiter.count("\n") > 0 || delimiter.count("\r") > 0 diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1) end elsif @version >= 24 if delimiter.count("\n") > 0 if delimiter.end_with?("\n") diagnostic :warning, :heredoc_id_ends_with_nl, nil, range(@ts, @ts + 1) delimiter = delimiter.rstrip else diagnostic :fatal, :heredoc_id_has_newline, nil, range(@ts, @ts + 1) end end end if dedent_body && version?(18, 19, 20, 21, 22) emit(:tLSHFT, '<<'.freeze, @ts, @ts + 2) p = @ts + 1 fnext expr_beg; fbreak; else fnext *push_literal(type, delimiter, @ts, heredoc_e, indent, dedent_body); @herebody_s ||= new_herebody_s p = @herebody_s - 1 end }; # Escaped unterminated heredoc start # <<'END | <<"END | <<`END | # <<-'END | <<-"END | <<-`END | # <<~'END | <<~"END | <<~`END # # If the heredoc is terminated the rule above should handle it '<<' [~\-]? ('"' (any - c_nl - '"')* |"'" (any - c_nl - "'")* |"`" (any - c_nl - "`") ) => { diagnostic :error, :unterminated_heredoc_id, nil, range(@ts, @ts + 1) }; # # SYMBOL LITERALS # # :&&, :|| ':' ('&&' | '||') => { fhold; fhold; emit(:tSYMBEG, tok(@ts, @ts + 1), @ts, @ts + 1) fgoto expr_fname; }; # :"bar", :'baz' ':' ['"] # ' => { type, delimiter = tok, tok[-1].chr fgoto *push_literal(type, delimiter, @ts); }; # :!@ is :! # :~@ is :~ ':' [!~] '@' => { emit(:tSYMBOL, tok(@ts + 1, @ts + 2)) fnext expr_end; fbreak; }; ':' bareword ambiguous_symbol_suffix => { emit(:tSYMBOL, tok(@ts + 1, tm), @ts, tm) p = tm - 1 fnext expr_end; fbreak; }; ':' ( bareword | global_var | class_var | instance_var | operator_fname | operator_arithmetic | operator_rest ) => { emit(:tSYMBOL, tok(@ts + 1), @ts) fnext expr_end; fbreak; }; ':' ( '@' %{ tm = p - 1; diag_msg = :ivar_name } | '@@' %{ tm = p - 2; diag_msg = :cvar_name } ) [0-9]* => { if @version >= 27 diagnostic :error, diag_msg, { name: tok(tm, @te) }, range(tm, @te) else emit(:tCOLON, tok(@ts, @ts + 1), @ts, @ts + 1) p = @ts end fnext expr_end; fbreak; }; # # AMBIGUOUS TERNARY OPERATOR # # Character constant, like ?a, ?\n, ?\u1000, and so on # Don't accept \u escape with multiple codepoints, like \u{1 2 3} '?' ( e_bs ( escape - ( '\u{' (xdigit+ [ \t]+)+ xdigit+ '}' )) | (c_any - c_space_nl - e_bs) % { @escape = nil } ) => { value = @escape || tok(@ts + 1) if version?(18) emit(:tINTEGER, value.getbyte(0)) else emit(:tCHARACTER, value) end fnext expr_end; fbreak; }; '?' c_space_nl => { escape = { " " => '\s', "\r" => '\r', "\n" => '\n', "\t" => '\t', "\v" => '\v', "\f" => '\f' }[@source_buffer.slice(@ts + 1)] diagnostic :warning, :invalid_escape_use, { :escape => escape }, range p = @ts - 1 fgoto expr_end; }; '?' c_eof => { diagnostic :fatal, :incomplete_escape, nil, range(@ts, @ts + 1) }; # f ?aa : b: Disambiguate with a character literal. '?' [A-Za-z_] bareword => { p = @ts - 1 fgoto expr_end; }; # # AMBIGUOUS EMPTY BLOCK ARGUMENTS # # Ruby >= 2.7 emits it as two tPIPE terminals # while Ruby < 2.7 as a single tOROP (like in `a || b`) '||' => { if @version >= 27 emit(:tPIPE, tok(@ts, @ts + 1), @ts, @ts + 1) fhold; fnext expr_beg; fbreak; else p -= 2 fgoto expr_end; end }; # # KEYWORDS AND PUNCTUATION # # a({b=>c}) e_lbrace => { if @lambda_stack.last == @paren_nest @lambda_stack.pop @command_start = true emit(:tLAMBEG, '{'.freeze) else emit(:tLBRACE, '{'.freeze) end @paren_nest += 1 fbreak; }; # a([1, 2]) e_lbrack => { emit(:tLBRACK, '['.freeze) fbreak; }; # a() e_lparen => { emit(:tLPAREN, '('.freeze) fbreak; }; # a(+b) punctuation_begin => { emit_table(PUNCTUATION_BEGIN) fbreak; }; # rescue Exception => e: Block rescue. # Special because it should transition to expr_mid. 'rescue' %{ tm = p } '=>'? => { emit(:kRESCUE, 'rescue'.freeze, @ts, tm) p = tm - 1 fnext expr_mid; fbreak; }; # if a: Statement if. keyword_modifier => { emit_table(KEYWORDS_BEGIN) @command_start = true fnext expr_value; fbreak; }; # # RUBY 1.9 HASH LABELS # label ( any - ':' ) => { fhold; if version?(18) ident = tok(@ts, @te - 2) emit((@source_buffer.slice(@ts) =~ /[A-Z]/) ? :tCONSTANT : :tIDENTIFIER, ident, @ts, @te - 2) fhold; # continue as a symbol if !@static_env.nil? && @static_env.declared?(ident) fnext expr_end; else fnext *arg_or_cmdarg(cmd_state); end else emit(:tLABEL, tok(@ts, @te - 2), @ts, @te - 1) fnext expr_labelarg; end fbreak; }; # # RUBY 2.7 BEGINLESS RANGE '..' => { if @version >= 27 emit(:tBDOT2) else emit(:tDOT2) end fnext expr_beg; fbreak; }; '...' => { if @version >= 28 if @lambda_stack.any? && @lambda_stack.last + 1 == @paren_nest # To reject `->(...)` like `->...` emit(:tDOT3) else emit(:tBDOT3) end elsif @version >= 27 emit(:tBDOT3) else emit(:tDOT3) end fnext expr_beg; fbreak; }; # # CONTEXT-DEPENDENT VARIABLE LOOKUP OR COMMAND INVOCATION # # foo= bar: Disambiguate with bareword rule below. bareword ambiguous_ident_suffix | # def foo: Disambiguate with bareword rule below. keyword => { p = @ts - 1 fgoto expr_end; }; # a = 42; a [42]: Indexing. # def a; end; a [42]: Array argument. call_or_var => local_ident; (call_or_var - keyword) % { ident_tok = tok; ident_ts = @ts; ident_te = @te; } w_space+ '(' => { emit(:tIDENTIFIER, ident_tok, ident_ts, ident_te) p = ident_te - 1 if !@static_env.nil? && @static_env.declared?(ident_tok) && @version < 25 fnext expr_endfn; else fnext expr_cmdarg; end fbreak; }; # # WHITESPACE # w_any; e_heredoc_nl '=begin' ( c_space | c_nl_zlen ) => { p = @ts - 1 @cs_before_block_comment = @cs fgoto line_begin; }; # # DEFAULT TRANSITION # # The following rules match most binary and all unary operators. # Rules for binary operators provide better error reporting. operator_arithmetic '=' | operator_rest | punctuation_end | c_any => { p = @ts - 1; fgoto expr_end; }; c_eof => do_eof; *|; # Special newline handling for "def a b:" # expr_labelarg := |* w_space_comment; w_newline => { if @in_kwarg fhold; fgoto expr_end; else fgoto line_begin; end }; c_any => { fhold; fgoto expr_beg; }; c_eof => do_eof; *|; # Like expr_beg, but no 1.9 label or 2.2 quoted label possible. # expr_value := |* # a:b: a(:b), a::B, A::B label (any - ':') => { p = @ts - 1 fgoto expr_end; }; # "bar", 'baz' ['"] # ' => { fgoto *push_literal(tok, tok, @ts); }; w_space_comment; w_newline => { fgoto line_begin; }; c_any => { fhold; fgoto expr_beg; }; c_eof => do_eof; *|; expr_end := |* # # STABBY LAMBDA # '->' => { emit(:tLAMBDA, '->'.freeze, @ts, @ts + 2) @lambda_stack.push @paren_nest fnext expr_endfn; fbreak; }; e_lbrace | 'do' => { if @lambda_stack.last == @paren_nest @lambda_stack.pop if tok == '{'.freeze emit(:tLAMBEG, '{'.freeze) else # 'do' emit(:kDO_LAMBDA, 'do'.freeze) end else if tok == '{'.freeze emit(:tLCURLY, '{'.freeze) else # 'do' emit_do end end if tok == '{'.freeze @paren_nest += 1 end @command_start = true fnext expr_value; fbreak; }; # # KEYWORDS # keyword_with_fname => { emit_table(KEYWORDS) fnext expr_fname; fbreak; }; 'class' w_any* '<<' => { emit(:kCLASS, 'class'.freeze, @ts, @ts + 5) emit(:tLSHFT, '<<'.freeze, @te - 2, @te) fnext expr_value; fbreak; }; # a if b:c: Syntax error. keyword_modifier => { emit_table(KEYWORDS) fnext expr_beg; fbreak; }; # elsif b:c: elsif b(:c) keyword_with_value => { emit_table(KEYWORDS) @command_start = true fnext expr_value; fbreak; }; keyword_with_mid => { emit_table(KEYWORDS) fnext expr_mid; fbreak; }; keyword_with_arg => { emit_table(KEYWORDS) if version?(18) && tok == 'not'.freeze fnext expr_beg; fbreak; else fnext expr_arg; fbreak; end }; '__ENCODING__' => { if version?(18) emit(:tIDENTIFIER) unless !@static_env.nil? && @static_env.declared?(tok) fnext *arg_or_cmdarg(cmd_state); end else emit(:k__ENCODING__, '__ENCODING__'.freeze) end fbreak; }; keyword_with_end => { emit_table(KEYWORDS) fbreak; }; # # NUMERIC LITERALS # ( '0' [Xx] %{ @num_base = 16; @num_digits_s = p } int_hex | '0' [Dd] %{ @num_base = 10; @num_digits_s = p } int_dec | '0' [Oo] %{ @num_base = 8; @num_digits_s = p } int_dec | '0' [Bb] %{ @num_base = 2; @num_digits_s = p } int_bin | [1-9] digit* '_'? %{ @num_base = 10; @num_digits_s = @ts } int_dec | '0' digit* '_'? %{ @num_base = 8; @num_digits_s = @ts } int_dec ) %{ @num_suffix_s = p } int_suffix => { digits = tok(@num_digits_s, @num_suffix_s) if digits.end_with? '_'.freeze diagnostic :error, :trailing_in_number, { :character => '_'.freeze }, range(@te - 1, @te) elsif digits.empty? && @num_base == 8 && version?(18) # 1.8 did not raise an error on 0o. digits = '0'.freeze elsif digits.empty? diagnostic :error, :empty_numeric elsif @num_base == 8 && (invalid_idx = digits.index(/[89]/)) invalid_s = @num_digits_s + invalid_idx diagnostic :error, :invalid_octal, nil, range(invalid_s, invalid_s + 1) end if version?(18, 19, 20) emit(:tINTEGER, digits.to_i(@num_base), @ts, @num_suffix_s) p = @num_suffix_s - 1 else @num_xfrm.call(digits.to_i(@num_base)) end fbreak; }; flo_frac flo_pow? => { diagnostic :error, :no_dot_digit_literal }; flo_int [eE] => { if version?(18, 19, 20) diagnostic :error, :trailing_in_number, { :character => tok(@te - 1, @te) }, range(@te - 1, @te) else emit(:tINTEGER, tok(@ts, @te - 1).to_i, @ts, @te - 1) fhold; fbreak; end }; flo_int flo_frac [eE] => { if version?(18, 19, 20) diagnostic :error, :trailing_in_number, { :character => tok(@te - 1, @te) }, range(@te - 1, @te) else emit(:tFLOAT, tok(@ts, @te - 1).to_f, @ts, @te - 1) fhold; fbreak; end }; flo_int ( flo_frac? flo_pow %{ @num_suffix_s = p } flo_pow_suffix | flo_frac %{ @num_suffix_s = p } flo_suffix ) => { digits = tok(@ts, @num_suffix_s) if version?(18, 19, 20) emit(:tFLOAT, Float(digits), @ts, @num_suffix_s) p = @num_suffix_s - 1 else @num_xfrm.call(digits) end fbreak; }; # # STRING AND XSTRING LITERALS # # `echo foo`, "bar", 'baz' '`' | ['"] # ' => { type, delimiter = tok, tok[-1].chr fgoto *push_literal(type, delimiter, @ts, nil, false, false, true); }; # # CONSTANTS AND VARIABLES # constant => { emit(:tCONSTANT) fnext *arg_or_cmdarg(cmd_state); fbreak; }; constant ambiguous_const_suffix => { emit(:tCONSTANT, tok(@ts, tm), @ts, tm) p = tm - 1; fbreak; }; global_var | class_var_v | instance_var_v => { p = @ts - 1; fcall expr_variable; }; # # METHOD CALLS # '.' | '&.' | '::' => { emit_table(PUNCTUATION) fnext expr_dot; fbreak; }; call_or_var => local_ident; bareword ambiguous_fid_suffix => { if tm == @te # Suffix was consumed, e.g. foo! emit(:tFID) else # Suffix was not consumed, e.g. foo!= emit(:tIDENTIFIER, tok(@ts, tm), @ts, tm) p = tm - 1 end fnext expr_arg; fbreak; }; # # OPERATORS # '*' | '=>' => { emit_table(PUNCTUATION) fgoto expr_value; }; # When '|', '~', '!', '=>' are used as operators # they do not accept any symbols (or quoted labels) after. # Other binary operators accept it. ( operator_arithmetic | operator_rest ) - ( '|' | '~' | '!' | '*' ) => { emit_table(PUNCTUATION); fnext expr_value; fbreak; }; ( e_lparen | '|' | '~' | '!' ) => { emit_table(PUNCTUATION) fnext expr_beg; fbreak; }; e_rbrace | e_rparen | e_rbrack => { emit_table(PUNCTUATION) if @version < 24 @cond.lexpop @cmdarg.lexpop else @cond.pop @cmdarg.pop end if tok == '}'.freeze || tok == ']'.freeze if @version >= 25 fnext expr_end; else fnext expr_endarg; end else # ) # fnext expr_endfn; ? end fbreak; }; operator_arithmetic '=' => { emit(:tOP_ASGN, tok(@ts, @te - 1)) fnext expr_beg; fbreak; }; '?' => { emit(:tEH, '?'.freeze) fnext expr_value; fbreak; }; e_lbrack => { emit(:tLBRACK2, '['.freeze) fnext expr_beg; fbreak; }; '...' c_nl => { if @paren_nest == 0 diagnostic :warning, :triple_dot_at_eol, nil, range(@ts, @te - 1) end emit(:tDOT3, '...'.freeze, @ts, @te - 1) fhold; fnext expr_beg; fbreak; }; punctuation_end => { emit_table(PUNCTUATION) fnext expr_beg; fbreak; }; # # WHITESPACE # w_space_comment; w_newline => { fgoto leading_dot; }; ';' => { emit(:tSEMI, ';'.freeze) @command_start = true fnext expr_value; fbreak; }; '\\' c_line { diagnostic :error, :bare_backslash, nil, range(@ts, @ts + 1) fhold; }; c_any => { diagnostic :fatal, :unexpected, { :character => tok.inspect[1..-2] } }; c_eof => do_eof; *|; leading_dot := |* # Insane leading dots: # a #comment # # post-2.7 comment # .b: a.b # Here we use '\n' instead of w_newline to not modify @newline_s # and eventually properly emit tNL (c_space* w_space_comment '\n')+ => { if @version < 27 # Ruby before 2.7 doesn't support comments before leading dot. # If a line after "a" starts with a comment then "a" is a self-contained statement. # So in that case we emit a special tNL token and start reading the # next line as a separate statement. # # Note: block comments before leading dot are not supported on any version of Ruby. emit(:tNL, nil, @newline_s, @newline_s + 1) fhold; fnext line_begin; fbreak; end }; c_space* %{ tm = p } ('.' | '&.') => { p = tm - 1; fgoto expr_end; }; any => { emit(:tNL, nil, @newline_s, @newline_s + 1) fhold; fnext line_begin; fbreak; }; *|; # # === EMBEDDED DOCUMENT (aka BLOCK COMMENT) PARSING === # line_comment := |* '=end' c_line* c_nl_zlen => { emit_comment(@eq_begin_s, @te) fgoto *@cs_before_block_comment; }; c_line* c_nl; c_line* zlen => { diagnostic :fatal, :embedded_document, nil, range(@eq_begin_s, @eq_begin_s + '=begin'.length) }; *|; line_begin := |* w_any; '=begin' ( c_space | c_nl_zlen ) => { @eq_begin_s = @ts fgoto line_comment; }; '__END__' ( c_eol - zlen ) => { p = pe - 3 }; c_any => { cmd_state = true; fhold; fgoto expr_value; }; c_eof => do_eof; *|; }%% # % end