# encoding: UTF-8

##
# Hand-written lexer that turns Ruby source text into the token stream
# consumed by the grammar. Each call to #yylex returns one token type
# and stores the associated value in #yacc_value.
#
# The scanner stored in #src must behave like a StringScanner; #src=
# wraps a String in an RPStringScanner.

class RubyLexer

  # :stopdoc:
  RUBY19 = "".respond_to? :encoding

  IDENT_CHAR_RE = if RUBY19 then
                    /[\w\u0080-\u{10ffff}]/u
                  else
                    /[\w\x80-\xFF]/n
                  end

  IDENT_RE = /^#{IDENT_CHAR_RE}+/o

  attr_accessor :command_start
  attr_accessor :cmdarg
  attr_accessor :cond
  attr_accessor :tern # TODO: rename ternary damnit... wtf
  attr_accessor :string_nest

  ESC_RE = /\\((?>[0-7]{1,3}|x[0-9a-fA-F]{1,2}|M-[^\\]|(C-|c)[^\\]|[^0-7xMCc]))/u
  # :startdoc:

  ##
  # What version of ruby to parse. 18 and 19 are the only valid values
  # currently supported.

  attr_accessor :version

  # Additional context surrounding tokens that both the lexer and
  # grammar use.
  attr_reader :lex_state

  # Non-nil while the lexer is inside a string-like literal. An array
  # of the form [:strterm, string_type, term, open] or
  # [:heredoc, eos, func, last_line].
  attr_accessor :lex_strterm

  attr_accessor :parser # HACK for very end of lexer... *sigh*

  # Stream of data that yylex examines.
  attr_reader :src

  # Last token read via yylex.
  attr_accessor :token

  # Array of string fragments accumulated for the current literal.
  attr_accessor :string_buffer

  # Value of last token which had a value associated with it.
  attr_accessor :yacc_value

  # What handles warnings
  attr_accessor :warnings

  attr_accessor :space_seen
  attr_accessor :paren_nest
  attr_accessor :brace_nest
  attr_accessor :lpar_beg

  # Sentinel returned by #yylex at end of input.
  EOF = :eof_haha!

  # ruby constants for strings (should this be moved somewhere else?)

  # :stopdoc:
  STR_FUNC_BORING = 0x00
  STR_FUNC_ESCAPE = 0x01 # TODO: remove and replace with REGEXP
  STR_FUNC_EXPAND = 0x02
  STR_FUNC_REGEXP = 0x04
  STR_FUNC_QWORDS = 0x08
  STR_FUNC_SYMBOL = 0x10
  STR_FUNC_INDENT = 0x20 # <<-HEREDOC

  STR_SQUOTE = STR_FUNC_BORING
  STR_DQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
  STR_XQUOTE = STR_FUNC_BORING | STR_FUNC_EXPAND
  STR_REGEXP = STR_FUNC_REGEXP | STR_FUNC_ESCAPE | STR_FUNC_EXPAND
  STR_SSYM   = STR_FUNC_SYMBOL
  STR_DSYM   = STR_FUNC_SYMBOL | STR_FUNC_EXPAND

  TOKENS = {
    "!"   => :tBANG,
    "!="  => :tNEQ,
    # "!@"  => :tUBANG,
    "!~"  => :tNMATCH,
    ","   => :tCOMMA,
    ".."  => :tDOT2,
    "..." => :tDOT3,
    "="   => :tEQL,
    "=="  => :tEQ,
    "===" => :tEQQ,
    "=>"  => :tASSOC,
    "=~"  => :tMATCH,
    "->"  => :tLAMBDA,
  }
  # :startdoc:

  ##
  # How the parser advances to the next token.
  #
  # Stores the result of #yylex in #token and raises if it is nil.
  #
  # @return true if not at end of file (EOF).

  def advance
    r = yylex
    self.token = r

    raise "yylex returned nil" unless r

    return RubyLexer::EOF != r
  end

  ##
  # Emits the "ambiguous first argument" warning via #warning.

  def arg_ambiguous
    self.warning("Ambiguous first argument. make sure.")
  end

  ##
  # Returns all comments collected so far joined into one string and
  # empties the internal comment list.

  def comments
    c = @comments.join
    @comments.clear
    c
  end

  ##
  # Pushes false onto the cond and cmdarg stacks, switches to
  # :expr_beg and records +val+ as the token value.

  def expr_beg_push val
    cond.push false
    cmdarg.push false
    self.lex_state = :expr_beg
    self.yacc_value = val
  end

  ##
  # After an operator: :expr_arg when the operator was lexed in
  # :expr_fname or :expr_dot state, otherwise :expr_beg.

  def fix_arg_lex_state
    self.lex_state = if in_lex_state? :expr_fname, :expr_dot then
                       :expr_arg
                     else
                       :expr_beg
                     end
  end

  ##
  # Lexes the next piece of a heredoc body.
  #
  # +here+ is the [:heredoc, eos, func, last_line] array stored in
  # #lex_strterm. Returns :tSTRING_END when the terminator is found at
  # the beginning of a line, :tSTRING_DVAR / :tSTRING_DBEG at an
  # interpolation start, and :tSTRING_CONTENT otherwise. Raises a
  # compile error when the input ends before the terminator.

  def heredoc here
    _, eos, func, last_line = here

    indent  = (func & STR_FUNC_INDENT) != 0 ? "[ \t]*" : nil
    expand  = (func & STR_FUNC_EXPAND) != 0
    eos_re  = /#{indent}#{Regexp.escape eos}(\r*\n|\z)/
    err_msg = "can't match #{eos_re.inspect} anywhere in "

    rb_compile_error err_msg if
      src.eos?

    if src.beginning_of_line? && src.scan(eos_re) then
      src.unread_many last_line # TODO: figure out how to remove this
      self.yacc_value = eos
      return :tSTRING_END
    end

    self.string_buffer = []

    if expand then
      case
      when src.scan(/#[$@]/) then
        src.pos -= 1 # FIX omg stupid
        self.yacc_value = src.matched
        return :tSTRING_DVAR
      when src.scan(/#[{]/) then
        self.yacc_value = src.matched
        return :tSTRING_DBEG
      when src.scan(/#/) then
        string_buffer << '#'
      end

      begin
        c = tokadd_string func, "\n", nil

        rb_compile_error err_msg if
          c == RubyLexer::EOF

        if c != "\n" then
          # stopped at an interpolation: hand back what we have so far
          self.yacc_value = string_buffer.join.delete("\r")
          return :tSTRING_CONTENT
        else
          string_buffer << src.scan(/\n/)
        end

        rb_compile_error err_msg if
          src.eos?
      end until src.check(eos_re)
    else
      until src.check(eos_re) do
        string_buffer << src.scan(/.*(\n|\z)/)
        rb_compile_error err_msg if
          src.eos?
      end
    end

    self.lex_strterm = [:heredoc, eos, func, last_line]
    self.yacc_value = string_buffer.join.delete("\r")

    return :tSTRING_CONTENT
  end

  ##
  # Called right after "<<" has been scanned. Tries to read a heredoc
  # identifier (quoted or bare, optionally preceded by "-").
  #
  # On success sets #lex_strterm to a :heredoc entry and returns
  # :tSTRING_BEG or :tXSTRING_BEG; returns nil when what follows is
  # not a heredoc identifier.

  def heredoc_identifier
    term, func = nil, STR_FUNC_BORING
    self.string_buffer = []

    case
    when src.scan(/(-?)([\'\"\`])(.*?)\2/) then
      term = src[2]
      func |= STR_FUNC_INDENT unless src[1].empty?
      func |= case term
              when "\'" then
                STR_SQUOTE
              when '"' then
                STR_DQUOTE
              else
                STR_XQUOTE
              end
      string_buffer << src[3]
    when src.scan(/-?([\'\"\`])(?!\1*\Z)/) then
      rb_compile_error "unterminated here document identifier"
    when src.scan(/(-?)(#{IDENT_CHAR_RE}+)/) then
      term = '"'
      func |= STR_DQUOTE
      unless src[1].empty? then
        func |= STR_FUNC_INDENT
      end
      string_buffer << src[2]
    else
      return nil
    end

    # The rest of the current line is set aside so it can be pushed
    # back (see #heredoc) once the heredoc body has been consumed.
    if src.scan(/.*\n/) then
      # TODO: think about storing off the char range instead
      line = src.matched
      src.extra_lines_added += 1
    else
      line = nil
    end

    self.lex_strterm = [:heredoc, string_buffer.join, func, line]

    if term == '`' then
      self.yacc_value = "`"
      return :tXSTRING_BEG
    else
      self.yacc_value = "\""
      return :tSTRING_BEG
    end
  end

  ##
  # True when the current lex state is one of +states+.

  def in_lex_state?(*states)
    states.include? lex_state
  end

  ##
  # Creates a lexer for ruby version +v+ (defaults to 18) and resets
  # all state.

  def initialize v = 18
    self.version = v
    self.cond = RubyParserStuff::StackState.new(:cond)
    self.cmdarg = RubyParserStuff::StackState.new(:cmdarg)
    self.tern = RubyParserStuff::StackState.new(:tern)
    self.string_nest = 0
    self.paren_nest = 0
    self.brace_nest = 0
    self.lpar_beg = nil

    @comments = []

    reset
  end

  ##
  # Converts the text last matched by the scanner to an integer in
  # +base+, stores it in #yacc_value and returns :tINTEGER. A double
  # underscore in the literal is a compile error.

  def int_with_base base
    rb_compile_error "Invalid numeric format" if src.matched =~ /__/
    self.yacc_value = src.matched.to_i(base)
    return :tINTEGER
  end

  ##
  # Sets the lex state; +o+ must be a Symbol.

  def lex_state= o
    # warn "wtf lex_state = #{o.inspect} from #{caller.first}"
    raise "wtf\?" unless Symbol === o
    @lex_state = o
  end

  attr_writer :lineno

  ##
  # Current line number, memoized from the scanner until it is reset
  # by assigning nil.

  def lineno
    @lineno ||= src.lineno
  end

  ##
  #  Parse a number from the input stream.
  #
  # Reads from #src, sets the lex state to :expr_end and stores the
  # numeric value in #yacc_value.
  #
  # @return :tINTEGER or :tFLOAT; raises a compile error on a
  #         malformed literal.

  def parse_number
    self.lex_state = :expr_end

    case
    when src.scan(/[+-]?0[xXbBdD]\b/) then
      rb_compile_error "Invalid numeric format"
    when src.scan(/[+-]?(?:(?:[1-9][\d_]*|0)(?!\.\d)\b|0[Dd][0-9_]+)/) then
      int_with_base(10)
    when src.scan(/[+-]?0x[a-f0-9_]+/i) then
      int_with_base(16)
    when src.scan(/[+-]?0[Bb][01_]+/) then
      int_with_base(2)
    when src.scan(/[+-]?0[Oo]?[0-7_]*[89]/) then
      rb_compile_error "Illegal octal digit."
    when src.scan(/[+-]?0[Oo]?[0-7_]+|0[Oo]/) then
      int_with_base(8)
    when src.scan(/[+-]?[\d_]+_(e|\.)/) then
      rb_compile_error "Trailing '_' in number."
    when src.scan(/[+-]?[\d_]+\.[\d_]+(e[+-]?[\d_]+)?\b|[+-]?[\d_]+e[+-]?[\d_]+\b/i) then
      number = src.matched
      if number =~ /__/ then
        rb_compile_error "Invalid numeric format"
      end
      self.yacc_value = number.to_f
      :tFLOAT
    when src.scan(/[+-]?[0-9_]+(?![e])/) then
      int_with_base(10)
    else
      rb_compile_error "Bad number format"
    end
  end

  ##
  # Called right after "%" has been scanned. Reads the optional type
  # letter and the opening delimiter of a %-literal, sets
  # #lex_strterm accordingly and returns the matching *_BEG token.

  def parse_quote
    beg, nnd, short_hand, c = nil, nil, false, nil

    if src.scan(/[a-z0-9]{1,2}/i) then # Long-hand (e.g. %Q{}).
      rb_compile_error "unknown type of %string" if src.matched_size == 2
      c, beg, short_hand = src.matched, src.getch, false
    else                               # Short-hand (e.g. %{, %., %!, etc)
      c, beg, short_hand = 'Q', src.getch, true
    end

    if src.eos? or c == RubyLexer::EOF or beg == RubyLexer::EOF then
      rb_compile_error "unterminated quoted string meets end of file"
    end

    # Figure nnd-char.  "\0" is special to indicate beg=nnd and that no nesting?
    nnd = { "(" => ")", "[" => "]", "{" => "}", "<" => ">" }[beg]
    nnd, beg = beg, "\0" if nnd.nil?

    token_type, self.yacc_value = nil, "%#{c}#{beg}"
    token_type, string_type = case c
                              when 'Q' then
                                ch = short_hand ? nnd : c + beg
                                self.yacc_value = "%#{ch}"
                                [:tSTRING_BEG,   STR_DQUOTE]
                              when 'q' then
                                [:tSTRING_BEG,   STR_SQUOTE]
                              when 'W' then
                                src.scan(/\s*/)
                                [:tWORDS_BEG,    STR_DQUOTE | STR_FUNC_QWORDS]
                              when 'w' then
                                src.scan(/\s*/)
                                [:tQWORDS_BEG,   STR_SQUOTE | STR_FUNC_QWORDS]
                              when 'x' then
                                [:tXSTRING_BEG,  STR_XQUOTE]
                              when 'r' then
                                [:tREGEXP_BEG,   STR_REGEXP]
                              when 's' then
                                self.lex_state  = :expr_fname
                                [:tSYMBEG,       STR_SSYM]
                              when 'I' then
                                [:tSYMBOLS_BEG, STR_DQUOTE | STR_FUNC_QWORDS]
                              when 'i' then
                                [:tQSYMBOLS_BEG, STR_SQUOTE | STR_FUNC_QWORDS]
                              end

    rb_compile_error "Bad %string type. Expected [Qq\Wwxrs], found '#{c}'." if
      token_type.nil?

    self.lex_strterm = [:strterm, string_type, nnd, beg]

    return token_type
  end

  ##
  # Lexes the next piece of a non-heredoc string-like literal.
  #
  # +quote+ is the [:strterm, string_type, term, open] array stored in
  # #lex_strterm. Returns :tSTRING_END / :tREGEXP_END at the
  # terminator, :tSPACE between words of a word list,
  # :tSTRING_DVAR / :tSTRING_DBEG at an interpolation start and
  # :tSTRING_CONTENT otherwise.

  def parse_string(quote)
    _, string_type, term, open = quote

    space = false # FIX: remove these
    func = string_type
    paren = open
    term_re = @@regexp_cache[term]

    qwords = (func & STR_FUNC_QWORDS) != 0
    regexp = (func & STR_FUNC_REGEXP) != 0
    expand = (func & STR_FUNC_EXPAND) != 0

    unless func then # FIX: impossible, prolly needs == 0
      self.lineno = nil
      return :tSTRING_END
    end

    space = true if qwords and src.scan(/\s+/)

    if self.string_nest == 0 && src.scan(/#{term_re}/) then
      if qwords then
        quote[1] = nil # TODO: make struct
        return :tSPACE
      elsif regexp then
        self.yacc_value = self.regx_options
        self.lineno = nil
        return :tREGEXP_END
      else
        self.yacc_value = term
        self.lineno = nil
        return :tSTRING_END
      end
    end

    if space then
      return :tSPACE
    end

    self.string_buffer = []

    if expand
      case
      when src.scan(/#(?=[$@])/) then
        return :tSTRING_DVAR
      when src.scan(/#[{]/) then
        return :tSTRING_DBEG
      when src.scan(/#/) then
        string_buffer << '#'
      end
    end

    if tokadd_string(func, term, paren) == RubyLexer::EOF then
      rb_compile_error "unterminated string meets end of file"
    end

    self.yacc_value = string_buffer.join

    return :tSTRING_CONTENT
  end

  ##
  # Raises RubyParser::SyntaxError with +msg+ followed by the current
  # line number and the rest of the current source line.

  def rb_compile_error msg
    msg += ". near line #{self.lineno}: #{src.rest[/^.*/].inspect}"
    raise RubyParser::SyntaxError, msg
  end

  ##
  # Reads one escape sequence (the leading backslash has already been
  # consumed) and returns the string it denotes.

  def read_escape
    case
    when src.scan(/\\/) then                  # Backslash
      '\\'
    when src.scan(/n/) then                   # newline
      "\n"
    when src.scan(/t/) then                   # horizontal tab
      "\t"
    when src.scan(/r/) then                   # carriage-return
      "\r"
    when src.scan(/f/) then                   # form-feed
      "\f"
    when src.scan(/v/) then                   # vertical tab
      "\13"
    when src.scan(/a/) then                   # alarm(bell)
      "\007"
    when src.scan(/e/) then                   # escape
      "\033"
    when src.scan(/b/) then                   # backspace
      "\010"
    when src.scan(/s/) then                   # space
      " "
    when src.scan(/[0-7]{1,3}/) then          # octal constant
      (src.matched.to_i(8) & 0xFF).chr
    when src.scan(/x([0-9a-fA-F]{1,2})/) then # hex constant
      src[1].to_i(16).chr
    when src.check(/M-\\[\\MCc]/) then
      src.scan(/M-\\/) # eat it
      c = self.read_escape
      c[0] = (c[0].ord | 0x80).chr
      c
    when src.scan(/M-(.)/) then
      c = src[1]
      c[0] = (c[0].ord | 0x80).chr
      c
    when src.check(/(C-|c)\\[\\MCc]/) then
      src.scan(/(C-|c)\\/) # eat it
      c = self.read_escape
      c[0] = (c[0].ord & 0x9f).chr
      c
    when src.scan(/C-\?|c\?/) then
      127.chr
    when src.scan(/(C-|c)(.)/) then
      c = src[2]
      c[0] = (c[0].ord & 0x9f).chr
      c
    when src.scan(/^[89]/i) then # bad octal or hex... MRI ignores them :(
      src.matched
    when src.scan(/[McCx0-9]/) || src.eos? then
      rb_compile_error("Invalid escape character syntax")
    else
      src.getch
    end
  end

  ##
  # Reads the option letters following a regexp terminator and
  # returns them as a string. Letters outside "ixmonesu" are a
  # compile error.

  def regx_options
    good, bad = [], []

    if src.scan(/[a-z]+/) then
      good, bad = src.matched.split(//).partition { |s| s =~ /^[ixmonesu]$/ }
    end

    unless bad.empty? then
      rb_compile_error("unknown regexp option%s - %s" %
                       [(bad.size > 1 ? "s" : ""), bad.join.inspect])
    end

    return good.join
  end

  ##
  # Clears the per-source state (token, value, string terminator,
  # scanner and lex state) and marks the start of a command.

  def reset
    self.command_start = true
    self.lex_strterm = nil
    self.token = nil
    self.yacc_value = nil

    @src = nil
    @lex_state = nil
  end

  ##
  # True when the attached parser is a Ruby18Parser.

  def ruby18
    Ruby18Parser === parser
  end

  ##
  # True when the attached parser is a Ruby19Parser.

  def ruby19
    Ruby19Parser === parser
  end

  ##
  # Sets the source to lex. +src+ must be a String; it is wrapped in
  # an RPStringScanner.

  def src= src
    raise "bad src: #{src.inspect}" unless String === src
    @src = RPStringScanner.new(src)
  end

  ##
  # Copies one escape sequence, backslash included, verbatim into
  # #string_buffer. Called by #tokadd_string for regexp literals.

  def tokadd_escape term
    case
    when src.scan(/\\\n/) then
      # just ignore
    when src.scan(/\\([0-7]{1,3}|x[0-9a-fA-F]{1,2})/) then
      self.string_buffer << src.matched
    when src.scan(/\\([MC]-|c)(?=\\)/) then
      self.string_buffer << src.matched
      self.tokadd_escape term
    when src.scan(/\\([MC]-|c)(.)/) then
      self.string_buffer << src.matched
    when src.scan(/\\[McCx]/) then
      rb_compile_error "Invalid escape character syntax"
    when src.scan(/\\(.)/m) then
      self.string_buffer << src.matched
    else
      rb_compile_error "Invalid escape character syntax"
    end
  end

  # Cache of delimiter string => Regexp matching it literally; nil
  # maps to nil so callers can use the result as a "no delimiter" flag.
  @@regexp_cache = Hash.new { |h,k| h[k] = Regexp.new(Regexp.escape(k)) }
  @@regexp_cache[nil] = nil

  ##
  # Appends literal content to #string_buffer until the terminator
  # +term+, an interpolation start, or (for word lists) whitespace is
  # reached. +func+ is a bitmask of the STR_FUNC_* flags and +paren+
  # the opening delimiter used for nesting (or nil).
  #
  # Returns the last chunk read, or RubyLexer::EOF when the input is
  # exhausted.

  def tokadd_string(func, term, paren)
    qwords = (func & STR_FUNC_QWORDS) != 0
    escape = (func & STR_FUNC_ESCAPE) != 0
    expand = (func & STR_FUNC_EXPAND) != 0
    regexp = (func & STR_FUNC_REGEXP) != 0
    symbol = (func & STR_FUNC_SYMBOL) != 0

    paren_re = @@regexp_cache[paren]
    term_re  = @@regexp_cache[term]

    until src.eos? do
      c = nil
      handled = true

      case
      when paren_re && src.scan(paren_re) then
        self.string_nest += 1
      when src.scan(term_re) then
        if self.string_nest == 0 then
          src.pos -= 1 # leave the terminator for the caller
          break
        else
          self.string_nest -= 1
        end
      when expand && src.scan(/#(?=[\$\@\{])/) then
        src.pos -= 1
        break
      when qwords && src.scan(/\s/) then
        src.pos -= 1
        break
      when expand && src.scan(/#(?!\n)/) then
        # do nothing
      when src.check(/\\/) then
        case
        when qwords && src.scan(/\\\n/) then
          string_buffer << "\n"
          next
        when qwords && src.scan(/\\\s/) then
          c = ' '
        when expand && src.scan(/\\\n/) then
          next
        when regexp && src.check(/\\/) then
          self.tokadd_escape term
          next
        when expand && src.scan(/\\/) then
          c = self.read_escape
        when src.scan(/\\\n/) then
          # do nothing
        when src.scan(/\\\\/) then
          string_buffer << '\\' if escape
          c = '\\'
        when src.scan(/\\/) then
          unless src.scan(term_re) || paren.nil? || src.scan(paren_re) then
            string_buffer << "\\"
          end
        else
          handled = false
        end # inner /\\/ case
      else
        handled = false
      end # top case

      unless handled then
        t = Regexp.escape term
        x = Regexp.escape(paren) if paren && paren != "\000"
        re = if qwords then
               if RUBY19 then
                 /[^#{t}#{x}\#\0\\\s]+|./ # |. to pick up whatever
               else
                 /[^#{t}#{x}\#\0\\\s\v]+|./ # argh. 1.8's \s doesn't pick up \v
               end
             else
               /[^#{t}#{x}\#\0\\]+|./
             end

        src.scan re
        c = src.matched

        rb_compile_error "symbol cannot contain '\\0'" if symbol && c =~ /\0/
      end # unless handled

      c ||= src.matched
      string_buffer << c
    end # until

    c ||= src.matched
    c = RubyLexer::EOF if src.eos?

    return c
  end

  # Single-token escapes (text after the backslash => replacement).
  ESCAPES = {
    "a"    => "\007",
    "b"    => "\010",
    "e"    => "\033",
    "f"    => "\f",
    "n"    => "\n",
    "r"    => "\r",
    "s"    => " ",
    "t"    => "\t",
    "v"    => "\13",
    "\\"   => '\\',
    "\n"   => "",
    "C-\?" => 127.chr,
    "c\?"  => 127.chr,
  }

  ##
  # Translates the body of an escape sequence +s+ (the text after the
  # backslash) into the string it denotes. Used for the simple
  # double-quoted string fast path in #yylex.

  def unescape s
    r = ESCAPES[s]

    return r if r

    x = case s
        when /^[0-7]{1,3}/ then
          ($&.to_i(8) & 0xFF).chr
        when /^x([0-9a-fA-F]{1,2})/ then
          $1.to_i(16).chr
        when /^M-(.)/ then
          ($1[0].ord | 0x80).chr
        when /^(C-|c)(.)/ then
          ($2[0].ord & 0x9f).chr
        when /^[89a-f]/i then # bad octal or hex... ignore? that's what MRI does :(
          s
        when /^[McCx0-9]/ then
          rb_compile_error("Invalid escape character syntax")
        else
          s
        end
    x.force_encoding "UTF-8" if RUBY19
    x
  end

  ##
  # Warning hook; currently discards +s+.

  def warning s
    # do nothing for now
  end

  ##
  # Returns the next token. Also sets yy_val is needed.
  #
  # Delegates to #yylex_string while inside a string-like literal
  # (#lex_strterm set). Otherwise skips blanks and comments, then
  # dispatches on the next character.
  #
  # @return the token type, or RubyLexer::EOF at end of input.

  def yylex
    c = ''
    self.space_seen = false
    command_state = false
    src = self.src

    self.token = nil
    self.yacc_value = nil

    return yylex_string if lex_strterm

    command_state = self.command_start
    self.command_start = false

    last_state = lex_state

    loop do # START OF CASE
      if src.scan(/[\ \t\r\f\v]/) then # \s - \n + \v
        self.space_seen = true
        next
      elsif src.check(/[^a-zA-Z]/) then
        if src.scan(/\n|#/) then
          self.lineno = nil
          c = src.matched
          if c == '#' then
            src.pos -= 1

            while src.scan(/\s*#.*(\n+|\z)/) do
              @comments << src.matched.gsub(/^ +#/, '#').gsub(/^ +$/, '')
            end

            return RubyLexer::EOF if src.eos?
          end

          # Replace a string of newlines with a single one
          src.scan(/\n+/)

          next if in_lex_state?(:expr_beg, :expr_value, :expr_class,
                                :expr_fname, :expr_dot)

          # a leading ".method" on the next line continues the expression
          if src.scan(/([\ \t\r\f\v]*)\./) then
            self.space_seen = true unless src[1].empty?

            src.pos -= 1
            next unless src.check(/\.\./)
          end

          self.command_start = true
          self.lex_state = :expr_beg
          return :tNL
        elsif src.scan(/[\]\)\}]/) then
          if src.matched == "}" then
            self.brace_nest -= 1
          else
            self.paren_nest -= 1
          end

          cond.lexpop
          cmdarg.lexpop
          tern.lexpop

          self.lex_state = if src.matched == ")" then
                             :expr_endfn
                           else
                             :expr_endarg
                           end

          self.yacc_value = src.matched
          result = {
            ")" => :tRPAREN,
            "]" => :tRBRACK,
            "}" => :tRCURLY
          }[src.matched]
          return result
        elsif src.scan(/\!/) then
          if in_lex_state?(:expr_fname, :expr_dot) then
            self.lex_state = :expr_arg

            if src.scan(/@/) then
              self.yacc_value = "!@"
              return :tUBANG
            end
          else
            self.lex_state = :expr_beg
          end

          if src.scan(/[=~]/) then
            self.yacc_value = "!#{src.matched}"
          else
            self.yacc_value = "!"
          end

          return TOKENS[self.yacc_value]
        elsif src.scan(/\.\.\.?|,|![=~]?/) then
          self.lex_state = :expr_beg
          tok = self.yacc_value = src.matched
          return TOKENS[tok]
        elsif src.check(/\./) then
          if src.scan(/\.\d/) then
            rb_compile_error "no .<digit> floating literal anymore put 0 before dot"
          elsif src.scan(/\./) then
            self.lex_state = :expr_dot
            self.yacc_value = "."
            return :tDOT
          end
        elsif src.scan(/\(/) then
          result = if ruby18 then
                     yylex_paren18
                   else
                     yylex_paren19
                   end

          self.paren_nest += 1

          self.expr_beg_push "("

          return result
        elsif src.check(/\=/) then
          if src.scan(/\=\=\=|\=\=|\=~|\=>|\=(?!begin\b)/) then
            self.fix_arg_lex_state
            tok = self.yacc_value = src.matched
            return TOKENS[tok]
          elsif src.scan(/\=begin(?=\s)/) then
            @comments << src.matched

            unless src.scan(/.*?\n=end( |\t|\f)*[^\n]*(\n|\z)/m) then
              @comments.clear
              rb_compile_error("embedded document meets end of file")
            end

            @comments << src.matched

            next
          else
            raise "you shouldn't be able to get here"
          end
        elsif src.scan(/\"(#{ESC_RE}|#(#{ESC_RE}|[^\{\#\@\$\"\\])|[^\"\\\#])*\"/o) then
          # fast path: double-quoted string without interpolation
          self.yacc_value = src.matched[1..-2].gsub(ESC_RE) { unescape $1 }
          self.lex_state = :expr_end
          return :tSTRING
        elsif src.scan(/\"/) then # FALLBACK
          self.lex_strterm = [:strterm, STR_DQUOTE, '"', "\0"] # TODO: question this
          self.yacc_value = "\""
          return :tSTRING_BEG
        elsif src.scan(/\@\@?#{IDENT_CHAR_RE}+/o) then
          self.token = src.matched

          rb_compile_error "`#{token}` is not allowed as a variable name" if
            token =~ /\@\d/

          return process_token(command_state)
        elsif src.scan(/\:\:/) then
          if is_beg? || in_lex_state?(:expr_class) || is_space_arg? then
            self.lex_state = :expr_beg
            self.yacc_value = "::"
            return :tCOLON3
          end

          self.lex_state = :expr_dot
          self.yacc_value = "::"
          return :tCOLON2
        elsif ! is_end? && src.scan(/:([a-zA-Z_]#{IDENT_CHAR_RE}*(?:[?!]|=(?==>)|=(?![=>]))?)/) then
          # scanning shortcut to symbols
          self.yacc_value = src[1]
          self.lex_state = :expr_end
          return :tSYMBOL
        elsif src.scan(/\:/) then
          # ?: / then / when
          if is_end? || src.check(/\s/) then
            self.lex_state = :expr_beg
            # TODO warn_balanced(":", "symbol literal");
            self.yacc_value = ":"
            return :tCOLON
          end

          case
          when src.scan(/\'/) then
            self.lex_strterm = [:strterm, STR_SSYM, src.matched, "\0"]
          when src.scan(/\"/) then
            self.lex_strterm = [:strterm, STR_DSYM, src.matched, "\0"]
          end

          self.lex_state = :expr_fname
          self.yacc_value = ":"
          return :tSYMBEG
        elsif src.check(/[0-9]/) then
          return parse_number
        elsif src.scan(/\[/) then
          self.paren_nest += 1

          result = src.matched

          if in_lex_state? :expr_fname, :expr_dot then
            self.lex_state = :expr_arg
            case
            when src.scan(/\]\=/) then
              self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
              self.yacc_value = "[]="
              return :tASET
            when src.scan(/\]/) then
              self.paren_nest -= 1 # HACK? I dunno, or bug in MRI
              self.yacc_value = "[]"
              return :tAREF
            else
              rb_compile_error "unexpected '['"
            end
          elsif is_beg? then
            self.tern.push false
            result = :tLBRACK
          elsif is_arg? && space_seen then
            self.tern.push false
            result = :tLBRACK
          else
            result = :tLBRACK2
          end

          self.expr_beg_push "["

          return result
        elsif src.scan(/\'(\\.|[^\'])*\'/) then
          self.yacc_value = src.matched[1..-2].gsub(/\\\\/, "\\").gsub(/\\'/, "'") # "
          self.lex_state = :expr_end
          return :tSTRING
        elsif src.check(/\|/) then
          if src.scan(/\|\|\=/) then
            self.lex_state = :expr_beg
            self.yacc_value = "||"
            return :tOP_ASGN
          elsif src.scan(/\|\|/) then
            self.lex_state = :expr_beg
            self.yacc_value = "||"
            return :tOROP
          elsif src.scan(/\|\=/) then
            self.lex_state = :expr_beg
            self.yacc_value = "|"
            return :tOP_ASGN
          elsif src.scan(/\|/) then
            self.fix_arg_lex_state
            self.yacc_value = "|"
            return :tPIPE
          end
        elsif src.scan(/\{/) then
          self.brace_nest += 1
          if lpar_beg && lpar_beg == paren_nest then
            self.lpar_beg = nil
            self.paren_nest -= 1

            expr_beg_push "{"

            return :tLAMBEG
          end

          result = if is_arg? || in_lex_state?(:expr_end, :expr_endfn) then
                     :tLCURLY      #  block (primary)
                   elsif in_lex_state?(:expr_endarg) then
                     :tLBRACE_ARG  #  block (expr)
                   else
                     self.tern.push false
                     :tLBRACE      #  hash
                   end

          self.expr_beg_push "{"
          self.command_start = true unless result == :tLBRACE

          return result
        elsif src.scan(/->/) then
          self.lex_state = :expr_endfn
          return :tLAMBDA
        elsif src.scan(/[+-]/) then
          sign = src.matched
          utype, type = if sign == "+" then
                          [:tUPLUS, :tPLUS]
                        else
                          [:tUMINUS, :tMINUS]
                        end

          if in_lex_state? :expr_fname, :expr_dot then
            self.lex_state = :expr_arg
            if src.scan(/@/) then
              self.yacc_value = "#{sign}@"
              return utype
            else
              self.yacc_value = sign
              return type
            end
          end

          if src.scan(/\=/) then
            self.lex_state = :expr_beg
            self.yacc_value = sign
            return :tOP_ASGN
          end

          if (is_beg? || (is_arg? && space_seen && !src.check(/\s/))) then
            if is_arg? then
              arg_ambiguous
            end

            self.lex_state = :expr_beg
            self.yacc_value = sign

            if src.check(/\d/) then
              if utype == :tUPLUS then
                return self.parse_number
              else
                return :tUMINUS_NUM
              end
            end

            return utype
          end

          self.lex_state = :expr_beg
          self.yacc_value = sign
          return type
        elsif src.check(/\*/) then
          if src.scan(/\*\*=/) then
            self.lex_state = :expr_beg
            self.yacc_value = "**"
            return :tOP_ASGN
          elsif src.scan(/\*\*/) then
            result = if is_space_arg? src.check(/./m) then
                       warning "`**' interpreted as argument prefix"
                       :tDSTAR
                     elsif is_beg? then
                       :tDSTAR
                     else
                       # TODO: warn_balanced("**", "argument prefix");
                       :tPOW
                     end
            self.yacc_value = "**"
            self.fix_arg_lex_state
            return result
          elsif src.scan(/\*\=/) then
            self.lex_state = :expr_beg
            self.yacc_value = "*"
            return :tOP_ASGN
          elsif src.scan(/\*/) then
            result = if is_space_arg? src.check(/./m) then
                       warning("`*' interpreted as argument prefix")
                       :tSTAR
                     elsif is_beg? then
                       :tSTAR
                     else
                       # TODO: warn_balanced("*", "argument prefix");
                       :tSTAR2 # TODO: rename
                     end

            self.yacc_value = "*"
            self.fix_arg_lex_state
            return result
          end
        elsif src.check(/\</) then
          # NOTE(review): this header, the "<=>" scan, and the "<<" / "<"
          # branches below were restored after the text was truncated;
          # confirm against the reference lexer.
          if src.scan(/\<\=\>/) then
            self.fix_arg_lex_state
            self.yacc_value = "<=>"
            return :tCMP
          elsif src.scan(/\<\=/) then
            self.fix_arg_lex_state
            self.yacc_value = "<="
            return :tLEQ
          elsif src.scan(/\<\<\=/) then
            self.fix_arg_lex_state
            self.lex_state = :expr_beg
            self.yacc_value = "\<\<"
            return :tOP_ASGN
          elsif src.scan(/\<\</) then
            # "<<" may start a heredoc unless we are clearly in an
            # operator position.
            if (!in_lex_state?(:expr_dot, :expr_class) &&
                !is_end? &&
                (!is_arg? || space_seen)) then
              tok = self.heredoc_identifier
              return tok if tok
            end

            self.fix_arg_lex_state
            self.yacc_value = "\<\<"
            return :tLSHFT
          elsif src.scan(/\</) then
            self.fix_arg_lex_state
            self.yacc_value = "<"
            return :tLT
          end
        elsif src.check(/\>/) then
          if src.scan(/\>\=/) then
            self.fix_arg_lex_state
            self.yacc_value = ">="
            return :tGEQ
          elsif src.scan(/\>\>=/) then
            self.fix_arg_lex_state
            self.lex_state = :expr_beg
            self.yacc_value = ">>"
            return :tOP_ASGN
          elsif src.scan(/\>\>/) then
            self.fix_arg_lex_state
            self.yacc_value = ">>"
            return :tRSHFT
          elsif src.scan(/\>/) then
            self.fix_arg_lex_state
            self.yacc_value = ">"
            return :tGT
          end
        elsif src.scan(/\`/) then
          self.yacc_value = "`"
          case lex_state
          when :expr_fname then
            self.lex_state = :expr_end
            return :tBACK_REF2
          when :expr_dot then
            self.lex_state = if command_state then
                               :expr_cmdarg
                             else
                               :expr_arg
                             end
            return :tBACK_REF2
          end
          self.lex_strterm = [:strterm, STR_XQUOTE, '`', "\0"]
          return :tXSTRING_BEG
        elsif src.scan(/\?/) then

          if is_end? then
            self.lex_state = ruby18 ? :expr_beg : :expr_value # HACK?
            self.tern.push true
            self.yacc_value = "?"
            return :tEH
          end

          if src.eos? then
            rb_compile_error "incomplete character syntax"
          end

          if src.check(/\s|\v/) then
            unless is_arg? then
              c2 = { " " => 's',
                    "\n" => 'n',
                    "\t" => 't',
                    "\v" => 'v',
                    "\r" => 'r',
                    "\f" => 'f' }[src.matched]

              if c2 then
                warning("invalid character syntax; use ?\\" + c2)
              end
            end

            # ternary
            self.lex_state = ruby18 ? :expr_beg : :expr_value # HACK?
            self.tern.push true
            self.yacc_value = "?"
            return :tEH
          elsif src.check(/\w(?=\w)/) then # ternary, also
            self.lex_state = :expr_beg
            self.tern.push true
            self.yacc_value = "?"
            return :tEH
          end

          c = if src.scan(/\\/) then
                self.read_escape
              else
                src.getch
              end
          self.lex_state = :expr_end

          if version == 18 then
            self.yacc_value = c[0].ord & 0xff
            return :tINTEGER
          else
            self.yacc_value = c
            return :tSTRING
          end
        elsif src.check(/\&/) then
          if src.scan(/\&\&\=/) then
            self.yacc_value = "&&"
            self.lex_state = :expr_beg
            return :tOP_ASGN
          elsif src.scan(/\&\&/) then
            self.lex_state = :expr_beg
            self.yacc_value = "&&"
            return :tANDOP
          elsif src.scan(/\&\=/) then
            self.yacc_value = "&"
            self.lex_state = :expr_beg
            return :tOP_ASGN
          elsif src.scan(/&/) then
            result = if is_arg? && space_seen &&
                         !src.check(/\s/) then
                       warning("`&' interpreted as argument prefix")
                       :tAMPER
                     elsif in_lex_state? :expr_beg, :expr_mid then
                       :tAMPER
                     else
                       :tAMPER2
                     end

            self.fix_arg_lex_state
            self.yacc_value = "&"
            return result
          end
        elsif src.scan(/\//) then
          if is_beg? then
            self.lex_strterm = [:strterm, STR_REGEXP, '/', "\0"]
            self.yacc_value = "/"
            return :tREGEXP_BEG
          end

          if src.scan(/\=/) then
            self.yacc_value = "/"
            self.lex_state = :expr_beg
            return :tOP_ASGN
          end

          if is_arg? && space_seen then
            unless src.scan(/\s/) then
              arg_ambiguous
              self.lex_strterm = [:strterm, STR_REGEXP, '/', "\0"]
              self.yacc_value = "/"
              return :tREGEXP_BEG
            end
          end

          self.fix_arg_lex_state
          self.yacc_value = "/"

          return :tDIVIDE
        elsif src.scan(/\^=/) then
          self.lex_state = :expr_beg
          self.yacc_value = "^"
          return :tOP_ASGN
        elsif src.scan(/\^/) then
          self.fix_arg_lex_state
          self.yacc_value = "^"
          return :tCARET
        elsif src.scan(/\;/) then
          self.command_start = true
          self.lex_state = :expr_beg
          self.yacc_value = ";"
          return :tSEMI
        elsif src.scan(/\~/) then
          if in_lex_state? :expr_fname, :expr_dot then
            src.scan(/@/)
          end

          self.fix_arg_lex_state
          self.yacc_value = "~"

          return :tTILDE
        elsif src.scan(/\\/) then
          if src.scan(/\r?\n/) then
            self.lineno = nil
            self.space_seen = true
            next
          end
          rb_compile_error "bare backslash only allowed before newline"
        elsif src.scan(/\%/) then
          if is_beg? then
            return parse_quote
          end

          if src.scan(/\=/) then
            self.lex_state = :expr_beg
            self.yacc_value = "%"
            return :tOP_ASGN
          end

          return parse_quote if is_arg? && space_seen && ! src.check(/\s/)

          self.fix_arg_lex_state
          self.yacc_value = "%"

          return :tPERCENT
        elsif src.check(/\$/) then
          if src.scan(/(\$_)(\w+)/) then
            self.lex_state = :expr_end
            self.token = src.matched
            return process_token(command_state)
          elsif src.scan(/\$_/) then
            self.lex_state = :expr_end
            self.token = src.matched
            self.yacc_value = src.matched
            return :tGVAR
          elsif src.scan(/\$[~*$?!@\/\\;,.=:<>\"]|\$-\w?/) then
            self.lex_state = :expr_end
            self.yacc_value = src.matched
            return :tGVAR
          elsif src.scan(/\$([\&\`\'\+])/) then
            self.lex_state = :expr_end
            # Explicit reference to these vars as symbols...
            if last_state == :expr_fname then
              self.yacc_value = src.matched
              return :tGVAR
            else
              self.yacc_value = src[1].to_sym
              return :tBACK_REF
            end
          elsif src.scan(/\$([1-9]\d*)/) then
            self.lex_state = :expr_end
            if last_state == :expr_fname then
              self.yacc_value = src.matched
              return :tGVAR
            else
              self.yacc_value = src[1].to_i
              return :tNTH_REF
            end
          elsif src.scan(/\$0/) then
            self.lex_state = :expr_end
            self.token = src.matched
            return process_token(command_state)
          elsif src.scan(/\$\W|\$\z/) then # TODO: remove?
            self.lex_state = :expr_end
            self.yacc_value = "$"
            return "$"
          elsif src.scan(/\$\w+/)
            self.lex_state = :expr_end
            self.token = src.matched
            return process_token(command_state)
          end
        elsif src.check(/\_/) then
          if src.beginning_of_line? && src.scan(/\__END__(\r?\n|\Z)/) then
            self.lineno = nil
            return RubyLexer::EOF
          elsif src.scan(/\_\w*/) then
            self.token = src.matched
            return process_token(command_state)
          end
        end
      end # END OF CASE

      if src.scan(/\004|\032|\000/) || src.eos? then # ^D, ^Z, EOF
        return RubyLexer::EOF
      else # alpha check
        rb_compile_error "Invalid char #{src.rest[0].chr} in expression" unless
          src.check IDENT_RE
      end

      self.token = src.matched if self.src.scan IDENT_RE

      return process_token(command_state)
    end
  end

  ##
  # Chooses the token for "(" when parsing as ruby 1.8; also marks
  # the start of a command.

  def yylex_paren18
    self.command_start = true
    result = :tLPAREN2

    if in_lex_state? :expr_beg, :expr_mid then
      result = :tLPAREN
    elsif space_seen then
      if in_lex_state? :expr_cmdarg then
        result = :tLPAREN_ARG
      elsif in_lex_state? :expr_arg then
        self.tern.push false
        warning "don't put space before argument parentheses"
      end
    else
      self.tern.push false
    end

    result
  end

  ##
  # Chooses the token for "(" when not parsing as ruby 1.8.

  def yylex_paren19
    if is_beg? then
      :tLPAREN
    elsif is_space_arg? then
      :tLPAREN_ARG
    else
      :tLPAREN2 # plain '(' in parse.y
    end
  end

  ##
  # True in :expr_arg or :expr_cmdarg state.

  def is_arg?
    in_lex_state? :expr_arg, :expr_cmdarg
  end

  ##
  # True in :expr_end, :expr_endarg or :expr_endfn state.

  def is_end?
    in_lex_state? :expr_end, :expr_endarg, :expr_endfn
  end

  ##
  # True in :expr_beg, :expr_value, :expr_mid or :expr_class state.

  def is_beg?
    in_lex_state? :expr_beg, :expr_value, :expr_mid, :expr_class
  end

  # TODO #define IS_AFTER_OPERATOR() IS_lex_state(EXPR_FNAME | EXPR_DOT)

  ##
  # True when in an argument state, a space was just seen and +c+
  # (the upcoming text, "x" by default) does not contain whitespace.

  def is_space_arg? c = "x"
    is_arg? and space_seen and c !~ /\s/
  end

  ##
  # True when an identifier followed by ":" may be a label here.

  def is_label_possible? command_state
    (in_lex_state?(:expr_beg) && !command_state) || is_arg?
  end

  ##
  # Classifies the identifier-like text in #token (variable, constant,
  # method name, label or keyword), updates the lex state and
  # #yacc_value, and returns the token type.

  def process_token(command_state)

    token << src.matched if token =~ IDENT_RE && src.scan(/[\!\?](?!=)/)

    result = nil
    last_state = lex_state

    case token
    when /^\$/ then
      self.lex_state, result = :expr_end, :tGVAR
    when /^@@/ then
      self.lex_state, result = :expr_end, :tCVAR
    when /^@/ then
      self.lex_state, result = :expr_end, :tIVAR
    else
      if token =~ /[!?]$/ then
        result = :tFID
      else
        if in_lex_state? :expr_fname then
          # ident=, not =~ => == or followed by =>
          # TODO test lexing of a=>b vs a==>b
          if src.scan(/=(?:(?![~>=])|(?==>))/) then
            result = :tIDENTIFIER
            token << src.matched
          end
        end

        result ||= if token =~ /^[A-Z]/ then
                     :tCONSTANT
                   else
                     :tIDENTIFIER
                   end
      end

      unless ruby18
        if is_label_possible? command_state then
          colon = src.scan(/:/)

          if colon && src.peek(1) != ":" then
            self.lex_state = :expr_beg
            self.yacc_value = [token, src.lineno]
            return :tLABEL
          end

          src.unscan if colon
        end
      end

      unless in_lex_state? :expr_dot then
        # See if it is a reserved word.
        keyword = if ruby18 then # REFACTOR need 18/19 lexer subclasses
                    RubyParserStuff::Keyword.keyword18 token
                  else
                    RubyParserStuff::Keyword.keyword19 token
                  end

        if keyword then
          state           = lex_state
          self.lex_state  = keyword.state
          self.yacc_value = [token, src.lineno]

          if state == :expr_fname then
            self.yacc_value = keyword.name
            return keyword.id0
          end

          self.command_start = true if lex_state == :expr_beg

          if keyword.id0 == :kDO then
            if lpar_beg && lpar_beg == paren_nest then
              self.lpar_beg = nil
              self.paren_nest -= 1

              return :kDO_LAMBDA
            end

            return :kDO_COND  if cond.is_in_state
            return :kDO_BLOCK if cmdarg.is_in_state && state != :expr_cmdarg
            return :kDO_BLOCK if [:expr_beg, :expr_endarg].include? state
            return :kDO
          end

          return keyword.id0 if [:expr_beg, :expr_value].include? state

          self.lex_state = :expr_beg if keyword.id0 != keyword.id1

          return keyword.id1
        end
      end

      # TODO:
      # if (mb == ENC_CODERANGE_7BIT && lex_state != EXPR_DOT) {

      self.lex_state =
        if is_beg? || is_arg? || in_lex_state?(:expr_dot) then
          if command_state then
            :expr_cmdarg
          else
            :expr_arg
          end
        elsif !ruby18 && in_lex_state?(:expr_fname) then
          :expr_endfn
        else
          :expr_end
        end

    end

    self.yacc_value = token

    # a known local variable always ends the expression
    if (![:expr_dot, :expr_fname].include?(last_state) &&
        self.parser.env[token.to_sym] == :lvar) then
      self.lex_state = :expr_end
    end

    return result
  end

  ##
  # Lexes the next token while inside a string-like literal,
  # dispatching to #heredoc or #parse_string. Clears #lex_strterm and
  # moves to :expr_end once the literal is terminated.

  def yylex_string
    token = if lex_strterm[0] == :heredoc then
              self.heredoc lex_strterm
            else
              self.parse_string lex_strterm
            end

    if token == :tSTRING_END || token == :tREGEXP_END then
      self.lineno      = nil
      self.lex_strterm = nil
      self.lex_state   = :expr_end
    end

    return token
  end
end