lib/rubylexer.rb in rubylexer-0.7.2 vs lib/rubylexer.rb in rubylexer-0.7.3

- old
+ new

@@ -50,14 +50,18 @@ WHSPCHARS=WHSPLF+"\\#" OPORBEGINWORDLIST=%w(if unless while until) BEGINWORDLIST=%w(def class module begin for case do)+OPORBEGINWORDLIST OPORBEGINWORDS="(#{OPORBEGINWORDLIST.join '|'})" BEGINWORDS=/^(#{BEGINWORDLIST.join '|'})$/o - FUNCLIKE_KEYWORDS=/^(break|next|redo|return|yield|retry|super|BEGIN|END)$/ - VARLIKE_KEYWORDS=/^(__FILE__|__LINE__|false|nil|self|true)$/ - INNERBOUNDINGWORDS="(else|elsif|ensure|in|then|rescue|when)" - BINOPWORDS="(and|or)" + FUNCLIKE_KEYWORDLIST=%w/break next redo return yield retry super BEGIN END/ + FUNCLIKE_KEYWORDS=/^(#{FUNCLIKE_KEYWORDLIST.join '|'})$/ + VARLIKE_KEYWORDLIST=%w/__FILE__ __LINE__ false nil self true/ + VARLIKE_KEYWORDS=/^(#{VARLIKE_KEYWORDLIST.join '|'})$/ + INNERBOUNDINGWORDLIST=%w"else elsif ensure in then rescue when" + INNERBOUNDINGWORDS="(#{INNERBOUNDINGWORDLIST.join '|'})" + BINOPWORDLIST=%w"and or" + BINOPWORDS="(#{BINOPWORDLIST.join '|'})" NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)([^a-zA-Z0-9_!?=]|\Z)/o NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu'] #chars that begin NEVERSTARTPARAMLIST NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST RUBYKEYWORDS=%r{ @@ -153,10 +157,11 @@ def localvars; @localvars_stack.last end + attr_accessor :in_def attr :localvars_stack attr :offset_adjust attr_writer :pending_here_bodies #----------------------------------- @@ -256,10 +261,11 @@ end private #----------------------------------- def inside_method_def? + return true if (defined? @in_def) and @in_def @parsestack.reverse_each{|ctx| ctx.starter=='def' and ctx.state!=:saw_def and return true } return false end @@ -374,24 +380,27 @@ assert MethNameToken===@last_operative_token || !(@last_operative_token===/^(\.|::|(un)?def|alias)$/) @moretokens.unshift(*parse_keywords(str,oldpos) do |tok| #if not a keyword, case str - when FUNCLIKE_KEYWORDS; #do nothing + when FUNCLIKE_KEYWORDS; except=tok when VARLIKE_KEYWORDS,RUBYKEYWORDS; raise "shouldnt see keywords here, now" end was_last=@last_operative_token @last_operative_token=tok if tok - safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) } + normally=safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) } + (Array===normally ? normally[0]=except : normally=except) if except + normally end) return @moretokens.shift end #----------------------------------- IDENTREX={} def identifier_as_string(context) #must begin w/ letter or underscore + #char class needs changing here for utf8 support /[_a-z]/i===nextchar.chr or return #equals, question mark, and exclamation mark #might be allowed at the end in some contexts. #(in def headers and symbols) @@ -407,11 +416,11 @@ # when ?: then "!(?![=])|\\?|=(?![=~>])" else "!(?![=])|\\?" end @in_def_name||context==?: and trailers<<"|=(?![=~>])" - @file.scan(IDENTREX[trailers]||=/^[_a-z][a-z0-9_]*(?:#{trailers})?/i) + @file.scan(IDENTREX[trailers]||=/^(?>[_a-z][a-z0-9_]*(?:#{trailers})?)/i) end #----------------------------------- #contexts in which comma may appear in ruby: #multiple lhs (terminated by assign op) @@ -448,11 +457,11 @@ def in_lvar_define_state lasttok=@last_operative_token #@defining_lvar is a hack @defining_lvar or case ctx=@parsestack.last #when ForSMContext; ctx.state==:for when RescueSMContext - lasttok.ident=="=>" and @file.match? /\A[\s\v]*([:;#\n]|then[^a-zA-Z0-9_])/m + lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then[^a-zA-Z0-9_])/m ) #when BlockParamListLhsContext; true end end IMPLICIT_PARENS_BEFORE_ACCESSOR_ASSIGNMENT=2 @@ -587,33 +596,33 @@ # MethNameToken===lasttok or # RUBYNONSYMOPERATORREX===lastid && /=$/===lastid && '!='!=lastid # ) #look ahead for closing paren (after some whitespace...) - want_parens=false if @file.match? /\A.(?:\s|\v|\#.*\n)*\)/ + want_parens=false if @file.match?( /\A.(?:\s|\v|\#.*\n)*\)/ ) # afterparen=@file.pos # getchar # ignored_tokens(true) # want_parens=false if nextchar==?) # @file.pos=afterparen want_parens=true if /^(return|break|next)$/===@last_operative_token.ident and not( - KeywordToken===lasttok and /^(.|::)$/===lasttok.ident + KeywordToken===lasttok and /^(\.|::)$/===lasttok.ident ) want_parens ? 1 : 0 when ?},?],?),?;,(?^ unless @enable_macro), ?|, ?>, ?,, ?., ?=; 2 when ?+, ?-, ?%, ?/, (?^ if @enable_macro) if /^(return|break|next)$/===@last_operative_token.ident and not( - KeywordToken===lasttok and /^(.|::)$/===lasttok.ident + KeywordToken===lasttok and /^(\.|::)$/===lasttok.ident ) 1 else (ws_toks.empty? || readahead(2)[/^.[#{WHSPLF}]/o]) ? 2 : 3 end when ?*, ?& # lasttok=@last_operative_token if /^(return|break|next)$/===@last_operative_token.ident and not( - KeywordToken===lasttok and /^(.|::)$/===lasttok.ident + KeywordToken===lasttok and /^(\.|::)$/===lasttok.ident ) 1 else (ws_toks.empty? || readahead(2)[/^.[#{WHSPLF}*&]/o]) ? 2 : 3 end @@ -663,11 +672,11 @@ #only 1 param in list result.unshift ImplicitParamListStartToken.new(oldpos) last=result.last last.set_callsite! false if last.respond_to? :callsite? and last.callsite? #KeywordToken===last and last.ident==')' if /^(break|next|return)$/===name and - !(KeywordToken===lasttok and /^(.|::)$/===lasttok.ident) + !(KeywordToken===lasttok and /^(\.|::)$/===lasttok.ident) ty=KWParamListContextNoParen else ty=ParamListContextNoParen end @parsestack.push ty.new(@linenum) @@ -800,10 +809,13 @@ end #----------------------------------- def enable_macros! @enable_macro="macro" + class <<self + alias keyword_macro keyword_def + end end public :enable_macros! #----------------------------------- @@ -834,17 +846,20 @@ #----------------------------------- #parse keywords now, to prevent confusion over bare symbols #and match end with corresponding preceding def or class or whatever. #if arg is not a keyword, the block is called - def parse_keywords(str,offset) + def parse_keywords(str,offset,&block) assert @moretokens.empty? - assert !(KeywordToken===@last_operative_token and /A(.|::|def)\Z/===@last_operative_token.ident) + assert !(KeywordToken===@last_operative_token and /A(\.|::|def)\Z/===@last_operative_token.ident) result=[KeywordToken.new(str,offset)] - case str - when "end" + m="keyword_#{str}" + respond_to?(m) ? (send m,str,offset,result,&block) : block[MethNameToken.new(str)] + end + public #these have to be public so respond_to? can see them (sigh) + def keyword_end(str,offset,result) result.unshift(*abort_noparens!(str)) @parsestack.last.see self,:semi #sorta hacky... should make an :end event instead? =begin not needed? if ExpectDoOrNlContext===@parsestack.last @@ -857,12 +872,14 @@ ctx=@parsestack.pop start,line=ctx.starter,ctx.linenum BEGINWORDS===start or lexerror result.last, "end does not match #{start or "nil"}" /^(do)$/===start and localvars.end_block /^(class|module|def)$/===start and @localvars_stack.pop + return result + end - when "module" + def keyword_module(str,offset,result) result.first.has_end! @parsestack.push WantsEndContext.new(str,@linenum) @localvars_stack.push SymbolTable.new offset=input_position @file.scan(/\A(#@@WSTOKS)?(::)?/o) @@ -883,50 +900,73 @@ else incr=0 end @moretokens.push VarNameToken.new(name,offset+incr) break unless dc + @moretokens.push NoWsToken.new(offset+md.end(0)-2) @moretokens.push KeywordToken.new('::',offset+md.end(0)-2) end @moretokens.push EndHeaderToken.new(input_position) - + return result + end - when "class" + def keyword_class(str,offset,result) result.first.has_end! @parsestack.push ClassContext.new(str,@linenum) + return result + end + - when "if","unless" #could be infix form without end + def keyword_if(str,offset,result) #could be infix form without end if after_nonid_op?{false} #prefix form result.first.has_end! @parsestack.push WantsEndContext.new(str,@linenum) @parsestack.push ExpectThenOrNlContext.new(str,@linenum) else #infix form result.unshift(*abort_noparens!(str)) end - when "elsif" + return result + end + alias keyword_unless keyword_if + + def keyword_elsif(str,offset,result) + result.unshift(*abort_noparens!(str)) @parsestack.push ExpectThenOrNlContext.new(str,@linenum) - when "begin","case" + return result + end + def keyword_begin(str,offset,result) result.first.has_end! @parsestack.push WantsEndContext.new(str,@linenum) - when "while","until" #could be infix form without end + return result + end + + alias keyword_case keyword_begin + def keyword_while(str,offset,result) #could be infix form without end if after_nonid_op?{false} #prefix form result.first.has_end! @parsestack.push WantsEndContext.new(str,@linenum) expect_do_or_end_or_nl! str else #infix form result.unshift(*abort_noparens!(str)) end - when "for" + return result + end + + alias keyword_until keyword_while + + def keyword_for(str,offset,result) result.first.has_end! result.push KwParamListStartToken.new(offset+str.length) # corresponding EndToken emitted leaving ForContext ("in" branch, below) @parsestack.push WantsEndContext.new(str,@linenum) #expect_do_or_end_or_nl! str #handled by ForSMContext now @parsestack.push ForSMContext.new(@linenum) - when "do" + return result + end + def keyword_do(str,offset,result) result.unshift(*abort_noparens_for_do!(str)) if ExpectDoOrNlContext===@parsestack.last @parsestack.pop assert WantsEndContext===@parsestack.last result.last.as=";" @@ -934,16 +974,21 @@ result.last.has_end! @parsestack.push WantsEndContext.new(str,@linenum) localvars.start_block block_param_list_lookahead end - when "def",@enable_macro + return result + end + def keyword_def(str,offset,result) #macros too, if enabled result.first.has_end! @parsestack.push ctx=DefContext.new(@linenum) ctx.state=:saw_def - safe_recurse { |aa| - set_last_token KeywordToken.new str #hack + old_moretokens=@moretokens + @moretokens=[] + aa=@moretokens + #safe_recurse { |aa| + set_last_token KeywordToken.new(str) #hack result.concat ignored_tokens #read an expr like a.b.c or a::b::c #or (expr).b.c if nextchar==?( #look for optional parenthesised head @@ -958,11 +1003,11 @@ EoiToken===tok and lexerror tok, "eof in def header" result << tok end until parencount==0 #@parsestack.size==old_size @localvars_stack.push SymbolTable.new else #no parentheses, all tail - set_last_token KeywordToken.new "." #hack hack + set_last_token KeywordToken.new(".") #hack hack tokindex=result.size result << tok=symbol(false,false) name=tok.to_s assert !in_lvar_define_state @@ -1004,15 +1049,15 @@ #b and c should be considered varnames only if #they are capitalized and preceded by :: . #a could even be a keyword (eg self or block_given?). end #read tail: .b.c.d etc - result.reverse_each{|res| break set_last_token res unless StillIgnoreToken===res} + result.reverse_each{|res| break set_last_token( res ) unless StillIgnoreToken===res} assert !(IgnoreToken===@last_operative_token) state=:expect_op @in_def_name=true - loop do + while true #look for start of parameter list nc=(@moretokens.empty? ? nextchar.chr : @moretokens.first.to_s[0,1]) if state==:expect_op and /^[a-z_(&*]/i===nc ctx.state=:def_param_list @@ -1039,54 +1084,59 @@ lexerror tok,'expected . or ::' unless state==:expect_name state=:expect_op when /^(\.|::)$/.token_pat lexerror tok,'expected ident' unless state==:expect_op if endofs - result.insert -2, ImplicitParamListEndToken.new(endofs) + result.insert( -2, ImplicitParamListEndToken.new(endofs) ) endofs=nil end state=:expect_name when /^(;|end)$/.token_pat, NewlineToken #are we done with def name? ctx.state=:def_body state==:expect_op or lexerror tok,'expected identifier' if endofs - result.insert -2,ImplicitParamListEndToken.new(tok.offset) + result.insert( -2,ImplicitParamListEndToken.new(tok.offset) ) end - result.insert -2, EndHeaderToken.new(tok.offset) + result.insert( -2, EndHeaderToken.new(tok.offset) ) break else lexerror(tok, "bizarre token in def name: " + "#{tok}:#{tok.class}") end end @in_def_name=false - } - when "alias" + #} + @moretokens= old_moretokens.concat @moretokens + return result + end + def keyword_alias(str,offset,result) safe_recurse { |a| - set_last_token KeywordToken.new "alias" #hack + set_last_token KeywordToken.new( "alias" )#hack result.concat ignored_tokens res=symbol(eat_next_if(?:),false) unless res lexerror(result.first,"bad symbol in alias") else res.ident[0]==?$ and res=VarNameToken.new(res.ident,res.offset) result<< res - set_last_token KeywordToken.new "alias" #hack + set_last_token KeywordToken.new( "alias" )#hack result.concat ignored_tokens res=symbol(eat_next_if(?:),false) unless res lexerror(result.first,"bad symbol in alias") else res.ident[0]==?$ and res=VarNameToken.new(res.ident,res.offset) result<< res end end } - when "undef" + return result + end + def keyword_undef(str,offset,result) safe_recurse { |a| loop do - set_last_token KeywordToken.new "," #hack + set_last_token KeywordToken.new( "," )#hack result.concat ignored_tokens tok=symbol(eat_next_if(?:),false) tok or lexerror(result.first,"bad symbol in undef") result<< tok set_last_token tok @@ -1099,22 +1149,26 @@ tok= single_char_token(?,) result<< tok end } + return result + end # when "defined?" #defined? might have a baresymbol following it #does it need to be handled specially? #it would seem not..... - when "when" + def keyword_when(str,offset,result) #abort_noparens! emits EndToken on leaving context result.unshift(*abort_noparens!(str)) result.push KwParamListStartToken.new( offset+str.length) @parsestack.push WhenParamListContext.new(str,@linenum) + return result + end - when "rescue" + def keyword_rescue(str,offset,result) unless after_nonid_op? {false} #rescue needs to be treated differently when in operator context... #i think no RescueSMContext should be pushed on the stack... result.first.set_infix! #plus, the rescue token should be marked as infix result.unshift(*abort_noparens_for_rescue!(str)) @@ -1122,48 +1176,64 @@ result.push KwParamListStartToken.new(offset+str.length) #corresponding EndToken emitted by abort_noparens! on leaving rescue context @parsestack.push RescueSMContext.new(@linenum) result.unshift(*abort_noparens!(str)) end + return result + end - when "then" + def keyword_then(str,offset,result) result.unshift(*abort_noparens!(str)) @parsestack.last.see self,:then if ExpectThenOrNlContext===@parsestack.last @parsestack.pop else #error... does anyone care? end + return result + end - when "in" + def keyword_in(str,offset,result) result.unshift KwParamListEndToken.new( offset) result.unshift(*abort_noparens!(str)) @parsestack.last.see self,:in + return result + end - when /\A(#{BINOPWORDS}|#{INNERBOUNDINGWORDS})\Z/o + def _keyword_innerbounding(str,offset,result) result.unshift(*abort_noparens!(str)) + return result + end + for kw in BINOPWORDLIST+INNERBOUNDINGWORDLIST-["in","then","rescue","when","elsif"] + alias_method "keyword_#{kw}".to_sym, :_keyword_innerbounding + end - when /\A(return|break|next)\Z/ + def keyword_return(str,offset,result) fail if KeywordToken===@last_operative_token and @last_operative_token===/\A(\.|::)\Z/ tok=KeywordToken.new(str,offset) result=yield tok result[0]=tok tok.has_no_block! + return result + end + + alias keyword_break keyword_return + alias keyword_next keyword_return - when 'END' + def keyword_END(str,offset,result) #END could be treated, lexically, just as if it is an #ordinary method, except that local vars created in #END blocks are visible to subsequent code. (Why??) #That difference forces a custom parsing. if @last_operative_token===/^(\.|::)$/ result=yield MethNameToken.new(str) #should pass a methname token here else safe_recurse{ old=result.first result=[ - MethNameToken.new(old.ident,old.offset), + KeywordToken.new(old.ident,old.offset), ImplicitParamListStartToken.new(input_position), ImplicitParamListEndToken.new(input_position), *ignored_tokens ] getchar=='{' or lexerror(result.first,"expected { after #{str}") @@ -1171,24 +1241,35 @@ result.last.set_infix! result.last.as="do" @parsestack.push BeginEndContext.new(str,offset) } end + return result + end - when FUNCLIKE_KEYWORDS - result=yield MethNameToken.new(str) #should be a keyword token? - when RUBYKEYWORDS - #do nothing - - else result=yield MethNameToken.new(str) - - end - - return result + def _keyword_funclike(str,offset,result) + if @last_operative_token===/^(\.|::)$/ + result=yield MethNameToken.new(str) #should pass a methname token here + else + result=yield KeywordToken.new(str) + end + return result end + for kw in FUNCLIKE_KEYWORDLIST-["END","return","break","next"] do + alias_method "keyword_#{kw}".to_sym, :_keyword_funclike + end + + def _keyword_varlike(str,offset,result) + #do nothing + return result + end + for kw in VARLIKE_KEYWORDLIST+["defined?", "not"] do + alias_method "keyword_#{kw}".to_sym, :_keyword_varlike + end + private #----------------------------------- def parsestack_lastnonassign_is?(obj) @parsestack.reverse_each{|ctx| case ctx @@ -1219,11 +1300,11 @@ #----------------------------------- def block_param_list_lookahead safe_recurse{ |la| - set_last_token KeywordToken.new ';' + set_last_token KeywordToken.new( ';' ) a=ignored_tokens if eat_next_if(?|) a<< KeywordToken.new("|", input_position-1) if true @@ -1265,11 +1346,11 @@ fixme %#moretokens might be set from get1token call above...might be bad# end end end - set_last_token KeywordToken.new ';' + set_last_token KeywordToken.new( ';' ) #a.concat ignored_tokens #assert @last_operative_token===';' #a<<get1token @@ -1311,11 +1392,11 @@ end class << endingblock alias === call end - set_last_token KeywordToken.new ',' #hack + set_last_token KeywordToken.new( ',' )#hack #read local parameter names nextvar=nil loop do expect_name=(@last_operative_token===',' and normal_comma_level==@parsestack.size) @@ -1346,11 +1427,11 @@ nextvar=tok.ident localvars[nextvar]=false #remove nextvar from list of local vars for now when /^[&*]$/.token_pat #unary form... #a NoWsToken is also expected... read it now result.concat maybe_no_ws_token #not needed? - set_last_token KeywordToken.new ',' + set_last_token KeywordToken.new( ',' ) else lexerror tok,"unfamiliar var name '#{tok}'" end elsif /^,$/.token_pat===tok if normal_comma_level+1==@parsestack.size and @@ -1380,11 +1461,11 @@ result.concat ignored_tokens # if !eof? and nextchar.chr[/[iuw\/<|>+\-*&%?:({]/] and # !(NewlineToken===@last_operative_token) and # !(/^(end|;)$/===@last_operative_token) #result<<EndHeaderToken.new(result.last.offset+result.last.to_s.size) - set_last_token KeywordToken.new ';' + set_last_token KeywordToken.new( ';' ) result<< get1token # end } return result,listend @@ -1765,11 +1846,11 @@ @base_file=nil result="\n" end @offset_adjust=@min_offset_adjust - @moretokens.push *optional_here_bodies + @moretokens.push( *optional_here_bodies ) ln=@linenum @moretokens.push lexerror(EscNlToken.new(@filename,ln-1,result,input_position-result.size), error), FileAndLineToken.new(@filename,ln,input_position) start_of_line_directives @@ -1852,11 +1933,11 @@ assert("\n"==prevchar) back1char #-1 to make newline char the next to read @linenum-=1 - assert /[\r\n]/===nextchar.chr + assert( /[\r\n]/===nextchar.chr ) #retr evrything til next nl if FASTER_STRING_ESCAPES line=all_quote("\r\n", tofill.quote, "\r\n") else @@ -2251,13 +2332,13 @@ result=@moretokens.shift end @parsestack.last.see self,:arrow when '': #plain assignment: record local variable definitions last_context_not_implicit.lhs=false - @moretokens.push *ignored_tokens(true).map{|x| + @moretokens.push( *ignored_tokens(true).map{|x| NewlineToken===x ? EscNlToken.new(@filename,@linenum,x.ident,x.offset) : x - } + } ) @parsestack.push AssignmentRhsContext.new(@linenum) if eat_next_if ?* tok=OperatorToken.new('*', input_position-1) tok.unary=true @moretokens.push tok @@ -2399,11 +2480,11 @@ tokch.as="do" #=begin not needed now, i think # 'need to find matching callsite context and end it if implicit' lasttok=last_operative_token if !(lasttok===')' and lasttok.callsite?) #or ParamListContextNoParen===parsestack.last - @moretokens.push *(abort_1_noparen!(1).push tokch) + @moretokens.push( *(abort_1_noparen!(1).push tokch) ) tokch=@moretokens.shift end #=end localvars.start_block @@ -2473,10 +2554,18 @@ end #----------------------------------- def comma(ch) @moretokens.push token=single_char_token(ch) + + #if assignment rhs seen inside method param list, when param list, array or hash literal, + # rescue where comma is expected, or method def param list + # then end the assignment rhs now + #+[OBS,ParamListContext|ParamListContextNoParen|WhenParamListContext|ListImmedContext| + # (RescueSMContext&-{:state=>:rescue})|(DefContext&-{:in_body=>FalseClass|nil}), + # AssignmentRhsContext + #]===@parsestack if AssignmentRhsContext===@parsestack[-1] and ParamListContext===@parsestack[-2] || ParamListContextNoParen===@parsestack[-2] || WhenParamListContext===@parsestack[-2] || ListImmedContext===@parsestack[-2] || @@ -2485,12 +2574,12 @@ @parsestack.pop @moretokens.unshift AssignmentRhsListEndToken.new(input_position) end token.comma_type= case @parsestack[-1] - when AssignmentRhsContext: :rhs - when ParamListContext,ParamListContextNoParen: :call - when ListImmedContext: :array + when AssignmentRhsContext; :rhs + when ParamListContext,ParamListContextNoParen; :call + when ListImmedContext; :array else :lhs if comma_in_lvalue_list? end @parsestack.last.see self,:comma return @moretokens.shift