lib/rubylexer.rb in rubylexer-0.7.3 vs lib/rubylexer.rb in rubylexer-0.7.4

- old
+ new

@@ -1,6 +1,6 @@ -=begin legal crap +=begin rubylexer - a ruby lexer written in ruby Copyright (C) 2004,2005,2008 Caleb Clausen This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -58,22 +58,24 @@ VARLIKE_KEYWORDS=/^(#{VARLIKE_KEYWORDLIST.join '|'})$/ INNERBOUNDINGWORDLIST=%w"else elsif ensure in then rescue when" INNERBOUNDINGWORDS="(#{INNERBOUNDINGWORDLIST.join '|'})" BINOPWORDLIST=%w"and or" BINOPWORDS="(#{BINOPWORDLIST.join '|'})" - NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)([^a-zA-Z0-9_!?=]|\Z)/o - NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu'] #chars that begin NEVERSTARTPARAMLIST - NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST RUBYKEYWORDS=%r{ ^(alias|#{BINOPWORDS}|defined\?|not|undef|end| #{VARLIKE_KEYWORDS}|#{FUNCLIKE_KEYWORDS}| #{INNERBOUNDINGWORDS}|#{BEGINWORDS} )$ }xo #__END__ should not be in this set... its handled in start_of_line_directives + HIGHASCII=?\x80..?\xFF + NONASCII=HIGHASCII + #NONASCII=?\x80..?xFFFFFFFF #or is it 10FFFF, whatever the highest conceivable code point + + CHARMAPPINGS = { ?$ => :dollar_identifier, ?@ => :at_identifier, ?a..?z => :identifier, ?A..?Z => :identifier, @@ -113,18 +115,47 @@ "[({" => :open_brace, "])}" => :close_brace, - ?# => :comment + ?# => :comment, + + NONASCII => :identifier, } attr_reader :incomplete_here_tokens, :parsestack, :last_token_maybe_implicit + UCLETTER=@@UCLETTER="[A-Z]" + #cheaters way, treats utf chars as always 1 byte wide + #all high-bit chars are lowercase letters + #works, but strings compare with strict binary identity, not unicode collation + #works for euc too, I think + #(the ruby spec for utf8 support permits this interpretation) + LCLETTER=@@LCLETTER="[a-z_\x80-\xFF]" + LETTER=@@LETTER="[A-Za-z_\x80-\xFF]" + LETTER_DIGIT=@@LETTER_DIGIT="[A-Za-z_0-9\x80-\xFF]" + eval %w[UCLETTER LCLETTER LETTER LETTER_DIGIT].map{|n| " + def #{n}; #{n}; end + def self.#{n}; @@#{n}; end + " + }.to_s + + NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)((?:(?!#@@LETTER_DIGIT).)|\Z)/om + NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu'] #chars that begin NEVERSTARTPARAMLIST + NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST + +=begin + require 'jcode' + utf8=String::PATTERN_UTF8 #or euc, or sjis... + LCLETTER_U="(?>[a-z_]|#{utf8})" + LETTER_U="(?>[A-Za-z_]|#{utf8})" + IDENTCHAR_U="(?>[A-Za-z_0-9]|#{utf8})" +=end + #----------------------------------- - def initialize(filename,file,linenum=1,offset_adjust=0) + def initialize(filename,file,linenum=1,offset_adjust=0,options={:rubyversion=>1.8}) @offset_adjust=0 #set again in next line super(filename,file, linenum,offset_adjust) @start_linenum=linenum @parsestack=[TopLevelContext.new] @incomplete_here_tokens=[] #not used anymore @@ -135,17 +166,65 @@ @last_operative_token=nil @last_token_maybe_implicit=nil @enable_macro=nil @base_file=nil @progress_thread=nil + @rubyversion=options[:rubyversion] + @encoding=options[:encoding]||:detect @toptable=CharHandler.new(self, :illegal_char, CHARMAPPINGS) + read_leading_encoding start_of_line_directives progress_printer end + ENCODING_ALIASES={ + 'utf-8'=>'utf8', + + 'ascii-8bit'=>'binary', + 'ascii-7bit'=>'ascii', + 'euc-jp'=>'euc', + + 'ascii8bit'=>'binary', + 'ascii7bit'=>'ascii', + 'eucjp'=>'euc', + + 'us-ascii'=>'ascii', + 'shift-jis'=>'sjis', + + 'autodetect'=>'detect', + } + ENCODINGS=%w[ascii binary utf8 euc sjis] + def read_leading_encoding + return unless @encoding==:detect + @encoding=:ascii + @encoding=:utf8 if @file.skip( /\xEF\xBB\xBF/ ) #bom + if @file.skip( /\A#!/ ) + loop do + til_charset( /[\s\v]/ ) + break if @file.skip( / ([^-\s\v]|--[\s\v])/,4 ) + if @file.skip( /.-K(.)/ ) + case $1 + when 'u'; @encoding=:utf8 + when 'e'; @encoding=:euc + when 's'; @encoding=:sjis + end + end + end + til_charset( /[\n]/ ) + end + if @rubyversion>=1.9 and @file.skip( + /\A#[\x00-\x7F]*?(?:en)?coding[\s\v]*[:=][\s\v]*([a-z0-9_-]+)[\x00-\x7F]*\n/i + ) + name=$1 + name.downcase! + name=ENCODING_ALIASES[name] if ENCODING_ALIASES[name] + @encoding=name.to_sym if ENCODINGS.include? name + end + end + def progress_printer return unless ENV['RL_PROGRESS'] $stderr.puts 'printing progresses' @progress_thread=Thread.new do until EoiToken===@last_operative_token @@ -161,10 +240,11 @@ attr_accessor :in_def attr :localvars_stack attr :offset_adjust attr_writer :pending_here_bodies + attr :rubyversion #----------------------------------- def set_last_token(tok) @last_operative_token=@last_token_maybe_implicit=tok end @@ -359,11 +439,11 @@ def special_global #handle $-a and friends assert prevchar=='$' result = (( #order matters here, but it shouldn't #(but til_charset must be last) - eat_if(/-[a-z0-9_]/i,2) or + eat_if(/-#@@LETTER_DIGIT/o,2) or eat_next_if(/[!@&+`'=~\-\/\\,.;<>*"$?:]/) or (?0..?9)===nextchar ? til_charset(/[^\d]/) : nil )) end @@ -374,34 +454,38 @@ #skip keyword processing if 'escaped' as it were, by def, . or :: #or if in a non-bare context #just asserts because those contexts are never encountered. #control goes through symbol(<...>,nil) - assert( /^[a-z_]$/i===context) + assert( /^#@@LETTER$/o===context) assert MethNameToken===@last_operative_token || !(@last_operative_token===/^(\.|::|(un)?def|alias)$/) - @moretokens.unshift(*parse_keywords(str,oldpos) do |tok| - #if not a keyword, - case str - when FUNCLIKE_KEYWORDS; except=tok - when VARLIKE_KEYWORDS,RUBYKEYWORDS; raise "shouldnt see keywords here, now" - end - was_last=@last_operative_token - @last_operative_token=tok if tok - normally=safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) } - (Array===normally ? normally[0]=except : normally=except) if except - normally - end) + if @parsestack.last.wantarrow and @rubyversion>=1.9 and @file.skip ":" + @moretokens.push SymbolToken.new(str,oldpos), KeywordToken.new("=>",input_position-1) + else + @moretokens.unshift(*parse_keywords(str,oldpos) do |tok| + #if not a keyword, decide if it should be var or method + case str + when FUNCLIKE_KEYWORDS; except=tok + when VARLIKE_KEYWORDS,RUBYKEYWORDS; raise "shouldnt see keywords here, now" + end + was_last=@last_operative_token + @last_operative_token=tok if tok + normally=safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) } + (Array===normally ? normally[0]=except : normally=except) if except + normally + end) + end return @moretokens.shift end #----------------------------------- IDENTREX={} def identifier_as_string(context) #must begin w/ letter or underscore #char class needs changing here for utf8 support - /[_a-z]/i===nextchar.chr or return + /#@@LETTER/o===nextchar.chr or return #equals, question mark, and exclamation mark #might be allowed at the end in some contexts. #(in def headers and symbols) #otherwise, =,?, and ! are to be considered @@ -416,11 +500,11 @@ # when ?: then "!(?![=])|\\?|=(?![=~>])" else "!(?![=])|\\?" end @in_def_name||context==?: and trailers<<"|=(?![=~>])" - @file.scan(IDENTREX[trailers]||=/^(?>[_a-z][a-z0-9_]*(?:#{trailers})?)/i) + @file.scan(IDENTREX[trailers]||=/^(?>#@@LETTER#@@LETTER_DIGIT*(?:#{trailers})?)/) end #----------------------------------- #contexts in which comma may appear in ruby: #multiple lhs (terminated by assign op) @@ -445,23 +529,23 @@ #a comma has been seen. are we in an #lvalue list or some other construct that uses commas? def comma_in_lvalue_list? @parsestack.last.lhs= case l=@parsestack.last - when ListContext: - when DefContext: l.in_body + when ListContext; + when DefContext; l.in_body else true end end #----------------------------------- def in_lvar_define_state lasttok=@last_operative_token #@defining_lvar is a hack @defining_lvar or case ctx=@parsestack.last #when ForSMContext; ctx.state==:for when RescueSMContext - lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then[^a-zA-Z0-9_])/m ) + lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then(?!#@@LETTER_DIGIT))/om ) #when BlockParamListLhsContext; true end end IMPLICIT_PARENS_BEFORE_ACCESSOR_ASSIGNMENT=2 @@ -485,17 +569,17 @@ assert String===name was_in_lvar_define_state=in_lvar_define_state(lasttok) #maybe_local really means 'maybe local or constant' maybe_local=case name - when /[^a-z_0-9]$/i #do nothing - when /^[a-z_]/ + when /(?!#@@LETTER_DIGIT).$/o #do nothing + when /^#@@LCLETTER/o (localvars===name or VARLIKE_KEYWORDS===name or was_in_lvar_define_state ) and not lasttok===/^(\.|::)$/ - when /^[A-Z]/ + when /^#@@UCLETTER/o is_const=true not lasttok==='.' #this is the right algorithm for constants... end assert(@moretokens.empty?) @@ -507,11 +591,11 @@ oldpos= input_position sawnl=false result=ws_toks=ignored_tokens(true) {|nl| sawnl=true } if sawnl || eof? if was_in_lvar_define_state - if /^[a-z_][a-zA-Z_0-9]*$/===name + if /^#@@LCLETTER#@@LETTER_DIGIT*$/o===name assert !(lasttok===/^(\.|::)$/) localvars[name]=true end return result.unshift(tok) elsif maybe_local @@ -529,11 +613,11 @@ #then omit implicit parens assignment_coming=case nc=nextchar when ?=; not /^=[>=~]$/===readahead(2) when ?,; comma_in_lvalue_list? when ?); last_context_not_implicit.lhs - when ?i; /^in[^a-zA-Z_0-9]/===readahead(3) and + when ?i; /^in(?!#@@LETTER_DIGIT)/o===readahead(3) and ForSMContext===last_context_not_implicit when ?>,?<; /^(.)\1=$/===readahead(3) when ?*,?&; /^(.)\1?=/===readahead(3) when ?|; /^\|\|?=/===readahead(3) or #is it a goalpost? @@ -541,12 +625,12 @@ readahead(2)[1] != ?| when ?%,?/,?-,?+,?^; readahead(2)[1]== ?= end if (assignment_coming && !(lasttok===/^(\.|::)$/) or was_in_lvar_define_state) tok=assign_lvar_type! VarNameToken.new(name,pos) - if /[^a-z_0-9]$/i===name - elsif /^[a-z_]/===name and !(lasttok===/^(\.|::)$/) + if /(?!#@@LETTER_DIGIT).$/o===name + elsif /^#@@LCLETTER/o===name and !(lasttok===/^(\.|::)$/) localvars[name]=true end return result.unshift(tok) end @@ -557,11 +641,11 @@ else case nc when nil: 2 when ?!; /^![=~]$/===readahead(2) ? 2 : 1 when ?d; - if /^do([^a-zA-Z0-9_]|$)/===readahead(3) + if /^do((?!#@@LETTER_DIGIT)|$)/o===readahead(3) if maybe_local and expecting_do? ty=VarNameToken 0 else maybe_local=false @@ -570,11 +654,11 @@ else 1 end when NEVERSTARTPARAMLISTFIRST (NEVERSTARTPARAMLISTWORDS===readahead(NEVERSTARTPARAMLISTMAXLEN)) ? 2 : 1 - when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~; 1 #" + when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~,NONASCII; 1 #" when ?{ maybe_local=false 1 =begin x=2 @@ -631,14 +715,16 @@ if /^:(?:[#{WHSPLF}]|(:))$/o===next2 then $1 && !ws_toks.empty? ? 3 : 2 else 3 end - when ??; next3=readahead(3); - /^\?([#{WHSPLF}]|[a-z_][a-z_0-9])/io===next3 ? 2 : 3 + when ??; next3=readahead(3) + #? never begins a char constant if immediately followed + #by 2 or more letters or digits + /^\?([#{WHSPLF}]|#@@LETTER_DIGIT{2})/o===next3 ? 2 : 3 # when ?:,??; (readahead(2)[/^.[#{WHSPLF}]/o]) ? 2 : 3 - when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?["'`a-zA-Z_0-9]/]) ? 3 : 2 + when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?(?:["'`]|#@@LETTER_DIGIT)/o]) ? 3 : 2 when ?[; if ws_toks.empty? (KeywordToken===oldlast and /^(return|break|next)$/===oldlast.ident) ? 3 : 2 else 3 @@ -705,11 +791,11 @@ result << tok if @parsestack.size==basesize break false elsif ','==tok.to_s and @parsestack.size==basesize+1 break true - elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.unary and @parsestack.size==basesize+1 + elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.tag and @parsestack.size==basesize+1 break true elsif EoiToken===tok lexerror tok, "unexpected eof in parameter list" end } @@ -888,11 +974,11 @@ fail if all.empty? @moretokens.concat divide_ws(ws,offset) if ws @moretokens.push KeywordToken.new('::',offset+md.end(0)-2) if dc loop do offset=input_position - @file.scan(/\A(#@@WSTOKS)?([A-Z][a-zA-Z_0-9]*)(::)?/o) + @file.scan(/\A(#@@WSTOKS)?(#@@UCLETTER#@@LETTER_DIGIT*)(::)?/o) #this regexp---^ will need to change in order to support utf8 properly. md=@file.last_match all,ws,name,dc=*md if ws @moretokens.concat divide_ws(ws,offset) @@ -1011,15 +1097,15 @@ name=tok.to_s assert !in_lvar_define_state #maybe_local really means 'maybe local or constant' maybe_local=case name - when /[^a-z_0-9]$/i; #do nothing + when /(?!#@@LETTER_DIGIT).$/o; #do nothing when /^[@$]/; true when VARLIKE_KEYWORDS,FUNCLIKE_KEYWORDS; ty=KeywordToken - when /^[a-z_]/; localvars===name - when /^[A-Z]/; is_const=true #this is the right algorithm for constants... + when /^#@@LCLETTER/o; localvars===name + when /^#@@UCLETTER/o; is_const=true #this is the right algorithm for constants... end result.push( *ignored_tokens(false,false) ) nc=nextchar if !ty and maybe_local if nc==?: || nc==?. @@ -1057,11 +1143,11 @@ @in_def_name=true while true #look for start of parameter list nc=(@moretokens.empty? ? nextchar.chr : @moretokens.first.to_s[0,1]) - if state==:expect_op and /^[a-z_(&*]/i===nc + if state==:expect_op and /^(?:#@@LETTER|[(&*])/o===nc ctx.state=:def_param_list list,listend=def_param_list result.concat list end_index=result.index(listend) ofs=listend.offset @@ -1078,11 +1164,11 @@ result<< tok case tok when EoiToken lexerror tok,'unexpected eof in def header' when StillIgnoreToken - when MethNameToken ,VarNameToken # /^[a-z_]/i.token_pat + when MethNameToken ,VarNameToken # /^#@@LETTER/o.token_pat lexerror tok,'expected . or ::' unless state==:expect_name state=:expect_op when /^(\.|::)$/.token_pat lexerror tok,'expected ident' unless state==:expect_op if endofs @@ -1414,11 +1500,11 @@ #next token is a local var name #(or the one after that if unary ops present) #result.concat ignored_tokens if expect_name case tok - when IgnoreToken #, /^[A-Z]/ #do nothing + when IgnoreToken #, /^#@@UCLETTER/o #do nothing when /^,$/.token_pat #hack when VarNameToken assert@defining_lvar @defining_lvar=false @@ -1496,16 +1582,24 @@ (@last_operative_token===/^(return|next|break)$/ and KeywordToken===@last_operative_token) result=quadriop(ch) if want_unary #readahead(2)[1..1][/[\s\v#\\]/] or #not needed? assert OperatorToken===result - result.unary=true #result should distinguish unary+binary *& + result.tag=:unary #result should distinguish unary+binary *& WHSPLF[nextchar.chr] or @moretokens << NoWsToken.new(input_position) - comma_in_lvalue_list? + cill=comma_in_lvalue_list? if ch=='*' @parsestack.last.see self, :splat + case @parsestack[-1] + when AssignmentRhsContext; result.tag= :rhs + when ParamListContext,ParamListContextNoParen; #:call + when ListImmedContext; #:array + when BlockParamListLhsContext; #:block + when KnownNestedLhsParenContext; #:nested + else result.tag= :lhs if cill + end end end result end @@ -1551,14 +1645,14 @@ def is_var_name? (tok=@last_operative_token) s=tok.to_s case s - when /[^a-z_0-9]$/i; false -# when /^[a-z_]/; localvars===s or VARLIKE_KEYWORDS===s - when /^[A-Z_]/i; VarNameToken===tok when /^[@$<]/; true + when /(?!#@@LETTER_DIGIT).$/o; false +# when /^#@@LCLETTER/o; localvars===s or VARLIKE_KEYWORDS===s + when /^#@@LETTER/o; VarNameToken===tok else raise "not var or method name: #{s}" end end #----------------------------------- @@ -1571,11 +1665,11 @@ not is_var_name? and if ch==':' not TernaryContext===@parsestack.last else - !readahead(3)[/^\?[a-z0-9_]{2}/i] + !readahead(3)[/^\?#@@LETTER_DIGIT{2}/o] end } end #----------------------------------- @@ -1601,25 +1695,29 @@ #cancel implicit contexts... @moretokens.push(*abort_noparens!(':')) @moretokens.push tok=KeywordToken.new(':',startpos) case @parsestack.last - when TernaryContext: + when TernaryContext tok.ternary=true @parsestack.pop #should be in the context's see handler - when ExpectDoOrNlContext: #should be in the context's see handler - @parsestack.pop - assert @parsestack.last.starter[/^(while|until|for)$/] + when ExpectDoOrNlContext #should be in the context's see handler + if @rubyversion<1.9 + @parsestack.pop + assert @parsestack.last.starter[/^(while|until|for)$/] + tok.as=";" + end + when ExpectThenOrNlContext,WhenParamListContext + if @rubyversion<1.9 + #should be in the context's see handler + @parsestack.pop + tok.as="then" + end + when RescueSMContext tok.as=";" - when ExpectThenOrNlContext,WhenParamListContext: - #should be in the context's see handler - @parsestack.pop - tok.as="then" - when RescueSMContext: - tok.as=";" - else fail ": not expected in #{@parsestack.last.class}->#{@parsestack.last.starter}" - end + end or + fail ": not expected in #{@parsestack.last.class}->#{@parsestack.last.starter}" #end ternary context, if any @parsestack.last.see self,:colon return @moretokens.shift @@ -1629,11 +1727,11 @@ colon2=KeywordToken.new( '::',startpos) lasttok=@last_operative_token assert !(String===lasttok) if (VarNameToken===lasttok or MethNameToken===lasttok) and - lasttok===/^[$@a-zA-Z_]/ and !WHSPCHARS[lastchar] + lasttok===/^(?:[$@]|#@@LETTER)/o and !WHSPCHARS[lastchar] then @moretokens << colon2 result= NoWsToken.new(startpos) else result=colon2 @@ -1662,16 +1760,16 @@ open=":'"; close="'" single_quote("'") when ?` then read(1) #` when ?@ then at_identifier.to_s when ?$ then dollar_identifier.to_s - when ?_,?a..?z then identifier_as_string(?:) + when ?_,?a..?z,NONASCII then identifier_as_string(?:) when ?A..?Z then result=identifier_as_string(?:) if @last_operative_token==='::' assert klass==MethNameToken - /[A-Z_0-9]$/i===result and klass=VarNameToken + /#@@LETTER_DIGIT$/o===result and klass=VarNameToken end result else error= "unexpected char starting symbol: #{nc.chr}" '_' @@ -1694,11 +1792,11 @@ #look for operators opmatches=readahead(3)[RUBYSYMOPERATORREX] return [opmatches ? read(opmatches.size) : case nc=nextchar when ?` then read(1) #` - when ?_,?a..?z,?A..?Z then + when ?_,?a..?z,?A..?Z,NONASCII then context=merge_assignment_op_in_setter_callsites? ? ?: : nc identifier_as_string(context) else set_last_token KeywordToken.new(';') lexerror(tok_to_errify,"unexpected char starting callsite symbol: #{nc.chr}, tok=#{tok_to_errify.inspect}") @@ -1718,11 +1816,11 @@ (quote==getchar) or return lexerror(HerePlaceholderToken.new( dash, quote, ender ), "mismatched quotes in here doc") quote_real=true else quote='"' - ender=til_charset(/[^a-zA-Z0-9_]/) + ender=@file.scan(/#@@LETTER_DIGIT+/o) ender.length >= 1 or return lexerror(HerePlaceholderToken.new( dash, quote, ender, nil ), "invalid here header") end res= HerePlaceholderToken.new( dash, quote, ender, quote_real ) @@ -1737,19 +1835,22 @@ #actually delete procrastinated from input @file.delete(input_position_raw-procrastinated.size...input_position_raw) nl=readnl or return lexerror(res, "here header without body (at eof)") + res.string.startline=linenum @moretokens<< res bodystart=input_position @offset_adjust = @min_offset_adjust+procrastinated.size #was: @offset_adjust += procrastinated.size body=here_body(res) res.close=body.close @offset_adjust = @min_offset_adjust #was: @offset_adjust -= procrastinated.size bodysize=input_position-bodystart + res.string.line=linenum-1 + lexerror res,res.string.error #one or two already read characters are overwritten here, #in order to keep offsets correct in the long term #(at present, offsets and line numbers between #here header and its body will be wrong. but they should re-sync thereafter.) @@ -1812,11 +1913,11 @@ end #----------------------------------- def lessthan(ch) #match quadriop('<') or here doc or spaceship op case readahead(3) - when /^<<['"`\-a-z0-9_]$/i #' + when /^<<(?:['"`\-]|#@@LETTER_DIGIT)$/o #' if quote_expected?(ch) and not @last_operative_token==='class' here_header else operator_or_methname_token read(2) end @@ -1899,11 +2000,15 @@ #here body terminator? oldpos= input_position_raw if tofill.dash close+=til_charset(/[^#{WHSP}]/o) end - break if eof? #this is an error, should be handled better + if eof? #this is an error, should be handled better + lexerror tofill, "unterminated here body" + lexerror tofill.string, "unterminated here body" + break + end if read(tofill.ender.size)==tofill.ender crs=til_charset(/[^\r]/)||'' if nl=readnl close+=tofill.ender+crs+nl break @@ -1915,10 +2020,12 @@ if tofill.quote=="'" line=til_charset(/[\n]/) unless nl=readnl assert eof? + lexerror tofill, "unterminated here body" + lexerror tofill.string, "unterminated here body" break #this is an error, should be handled better end line.chomp!("\r") line<< "\n" assert("\n"==prevchar) @@ -2116,11 +2223,11 @@ #----------------------------------- #used to resolve the ambiguity of # unary ops (+, -, *, &, ~ !) in ruby #returns whether current token is to be the start of a literal - IDBEGINCHAR=/^[a-zA-Z_$@]/ + IDBEGINCHAR=/^(?:#@@LETTER|[$@])/o def unary_op_expected?(ch) #yukko hack '*&='[readahead(2)[1..1]] and return false return true if KeywordToken===@last_operative_token and @last_operative_token==='for' @@ -2137,12 +2244,12 @@ # <<, %, ? in ruby #returns whether current token is to be the start of a literal def quote_expected?(ch) #yukko hack case ch[0] when ?? then readahead(2)[/^\?[#{WHSPLF}]$/o] #not needed? - when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx][a-zA-Z0-9])/] - when ?< then !readahead(4)[/^<<-?['"`a-z0-9_]/i] + when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx]#{@@LETTER_DIGIT.gsub('_','')})/o] + when ?< then !readahead(4)[/^<<-?(?:['"`]|#@@LETTER_DIGIT)/o] else raise 'unexpected ch (#{ch}) in quote_expected?' # when ?+,?-,?&,?*,?~,?! then '*&='[readahead(2)[1..1]] end and return false after_nonid_op? { @@ -2320,29 +2427,38 @@ assert str=='=' c=(eat_next_if(/[~=>]/)or'') str << c result= operator_or_methname_token( str,offset) case c - when '=': #===,== + when '=' #===,== str<< (eat_next_if(?=)or'') - when '>': #=> + when '>' #=> unless ParamListContextNoParen===@parsestack.last @moretokens.unshift result @moretokens.unshift( *abort_noparens!("=>")) result=@moretokens.shift end @parsestack.last.see self,:arrow - when '': #plain assignment: record local variable definitions + when '~' # =~... after regex, maybe? + last=last_operative_token + + if @rubyversion>=1.9 and StringToken===last and last.lvars + #ruby delays adding lvars from regexps to known lvars table + #for several tokens in some cases. not sure why or if on purpose + #i'm just going to add them right away + localvars.concat last.lvars + end + when '' #plain assignment: record local variable definitions last_context_not_implicit.lhs=false @moretokens.push( *ignored_tokens(true).map{|x| NewlineToken===x ? EscNlToken.new(@filename,@linenum,x.ident,x.offset) : x } ) @parsestack.push AssignmentRhsContext.new(@linenum) if eat_next_if ?* tok=OperatorToken.new('*', input_position-1) - tok.unary=true + tok.tag=:unary @moretokens.push tok WHSPLF[nextchar.chr] or @moretokens << NoWsToken.new(input_position) comma_in_lvalue_list? #is this needed? end @@ -2448,27 +2564,38 @@ #but control never comes this way in those cases... goes #to custom parsers for alias, undef, and def in #parse_keywords tokch.set_infix! unless after_nonid_op?{WHSPLF[lastchar]} @parsestack.push ListImmedContext.new(ch,@linenum) lasttok=last_operative_token - #could be: lasttok===/^[a-z_]/i - if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or MethNameToken===lasttok) and !WHSPCHARS[lastchar] + #could be: lasttok===/^#@@LETTER/o + if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or + MethNameToken===lasttok or lasttok===FUNCLIKE_KEYWORDS) and !WHSPCHARS[lastchar] @moretokens << (tokch) tokch= NoWsToken.new(input_position-1) end when '(' lasttok=last_token_maybe_implicit #last_operative_token - #could be: lasttok===/^[a-z_]/i + #could be: lasttok===/^#@@LETTER/o if (VarNameToken===lasttok or MethNameToken===lasttok or lasttok===FUNCLIKE_KEYWORDS) unless WHSPCHARS[lastchar] @moretokens << tokch tokch= NoWsToken.new(input_position-1) end @parsestack.push ParamListContext.new(@linenum) else - @parsestack.push ParenContext.new(@linenum) + ctx=@parsestack.last + lasttok=last_operative_token + maybe_def=DefContext===ctx && !ctx.in_body && + !(KeywordToken===lasttok && lasttok.ident=="def") + if maybe_def or + BlockParamListLhsContext===ctx or + ParenContext===ctx && ctx.lhs + @parsestack.push KnownNestedLhsParenContext.new(@linenum) + else + @parsestack.push ParenContext.new(@linenum) + end end when '{' #check if we are in a hash literal or string inclusion (#{}), #in which case below would be bad. @@ -2572,16 +2699,17 @@ (RescueSMContext===@parsestack[-2] && @parsestack[-2].state==:rescue) || (DefContext===@parsestack[-2] && !@parsestack[-2].in_body) @parsestack.pop @moretokens.unshift AssignmentRhsListEndToken.new(input_position) end - token.comma_type= case @parsestack[-1] - when AssignmentRhsContext; :rhs - when ParamListContext,ParamListContextNoParen; :call - when ListImmedContext; :array + when AssignmentRhsContext; token.tag=:rhs + when ParamListContext,ParamListContextNoParen; #:call + when ListImmedContext; #:array + when BlockParamListLhsContext; #:block + when KnownNestedLhsParenContext; #:nested else - :lhs if comma_in_lvalue_list? + token.tag=:lhs if comma_in_lvalue_list? end @parsestack.last.see self,:comma return @moretokens.shift end