lib/rubylexer.rb in rubylexer-0.7.3 vs lib/rubylexer.rb in rubylexer-0.7.4
- old
+ new
@@ -1,6 +1,6 @@
-=begin legal crap
+=begin
rubylexer - a ruby lexer written in ruby
Copyright (C) 2004,2005,2008 Caleb Clausen
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -58,22 +58,24 @@
VARLIKE_KEYWORDS=/^(#{VARLIKE_KEYWORDLIST.join '|'})$/
INNERBOUNDINGWORDLIST=%w"else elsif ensure in then rescue when"
INNERBOUNDINGWORDS="(#{INNERBOUNDINGWORDLIST.join '|'})"
BINOPWORDLIST=%w"and or"
BINOPWORDS="(#{BINOPWORDLIST.join '|'})"
- NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)([^a-zA-Z0-9_!?=]|\Z)/o
- NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu'] #chars that begin NEVERSTARTPARAMLIST
- NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST
RUBYKEYWORDS=%r{
^(alias|#{BINOPWORDS}|defined\?|not|undef|end|
#{VARLIKE_KEYWORDS}|#{FUNCLIKE_KEYWORDS}|
#{INNERBOUNDINGWORDS}|#{BEGINWORDS}
)$
}xo
#__END__ should not be in this set... it's handled in start_of_line_directives
#range running from ?\x80 through ?\xFF, i.e. the chars with the high bit set
+ HIGHASCII=?\x80..?\xFF
#chars considered non-ascii; currently just another name for HIGHASCII.
#CHARMAPPINGS maps this range to :identifier.
+ NONASCII=HIGHASCII
+ #NONASCII=?\x80..?xFFFFFFFF #or is it 10FFFF, whatever the highest conceivable code point
+
+
CHARMAPPINGS = {
?$ => :dollar_identifier,
?@ => :at_identifier,
?a..?z => :identifier,
?A..?Z => :identifier,
@@ -113,18 +115,47 @@
"[({" => :open_brace,
"])}" => :close_brace,
- ?# => :comment
+ ?# => :comment,
+
+ NONASCII => :identifier,
}
attr_reader :incomplete_here_tokens, :parsestack, :last_token_maybe_implicit
+ UCLETTER=@@UCLETTER="[A-Z]"
+ #cheaters way, treats utf chars as always 1 byte wide
+ #all high-bit chars are lowercase letters
+ #works, but strings compare with strict binary identity, not unicode collation
+ #works for euc too, I think
+ #(the ruby spec for utf8 support permits this interpretation)
+ LCLETTER=@@LCLETTER="[a-z_\x80-\xFF]"
+ LETTER=@@LETTER="[A-Za-z_\x80-\xFF]"
+ LETTER_DIGIT=@@LETTER_DIGIT="[A-Za-z_0-9\x80-\xFF]"
+ eval %w[UCLETTER LCLETTER LETTER LETTER_DIGIT].map{|n| "
+ def #{n}; #{n}; end
+ def self.#{n}; @@#{n}; end
+ "
+ }.to_s
+
#Keywords that are never the start of a parameter list. The pattern is
#tried against readahead(NEVERSTARTPARAMLISTMAXLEN) when the next char is
#one of NEVERSTARTPARAMLISTFIRST. The pattern interpolates @@LETTER_DIGIT
#(so the keyword must not be followed by an identifier char), which is why
#these definitions have to come after @@LETTER_DIGIT is assigned.
+ NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)((?:(?!#@@LETTER_DIGIT).)|\Z)/om
+ NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu'] #chars that begin NEVERSTARTPARAMLIST
+ NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST
+
+=begin
+ require 'jcode'
+ utf8=String::PATTERN_UTF8 #or euc, or sjis...
+ LCLETTER_U="(?>[a-z_]|#{utf8})"
+ LETTER_U="(?>[A-Za-z_]|#{utf8})"
+ IDENTCHAR_U="(?>[A-Za-z_0-9]|#{utf8})"
+=end
+
#-----------------------------------
- def initialize(filename,file,linenum=1,offset_adjust=0)
+ def initialize(filename,file,linenum=1,offset_adjust=0,options={:rubyversion=>1.8})
@offset_adjust=0 #set again in next line
super(filename,file, linenum,offset_adjust)
@start_linenum=linenum
@parsestack=[TopLevelContext.new]
@incomplete_here_tokens=[] #not used anymore
@@ -135,17 +166,65 @@
@last_operative_token=nil
@last_token_maybe_implicit=nil
@enable_macro=nil
@base_file=nil
@progress_thread=nil
+ @rubyversion=options[:rubyversion]
+ @encoding=options[:encoding]||:detect
@toptable=CharHandler.new(self, :illegal_char, CHARMAPPINGS)
+ read_leading_encoding
start_of_line_directives
progress_printer
end
#Alternate (lowercase) spellings of encoding names, mapped to the short
#names used internally. read_leading_encoding looks a downcased name up
#here before checking it against ENCODINGS.
+ ENCODING_ALIASES={
+ 'utf-8'=>'utf8',
+
+ 'ascii-8bit'=>'binary',
+ 'ascii-7bit'=>'ascii',
+ 'euc-jp'=>'euc',
+
+ 'ascii8bit'=>'binary',
+ 'ascii7bit'=>'ascii',
+ 'eucjp'=>'euc',
+
+ 'us-ascii'=>'ascii',
+ 'shift-jis'=>'sjis',
+
#'detect' is not listed in ENCODINGS, so read_leading_encoding will not
#store it as the encoding
+ 'autodetect'=>'detect',
+ }
#the encoding names read_leading_encoding accepts (stored as symbols in @encoding)
+ ENCODINGS=%w[ascii binary utf8 euc sjis]
#Works out @encoding from the very start of the input, but only when
#@encoding is still :detect; otherwise returns immediately.
#Starts from :ascii, then in order:
# * a utf8 byte order mark selects :utf8
# * a -Ku, -Ke or -Ks switch on a leading #! line selects :utf8, :euc, :sjis
# * when @rubyversion>=1.9, a comment line containing coding: or encoding:
#   (or with = instead of :) selects the named encoding, provided that the
#   downcased name, after ENCODING_ALIASES lookup, is listed in ENCODINGS
#Whatever @file.skip / til_charset match here is presumably consumed from
#the input -- confirm against those helpers.
#NOTE(review): captures are read through $1 here, while other code in this
#file uses @file.last_match; confirm @file.skip really sets $1 for the caller.
+ def read_leading_encoding
+ return unless @encoding==:detect
+ @encoding=:ascii
+ @encoding=:utf8 if @file.skip( /\xEF\xBB\xBF/ ) #bom
+ if @file.skip( /\A#!/ )
#walk over the words of the #! line looking for -K switches.
#NOTE(review): the only exit from this loop is the break below (taken when
#a blank is followed by a non-dash char or by a lone --); verify that it
#cannot spin forever on a #! line that has no such argument.
+ loop do
+ til_charset( /[\s\v]/ )
+ break if @file.skip( / ([^-\s\v]|--[\s\v])/,4 )
+ if @file.skip( /.-K(.)/ )
+ case $1
+ when 'u'; @encoding=:utf8
+ when 'e'; @encoding=:euc
+ when 's'; @encoding=:sjis
+ end
+ end
+ end
#presumably moves to the end of the #! line
+ til_charset( /[\n]/ )
+ end
+ if @rubyversion>=1.9 and @file.skip(
+ /\A#[\x00-\x7F]*?(?:en)?coding[\s\v]*[:=][\s\v]*([a-z0-9_-]+)[\x00-\x7F]*\n/i
+ )
+ name=$1
+ name.downcase!
#translate an alias to its short name; names without an alias are kept as is
+ name=ENCODING_ALIASES[name] if ENCODING_ALIASES[name]
#names not listed in ENCODINGS are ignored, leaving @encoding as set above
+ @encoding=name.to_sym if ENCODINGS.include? name
+ end
+ end
+
def progress_printer
return unless ENV['RL_PROGRESS']
$stderr.puts 'printing progresses'
@progress_thread=Thread.new do
until EoiToken===@last_operative_token
@@ -161,10 +240,11 @@
attr_accessor :in_def
attr :localvars_stack
attr :offset_adjust
attr_writer :pending_here_bodies
+ attr :rubyversion
#-----------------------------------
#Remembers tok as the most recent token, both in @last_token_maybe_implicit
#and in @last_operative_token. Returns tok.
def set_last_token(tok)
  @last_token_maybe_implicit=tok
  @last_operative_token=tok
end
@@ -359,11 +439,11 @@
#Lexes what follows the $ of a special global variable; the $ itself is
#asserted to be the previous char. Three alternatives are tried in order:
# * a - followed by a single letter/digit (2 chars in all)
# * a single punctuation char out of the set listed below
# * if the next char is a decimal digit, whatever til_charset(/[^\d]/)
#   yields (presumably the run of digits)
#The value of the first alternative that succeeds is stored in result and
#returned; nil when the last alternative is reached and the next char is no digit.
def special_global #handle $-a and friends
assert prevchar=='$'
result = ((
#order matters here, but it shouldn't
#(but til_charset must be last)
- eat_if(/-[a-z0-9_]/i,2) or
+ eat_if(/-#@@LETTER_DIGIT/o,2) or
eat_next_if(/[!@&+`'=~\-\/\\,.;<>*"$?:]/) or
(?0..?9)===nextchar ? til_charset(/[^\d]/) : nil
))
end
@@ -374,34 +454,38 @@
#skip keyword processing if 'escaped' as it were, by def, . or ::
#or if in a non-bare context
#just asserts because those contexts are never encountered.
#control goes through symbol(<...>,nil)
- assert( /^[a-z_]$/i===context)
+ assert( /^#@@LETTER$/o===context)
assert MethNameToken===@last_operative_token || !(@last_operative_token===/^(\.|::|(un)?def|alias)$/)
- @moretokens.unshift(*parse_keywords(str,oldpos) do |tok|
- #if not a keyword,
- case str
- when FUNCLIKE_KEYWORDS; except=tok
- when VARLIKE_KEYWORDS,RUBYKEYWORDS; raise "shouldnt see keywords here, now"
- end
- was_last=@last_operative_token
- @last_operative_token=tok if tok
- normally=safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) }
- (Array===normally ? normally[0]=except : normally=except) if except
- normally
- end)
+ if @parsestack.last.wantarrow and @rubyversion>=1.9 and @file.skip ":"
+ @moretokens.push SymbolToken.new(str,oldpos), KeywordToken.new("=>",input_position-1)
+ else
+ @moretokens.unshift(*parse_keywords(str,oldpos) do |tok|
+ #if not a keyword, decide if it should be var or method
+ case str
+ when FUNCLIKE_KEYWORDS; except=tok
+ when VARLIKE_KEYWORDS,RUBYKEYWORDS; raise "shouldnt see keywords here, now"
+ end
+ was_last=@last_operative_token
+ @last_operative_token=tok if tok
+ normally=safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) }
+ (Array===normally ? normally[0]=except : normally=except) if except
+ normally
+ end)
+ end
return @moretokens.shift
end
#-----------------------------------
IDENTREX={}
def identifier_as_string(context)
#must begin w/ letter or underscore
#char class needs changing here for utf8 support
- /[_a-z]/i===nextchar.chr or return
+ /#@@LETTER/o===nextchar.chr or return
#equals, question mark, and exclamation mark
#might be allowed at the end in some contexts.
#(in def headers and symbols)
#otherwise, =,?, and ! are to be considered
@@ -416,11 +500,11 @@
# when ?: then "!(?![=])|\\?|=(?![=~>])"
else "!(?![=])|\\?"
end
@in_def_name||context==?: and trailers<<"|=(?![=~>])"
- @file.scan(IDENTREX[trailers]||=/^(?>[_a-z][a-z0-9_]*(?:#{trailers})?)/i)
+ @file.scan(IDENTREX[trailers]||=/^(?>#@@LETTER#@@LETTER_DIGIT*(?:#{trailers})?)/)
end
#-----------------------------------
#contexts in which comma may appear in ruby:
#multiple lhs (terminated by assign op)
@@ -445,23 +529,23 @@
#A comma has been seen. Are we in an lvalue list, or in some other
#construct that uses commas?
#Stores the answer in the lhs attribute of the innermost parse context and
#returns it:
# * ListContext: nil (the when clause has no body)
# * DefContext: the context's in_body value
# * any other context: true
def comma_in_lvalue_list?
@parsestack.last.lhs=
case l=@parsestack.last
- when ListContext:
- when DefContext: l.in_body
+ when ListContext;
+ when DefContext; l.in_body
else true
end
end
#-----------------------------------
#Tells whether the lexer is at a point where a name should be taken as the
#definition of a local variable.
#lasttok:: the token to test; defaults to @last_operative_token
#Returns @defining_lvar if that is set. Otherwise the result depends on the
#innermost parse context: for a RescueSMContext it is truthy when lasttok's
#ident is "=>" and the upcoming input, after optional whitespace, is one of
#: ; # newline, or the word then not followed by an identifier char.
#For every other context the result is nil.
def in_lvar_define_state lasttok=@last_operative_token
#@defining_lvar is a hack
@defining_lvar or case ctx=@parsestack.last
#when ForSMContext; ctx.state==:for
when RescueSMContext
- lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then[^a-zA-Z0-9_])/m )
+ lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then(?!#@@LETTER_DIGIT))/om )
#when BlockParamListLhsContext; true
end
end
IMPLICIT_PARENS_BEFORE_ACCESSOR_ASSIGNMENT=2
@@ -485,17 +569,17 @@
assert String===name
was_in_lvar_define_state=in_lvar_define_state(lasttok)
#maybe_local really means 'maybe local or constant'
maybe_local=case name
- when /[^a-z_0-9]$/i #do nothing
- when /^[a-z_]/
+ when /(?!#@@LETTER_DIGIT).$/o #do nothing
+ when /^#@@LCLETTER/o
(localvars===name or
VARLIKE_KEYWORDS===name or
was_in_lvar_define_state
) and not lasttok===/^(\.|::)$/
- when /^[A-Z]/
+ when /^#@@UCLETTER/o
is_const=true
not lasttok==='.' #this is the right algorithm for constants...
end
assert(@moretokens.empty?)
@@ -507,11 +591,11 @@
oldpos= input_position
sawnl=false
result=ws_toks=ignored_tokens(true) {|nl| sawnl=true }
if sawnl || eof?
if was_in_lvar_define_state
- if /^[a-z_][a-zA-Z_0-9]*$/===name
+ if /^#@@LCLETTER#@@LETTER_DIGIT*$/o===name
assert !(lasttok===/^(\.|::)$/)
localvars[name]=true
end
return result.unshift(tok)
elsif maybe_local
@@ -529,11 +613,11 @@
#then omit implicit parens
assignment_coming=case nc=nextchar
when ?=; not /^=[>=~]$/===readahead(2)
when ?,; comma_in_lvalue_list?
when ?); last_context_not_implicit.lhs
- when ?i; /^in[^a-zA-Z_0-9]/===readahead(3) and
+ when ?i; /^in(?!#@@LETTER_DIGIT)/o===readahead(3) and
ForSMContext===last_context_not_implicit
when ?>,?<; /^(.)\1=$/===readahead(3)
when ?*,?&; /^(.)\1?=/===readahead(3)
when ?|; /^\|\|?=/===readahead(3) or
#is it a goalpost?
@@ -541,12 +625,12 @@
readahead(2)[1] != ?|
when ?%,?/,?-,?+,?^; readahead(2)[1]== ?=
end
if (assignment_coming && !(lasttok===/^(\.|::)$/) or was_in_lvar_define_state)
tok=assign_lvar_type! VarNameToken.new(name,pos)
- if /[^a-z_0-9]$/i===name
- elsif /^[a-z_]/===name and !(lasttok===/^(\.|::)$/)
+ if /(?!#@@LETTER_DIGIT).$/o===name
+ elsif /^#@@LCLETTER/o===name and !(lasttok===/^(\.|::)$/)
localvars[name]=true
end
return result.unshift(tok)
end
@@ -557,11 +641,11 @@
else
case nc
when nil: 2
when ?!; /^![=~]$/===readahead(2) ? 2 : 1
when ?d;
- if /^do([^a-zA-Z0-9_]|$)/===readahead(3)
+ if /^do((?!#@@LETTER_DIGIT)|$)/o===readahead(3)
if maybe_local and expecting_do?
ty=VarNameToken
0
else
maybe_local=false
@@ -570,11 +654,11 @@
else
1
end
when NEVERSTARTPARAMLISTFIRST
(NEVERSTARTPARAMLISTWORDS===readahead(NEVERSTARTPARAMLISTMAXLEN)) ? 2 : 1
- when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~; 1 #"
+ when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~,NONASCII; 1 #"
when ?{
maybe_local=false
1
=begin
x=2
@@ -631,14 +715,16 @@
if /^:(?:[#{WHSPLF}]|(:))$/o===next2 then
$1 && !ws_toks.empty? ? 3 : 2
else
3
end
- when ??; next3=readahead(3);
- /^\?([#{WHSPLF}]|[a-z_][a-z_0-9])/io===next3 ? 2 : 3
+ when ??; next3=readahead(3)
+ #? never begins a char constant if immediately followed
+ #by 2 or more letters or digits
+ /^\?([#{WHSPLF}]|#@@LETTER_DIGIT{2})/o===next3 ? 2 : 3
# when ?:,??; (readahead(2)[/^.[#{WHSPLF}]/o]) ? 2 : 3
- when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?["'`a-zA-Z_0-9]/]) ? 3 : 2
+ when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?(?:["'`]|#@@LETTER_DIGIT)/o]) ? 3 : 2
when ?[;
if ws_toks.empty?
(KeywordToken===oldlast and /^(return|break|next)$/===oldlast.ident) ? 3 : 2
else
3
@@ -705,11 +791,11 @@
result << tok
if @parsestack.size==basesize
break false
elsif ','==tok.to_s and @parsestack.size==basesize+1
break true
- elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.unary and @parsestack.size==basesize+1
+ elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.tag and @parsestack.size==basesize+1
break true
elsif EoiToken===tok
lexerror tok, "unexpected eof in parameter list"
end
}
@@ -888,11 +974,11 @@
fail if all.empty?
@moretokens.concat divide_ws(ws,offset) if ws
@moretokens.push KeywordToken.new('::',offset+md.end(0)-2) if dc
loop do
offset=input_position
- @file.scan(/\A(#@@WSTOKS)?([A-Z][a-zA-Z_0-9]*)(::)?/o)
+ @file.scan(/\A(#@@WSTOKS)?(#@@UCLETTER#@@LETTER_DIGIT*)(::)?/o)
#this regexp---^ will need to change in order to support utf8 properly.
md=@file.last_match
all,ws,name,dc=*md
if ws
@moretokens.concat divide_ws(ws,offset)
@@ -1011,15 +1097,15 @@
name=tok.to_s
assert !in_lvar_define_state
#maybe_local really means 'maybe local or constant'
maybe_local=case name
- when /[^a-z_0-9]$/i; #do nothing
+ when /(?!#@@LETTER_DIGIT).$/o; #do nothing
when /^[@$]/; true
when VARLIKE_KEYWORDS,FUNCLIKE_KEYWORDS; ty=KeywordToken
- when /^[a-z_]/; localvars===name
- when /^[A-Z]/; is_const=true #this is the right algorithm for constants...
+ when /^#@@LCLETTER/o; localvars===name
+ when /^#@@UCLETTER/o; is_const=true #this is the right algorithm for constants...
end
result.push( *ignored_tokens(false,false) )
nc=nextchar
if !ty and maybe_local
if nc==?: || nc==?.
@@ -1057,11 +1143,11 @@
@in_def_name=true
while true
#look for start of parameter list
nc=(@moretokens.empty? ? nextchar.chr : @moretokens.first.to_s[0,1])
- if state==:expect_op and /^[a-z_(&*]/i===nc
+ if state==:expect_op and /^(?:#@@LETTER|[(&*])/o===nc
ctx.state=:def_param_list
list,listend=def_param_list
result.concat list
end_index=result.index(listend)
ofs=listend.offset
@@ -1078,11 +1164,11 @@
result<< tok
case tok
when EoiToken
lexerror tok,'unexpected eof in def header'
when StillIgnoreToken
- when MethNameToken ,VarNameToken # /^[a-z_]/i.token_pat
+ when MethNameToken ,VarNameToken # /^#@@LETTER/o.token_pat
lexerror tok,'expected . or ::' unless state==:expect_name
state=:expect_op
when /^(\.|::)$/.token_pat
lexerror tok,'expected ident' unless state==:expect_op
if endofs
@@ -1414,11 +1500,11 @@
#next token is a local var name
#(or the one after that if unary ops present)
#result.concat ignored_tokens
if expect_name
case tok
- when IgnoreToken #, /^[A-Z]/ #do nothing
+ when IgnoreToken #, /^#@@UCLETTER/o #do nothing
when /^,$/.token_pat #hack
when VarNameToken
assert@defining_lvar
@defining_lvar=false
@@ -1496,16 +1582,24 @@
(@last_operative_token===/^(return|next|break)$/ and KeywordToken===@last_operative_token)
result=quadriop(ch)
if want_unary
#readahead(2)[1..1][/[\s\v#\\]/] or #not needed?
assert OperatorToken===result
- result.unary=true #result should distinguish unary+binary *&
+ result.tag=:unary #result should distinguish unary+binary *&
WHSPLF[nextchar.chr] or
@moretokens << NoWsToken.new(input_position)
- comma_in_lvalue_list?
+ cill=comma_in_lvalue_list?
if ch=='*'
@parsestack.last.see self, :splat
+ case @parsestack[-1]
+ when AssignmentRhsContext; result.tag= :rhs
+ when ParamListContext,ParamListContextNoParen; #:call
+ when ListImmedContext; #:array
+ when BlockParamListLhsContext; #:block
+ when KnownNestedLhsParenContext; #:nested
+ else result.tag= :lhs if cill
+ end
end
end
result
end
@@ -1551,14 +1645,14 @@
#Decides whether @last_operative_token names a variable, judging by the
#token's string form (in the new version the clauses are tried in this order):
# * starts with @, $ or <: true
# * last char is not a letter/digit: false
# * starts with a letter: whether the token is a VarNameToken
#Raises (a RuntimeError, with message "not var or method name: ...") when
#none of the clauses applies.
def is_var_name?
(tok=@last_operative_token)
s=tok.to_s
case s
- when /[^a-z_0-9]$/i; false
-# when /^[a-z_]/; localvars===s or VARLIKE_KEYWORDS===s
- when /^[A-Z_]/i; VarNameToken===tok
when /^[@$<]/; true
+ when /(?!#@@LETTER_DIGIT).$/o; false
+# when /^#@@LCLETTER/o; localvars===s or VARLIKE_KEYWORDS===s
+ when /^#@@LETTER/o; VarNameToken===tok
else raise "not var or method name: #{s}"
end
end
#-----------------------------------
@@ -1571,11 +1665,11 @@
not is_var_name? and
if ch==':'
not TernaryContext===@parsestack.last
else
- !readahead(3)[/^\?[a-z0-9_]{2}/i]
+ !readahead(3)[/^\?#@@LETTER_DIGIT{2}/o]
end
}
end
#-----------------------------------
@@ -1601,25 +1695,29 @@
#cancel implicit contexts...
@moretokens.push(*abort_noparens!(':'))
@moretokens.push tok=KeywordToken.new(':',startpos)
case @parsestack.last
- when TernaryContext:
+ when TernaryContext
tok.ternary=true
@parsestack.pop #should be in the context's see handler
- when ExpectDoOrNlContext: #should be in the context's see handler
- @parsestack.pop
- assert @parsestack.last.starter[/^(while|until|for)$/]
+ when ExpectDoOrNlContext #should be in the context's see handler
+ if @rubyversion<1.9
+ @parsestack.pop
+ assert @parsestack.last.starter[/^(while|until|for)$/]
+ tok.as=";"
+ end
+ when ExpectThenOrNlContext,WhenParamListContext
+ if @rubyversion<1.9
+ #should be in the context's see handler
+ @parsestack.pop
+ tok.as="then"
+ end
+ when RescueSMContext
tok.as=";"
- when ExpectThenOrNlContext,WhenParamListContext:
- #should be in the context's see handler
- @parsestack.pop
- tok.as="then"
- when RescueSMContext:
- tok.as=";"
- else fail ": not expected in #{@parsestack.last.class}->#{@parsestack.last.starter}"
- end
+ end or
+ fail ": not expected in #{@parsestack.last.class}->#{@parsestack.last.starter}"
#end ternary context, if any
@parsestack.last.see self,:colon
return @moretokens.shift
@@ -1629,11 +1727,11 @@
colon2=KeywordToken.new( '::',startpos)
lasttok=@last_operative_token
assert !(String===lasttok)
if (VarNameToken===lasttok or MethNameToken===lasttok) and
- lasttok===/^[$@a-zA-Z_]/ and !WHSPCHARS[lastchar]
+ lasttok===/^(?:[$@]|#@@LETTER)/o and !WHSPCHARS[lastchar]
then
@moretokens << colon2
result= NoWsToken.new(startpos)
else
result=colon2
@@ -1662,16 +1760,16 @@
open=":'"; close="'"
single_quote("'")
when ?` then read(1) #`
when ?@ then at_identifier.to_s
when ?$ then dollar_identifier.to_s
- when ?_,?a..?z then identifier_as_string(?:)
+ when ?_,?a..?z,NONASCII then identifier_as_string(?:)
when ?A..?Z then
result=identifier_as_string(?:)
if @last_operative_token==='::'
assert klass==MethNameToken
- /[A-Z_0-9]$/i===result and klass=VarNameToken
+ /#@@LETTER_DIGIT$/o===result and klass=VarNameToken
end
result
else
error= "unexpected char starting symbol: #{nc.chr}"
'_'
@@ -1694,11 +1792,11 @@
#look for operators
opmatches=readahead(3)[RUBYSYMOPERATORREX]
return [opmatches ? read(opmatches.size) :
case nc=nextchar
when ?` then read(1) #`
- when ?_,?a..?z,?A..?Z then
+ when ?_,?a..?z,?A..?Z,NONASCII then
context=merge_assignment_op_in_setter_callsites? ? ?: : nc
identifier_as_string(context)
else
set_last_token KeywordToken.new(';')
lexerror(tok_to_errify,"unexpected char starting callsite symbol: #{nc.chr}, tok=#{tok_to_errify.inspect}")
@@ -1718,11 +1816,11 @@
(quote==getchar) or
return lexerror(HerePlaceholderToken.new( dash, quote, ender ), "mismatched quotes in here doc")
quote_real=true
else
quote='"'
- ender=til_charset(/[^a-zA-Z0-9_]/)
+ ender=@file.scan(/#@@LETTER_DIGIT+/o)
ender.length >= 1 or
return lexerror(HerePlaceholderToken.new( dash, quote, ender, nil ), "invalid here header")
end
res= HerePlaceholderToken.new( dash, quote, ender, quote_real )
@@ -1737,19 +1835,22 @@
#actually delete procrastinated from input
@file.delete(input_position_raw-procrastinated.size...input_position_raw)
nl=readnl or return lexerror(res, "here header without body (at eof)")
+ res.string.startline=linenum
@moretokens<< res
bodystart=input_position
@offset_adjust = @min_offset_adjust+procrastinated.size
#was: @offset_adjust += procrastinated.size
body=here_body(res)
res.close=body.close
@offset_adjust = @min_offset_adjust
#was: @offset_adjust -= procrastinated.size
bodysize=input_position-bodystart
+ res.string.line=linenum-1
+ lexerror res,res.string.error
#one or two already read characters are overwritten here,
#in order to keep offsets correct in the long term
#(at present, offsets and line numbers between
#here header and its body will be wrong. but they should re-sync thereafter.)
@@ -1812,11 +1913,11 @@
end
#-----------------------------------
def lessthan(ch) #match quadriop('<') or here doc or spaceship op
case readahead(3)
- when /^<<['"`\-a-z0-9_]$/i #'
+ when /^<<(?:['"`\-]|#@@LETTER_DIGIT)$/o #'
if quote_expected?(ch) and not @last_operative_token==='class'
here_header
else
operator_or_methname_token read(2)
end
@@ -1899,11 +2000,15 @@
#here body terminator?
oldpos= input_position_raw
if tofill.dash
close+=til_charset(/[^#{WHSP}]/o)
end
- break if eof? #this is an error, should be handled better
+ if eof? #this is an error, should be handled better
+ lexerror tofill, "unterminated here body"
+ lexerror tofill.string, "unterminated here body"
+ break
+ end
if read(tofill.ender.size)==tofill.ender
crs=til_charset(/[^\r]/)||''
if nl=readnl
close+=tofill.ender+crs+nl
break
@@ -1915,10 +2020,12 @@
if tofill.quote=="'"
line=til_charset(/[\n]/)
unless nl=readnl
assert eof?
+ lexerror tofill, "unterminated here body"
+ lexerror tofill.string, "unterminated here body"
break #this is an error, should be handled better
end
line.chomp!("\r")
line<< "\n"
assert("\n"==prevchar)
@@ -2116,11 +2223,11 @@
#-----------------------------------
#used to resolve the ambiguity of
# unary ops (+, -, *, &, ~ !) in ruby
#returns whether current token is to be the start of a literal
- IDBEGINCHAR=/^[a-zA-Z_$@]/
+ IDBEGINCHAR=/^(?:#@@LETTER|[$@])/o
def unary_op_expected?(ch) #yukko hack
'*&='[readahead(2)[1..1]] and return false
return true if KeywordToken===@last_operative_token and @last_operative_token==='for'
@@ -2137,12 +2244,12 @@
# <<, %, ? in ruby
#returns whether current token is to be the start of a literal
def quote_expected?(ch) #yukko hack
case ch[0]
when ?? then readahead(2)[/^\?[#{WHSPLF}]$/o] #not needed?
- when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx][a-zA-Z0-9])/]
- when ?< then !readahead(4)[/^<<-?['"`a-z0-9_]/i]
+ when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx]#{@@LETTER_DIGIT.gsub('_','')})/o]
+ when ?< then !readahead(4)[/^<<-?(?:['"`]|#@@LETTER_DIGIT)/o]
else raise 'unexpected ch (#{ch}) in quote_expected?'
# when ?+,?-,?&,?*,?~,?! then '*&='[readahead(2)[1..1]]
end and return false
after_nonid_op? {
@@ -2320,29 +2427,38 @@
assert str=='='
c=(eat_next_if(/[~=>]/)or'')
str << c
result= operator_or_methname_token( str,offset)
case c
- when '=': #===,==
+ when '=' #===,==
str<< (eat_next_if(?=)or'')
- when '>': #=>
+ when '>' #=>
unless ParamListContextNoParen===@parsestack.last
@moretokens.unshift result
@moretokens.unshift( *abort_noparens!("=>"))
result=@moretokens.shift
end
@parsestack.last.see self,:arrow
- when '': #plain assignment: record local variable definitions
+ when '~' # =~... after regex, maybe?
+ last=last_operative_token
+
+ if @rubyversion>=1.9 and StringToken===last and last.lvars
+ #ruby delays adding lvars from regexps to known lvars table
+ #for several tokens in some cases. not sure why or if on purpose
+ #i'm just going to add them right away
+ localvars.concat last.lvars
+ end
+ when '' #plain assignment: record local variable definitions
last_context_not_implicit.lhs=false
@moretokens.push( *ignored_tokens(true).map{|x|
NewlineToken===x ? EscNlToken.new(@filename,@linenum,x.ident,x.offset) : x
} )
@parsestack.push AssignmentRhsContext.new(@linenum)
if eat_next_if ?*
tok=OperatorToken.new('*', input_position-1)
- tok.unary=true
+ tok.tag=:unary
@moretokens.push tok
WHSPLF[nextchar.chr] or
@moretokens << NoWsToken.new(input_position)
comma_in_lvalue_list? #is this needed?
end
@@ -2448,27 +2564,38 @@
#but control never comes this way in those cases... goes
#to custom parsers for alias, undef, and def in #parse_keywords
tokch.set_infix! unless after_nonid_op?{WHSPLF[lastchar]}
@parsestack.push ListImmedContext.new(ch,@linenum)
lasttok=last_operative_token
- #could be: lasttok===/^[a-z_]/i
- if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or MethNameToken===lasttok) and !WHSPCHARS[lastchar]
+ #could be: lasttok===/^#@@LETTER/o
+ if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or
+ MethNameToken===lasttok or lasttok===FUNCLIKE_KEYWORDS) and !WHSPCHARS[lastchar]
@moretokens << (tokch)
tokch= NoWsToken.new(input_position-1)
end
when '('
lasttok=last_token_maybe_implicit #last_operative_token
- #could be: lasttok===/^[a-z_]/i
+ #could be: lasttok===/^#@@LETTER/o
if (VarNameToken===lasttok or MethNameToken===lasttok or
lasttok===FUNCLIKE_KEYWORDS)
unless WHSPCHARS[lastchar]
@moretokens << tokch
tokch= NoWsToken.new(input_position-1)
end
@parsestack.push ParamListContext.new(@linenum)
else
- @parsestack.push ParenContext.new(@linenum)
+ ctx=@parsestack.last
+ lasttok=last_operative_token
+ maybe_def=DefContext===ctx && !ctx.in_body &&
+ !(KeywordToken===lasttok && lasttok.ident=="def")
+ if maybe_def or
+ BlockParamListLhsContext===ctx or
+ ParenContext===ctx && ctx.lhs
+ @parsestack.push KnownNestedLhsParenContext.new(@linenum)
+ else
+ @parsestack.push ParenContext.new(@linenum)
+ end
end
when '{'
#check if we are in a hash literal or string inclusion (#{}),
#in which case below would be bad.
@@ -2572,16 +2699,17 @@
(RescueSMContext===@parsestack[-2] && @parsestack[-2].state==:rescue) ||
(DefContext===@parsestack[-2] && !@parsestack[-2].in_body)
@parsestack.pop
@moretokens.unshift AssignmentRhsListEndToken.new(input_position)
end
- token.comma_type=
case @parsestack[-1]
- when AssignmentRhsContext; :rhs
- when ParamListContext,ParamListContextNoParen; :call
- when ListImmedContext; :array
+ when AssignmentRhsContext; token.tag=:rhs
+ when ParamListContext,ParamListContextNoParen; #:call
+ when ListImmedContext; #:array
+ when BlockParamListLhsContext; #:block
+ when KnownNestedLhsParenContext; #:nested
else
- :lhs if comma_in_lvalue_list?
+ token.tag=:lhs if comma_in_lvalue_list?
end
@parsestack.last.see self,:comma
return @moretokens.shift
end