rubylexer.rb in rubylexer-0.7.4

- old
+ new

@@ -1,6 +1,6 @@
-=begin legal crap
+=begin
     rubylexer - a ruby lexer written in ruby
     Copyright (C) 2004,2005,2008  Caleb Clausen
 
     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Lesser General Public
@@ -58,22 +58,24 @@
    VARLIKE_KEYWORDS=/^(#{VARLIKE_KEYWORDLIST.join '|'})$/
    INNERBOUNDINGWORDLIST=%w"else elsif ensure in then rescue when"
    INNERBOUNDINGWORDS="(#{INNERBOUNDINGWORDLIST.join '|'})"
    BINOPWORDLIST=%w"and or"
    BINOPWORDS="(#{BINOPWORDLIST.join '|'})"
-   NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)([^a-zA-Z0-9_!?=]|\Z)/o
-   NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu']  #chars that begin NEVERSTARTPARAMLIST
-   NEVERSTARTPARAMLISTMAXLEN=7     #max len of a NEVERSTARTPARAMLIST
    
    RUBYKEYWORDS=%r{
      ^(alias|#{BINOPWORDS}|defined\?|not|undef|end|
        #{VARLIKE_KEYWORDS}|#{FUNCLIKE_KEYWORDS}|
        #{INNERBOUNDINGWORDS}|#{BEGINWORDS}
      )$
    }xo
       #__END__ should not be in this set... its handled in start_of_line_directives
 
+   HIGHASCII=?\x80..?\xFF
+   NONASCII=HIGHASCII
+   #NONASCII=?\x80..?xFFFFFFFF  #or is it 10FFFF, whatever the highest conceivable code point
+
+
    CHARMAPPINGS = {
          ?$ => :dollar_identifier,
          ?@ => :at_identifier,
          ?a..?z => :identifier,
          ?A..?Z => :identifier,
@@ -113,18 +115,47 @@
 
          "[({" => :open_brace,
          "])}" => :close_brace,
 
 
-         ?# => :comment
+         ?# => :comment,
+
+         NONASCII => :identifier,
    }
 
    attr_reader :incomplete_here_tokens, :parsestack, :last_token_maybe_implicit
 
+   UCLETTER=@@UCLETTER="[A-Z]"  #regexp source fragment (a string, not a Regexp): one uppercase ascii letter
 
+   #cheater's way, treats utf chars as always 1 byte wide
+   #all high-bit chars are lowercase letters
+   #works, but strings compare with strict binary identity, not unicode collation
+   #works for euc too, I think
+   #(the ruby spec for utf8 support permits this interpretation)
+   LCLETTER=@@LCLETTER="[a-z_\x80-\xFF]"  #lowercase ascii letter, underscore, or any high-bit byte
+   LETTER=@@LETTER="[A-Za-z_\x80-\xFF]"  #any char that may begin an identifier
+   LETTER_DIGIT=@@LETTER_DIGIT="[A-Za-z_0-9\x80-\xFF]"  #any char that may continue an identifier
+   eval %w[UCLETTER LCLETTER LETTER LETTER_DIGIT].map{|n| "
+     def #{n}; #{n}; end
+     def self.#{n}; @@#{n}; end
+     " 
+   }.to_s  #defines an instance method (returns the constant) and a class method (returns the @@ variable) per name; NOTE(review): Array#to_s concatenates only on ruby 1.8, on 1.9+ it returns inspect output -- confirm, join would be portable
+
+   NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)((?:(?!#@@LETTER_DIGIT).)|\Z)/om
+   NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu']  #chars that begin NEVERSTARTPARAMLIST
+   NEVERSTARTPARAMLISTMAXLEN=7     #max len of a NEVERSTARTPARAMLIST
+
+=begin
+   require 'jcode'
+   utf8=String::PATTERN_UTF8 #or euc, or sjis...
+   LCLETTER_U="(?>[a-z_]|#{utf8})"
+   LETTER_U="(?>[A-Za-z_]|#{utf8})"
+   IDENTCHAR_U="(?>[A-Za-z_0-9]|#{utf8})"
+=end
+
    #-----------------------------------
-   def initialize(filename,file,linenum=1,offset_adjust=0)
+   def initialize(filename,file,linenum=1,offset_adjust=0,options={:rubyversion=>1.8})
       @offset_adjust=0 #set again in next line
       super(filename,file, linenum,offset_adjust)
       @start_linenum=linenum
       @parsestack=[TopLevelContext.new]
       @incomplete_here_tokens=[] #not used anymore
@@ -135,17 +166,65 @@
       @last_operative_token=nil
       @last_token_maybe_implicit=nil
       @enable_macro=nil
       @base_file=nil
       @progress_thread=nil
+      @rubyversion=options[:rubyversion]
+      @encoding=options[:encoding]||:detect
 
       @toptable=CharHandler.new(self, :illegal_char, CHARMAPPINGS)
 
+      read_leading_encoding
       start_of_line_directives
       progress_printer
    end
 
+   ENCODING_ALIASES={  #maps downcased encoding names from a coding comment onto the canonical names used in ENCODINGS
+    'utf-8'=>'utf8',
+
+    'ascii-8bit'=>'binary',
+    'ascii-7bit'=>'ascii',
+    'euc-jp'=>'euc',
+
+    'ascii8bit'=>'binary',
+    'ascii7bit'=>'ascii',
+    'eucjp'=>'euc',
+
+    'us-ascii'=>'ascii',
+    'shift-jis'=>'sjis',
+
+    'autodetect'=>'detect',  #'detect' is not listed in ENCODINGS, so read_leading_encoding will not adopt it
+   }
+   ENCODINGS=%w[ascii binary utf8 euc sjis]  #the only names read_leading_encoding accepts from a coding comment
+   def read_leading_encoding  #sets @encoding from a utf-8 bom, a -K option on the shebang line, or (ruby>=1.9) a coding comment
+     return unless @encoding==:detect  #an explicitly requested encoding is left untouched
+     @encoding=:ascii  #fallback when nothing below matches
+     @encoding=:utf8 if @file.skip( /\xEF\xBB\xBF/ )   #bom
+     if @file.skip( /\A#!/ )  #shebang line: walk its words looking for -K
+       loop do  #NOTE(review): if neither skip below matches, nothing visible here breaks out of the loop -- confirm til_charset/skip semantics for a shebang without arguments
+         til_charset( /[\s\v]/ )  #presumably consumes input up to the next whitespace char -- confirm
+         break if @file.skip( / ([^-\s\v]|--[\s\v])/,4 )  #stop once a space is followed by a word not starting with '-', or by a bare '--'
+         if @file.skip( /.-K(.)/ )
+           case $1  #the letter following -K; assumes @file.skip sets the regexp match globals -- confirm
+           when 'u'; @encoding=:utf8
+           when 'e'; @encoding=:euc
+           when 's'; @encoding=:sjis
+           end
+         end
+       end
+       til_charset( /[\n]/ )  #presumably consumes the remainder of the shebang line -- confirm
+     end
+     if @rubyversion>=1.9 and @file.skip(   #NOTE(review): @rubyversion is nil when an options hash without :rubyversion was given to initialize, and nil>=1.9 raises
+          /\A#[\x00-\x7F]*?(?:en)?coding[\s\v]*[:=][\s\v]*([a-z0-9_-]+)[\x00-\x7F]*\n/i 
+        )
+       name=$1  #the encoding name captured from the coding:/encoding= comment
+       name.downcase!
+       name=ENCODING_ALIASES[name] if ENCODING_ALIASES[name]
+       @encoding=name.to_sym if ENCODINGS.include? name  #unrecognized names are ignored, keeping the value chosen above
+     end
+   end
+
    def progress_printer
      return unless ENV['RL_PROGRESS']
      $stderr.puts 'printing progresses'
      @progress_thread=Thread.new do
        until EoiToken===@last_operative_token
@@ -161,10 +240,11 @@
 
    attr_accessor :in_def
    attr :localvars_stack	
    attr :offset_adjust
    attr_writer :pending_here_bodies
+   attr :rubyversion
 
    #-----------------------------------
    def set_last_token(tok)
      @last_operative_token=@last_token_maybe_implicit=tok
    end
@@ -359,11 +439,11 @@
    def special_global   #handle $-a and friends
       assert prevchar=='$'
       result = ((
       #order matters here, but it shouldn't
       #(but til_charset must be last)
-         eat_if(/-[a-z0-9_]/i,2) or
+         eat_if(/-#@@LETTER_DIGIT/o,2) or  #a dash plus one identifier char; @@LETTER_DIGIT also admits high-bit bytes, unlike the old ascii-only class
          eat_next_if(/[!@&+`'=~\-\/\\,.;<>*"$?:]/) or
          (?0..?9)===nextchar ? til_charset(/[^\d]/) : nil
       ))
    end
 
@@ -374,34 +454,38 @@
 
       #skip keyword processing if 'escaped' as it were, by def, . or ::
       #or if in a non-bare context
       #just asserts because those contexts are never encountered.
       #control goes through symbol(<...>,nil) 
-      assert( /^[a-z_]$/i===context)
+      assert( /^#@@LETTER$/o===context)
       assert MethNameToken===@last_operative_token || !(@last_operative_token===/^(\.|::|(un)?def|alias)$/)
 
-      @moretokens.unshift(*parse_keywords(str,oldpos) do |tok|
-        #if not a keyword,
-        case str
-          when FUNCLIKE_KEYWORDS; except=tok
-          when VARLIKE_KEYWORDS,RUBYKEYWORDS; raise "shouldnt see keywords here, now"
-        end
-        was_last=@last_operative_token
-        @last_operative_token=tok if tok
-        normally=safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) }
-        (Array===normally ? normally[0]=except : normally=except) if except
-        normally
-      end)
+      if @parsestack.last.wantarrow and @rubyversion>=1.9 and @file.skip ":"
+        @moretokens.push SymbolToken.new(str,oldpos), KeywordToken.new("=>",input_position-1)
+      else
+        @moretokens.unshift(*parse_keywords(str,oldpos) do |tok|
+          #if not a keyword, decide if it should be var or method
+          case str
+            when FUNCLIKE_KEYWORDS; except=tok
+            when VARLIKE_KEYWORDS,RUBYKEYWORDS; raise "shouldnt see keywords here, now"
+          end
+          was_last=@last_operative_token
+          @last_operative_token=tok if tok
+          normally=safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) }
+          (Array===normally ? normally[0]=except : normally=except) if except
+          normally
+        end)
+      end
       return @moretokens.shift
    end
 
    #-----------------------------------
    IDENTREX={}
    def identifier_as_string(context)
       #must begin w/ letter or underscore
       #char class needs changing here for utf8 support
-      /[_a-z]/i===nextchar.chr or return
+      /#@@LETTER/o===nextchar.chr or return
 
       #equals, question mark, and exclamation mark
       #might be allowed at the end in some contexts.
       #(in def headers and symbols)
       #otherwise, =,?, and ! are to be considered
@@ -416,11 +500,11 @@
 #         when ?:    then "!(?![=])|\\?|=(?![=~>])"
          else            "!(?![=])|\\?"
         end      
       @in_def_name||context==?: and trailers<<"|=(?![=~>])"
 
-      @file.scan(IDENTREX[trailers]||=/^(?>[_a-z][a-z0-9_]*(?:#{trailers})?)/i)
+      @file.scan(IDENTREX[trailers]||=/^(?>#@@LETTER#@@LETTER_DIGIT*(?:#{trailers})?)/)
    end
 
   #-----------------------------------
   #contexts in which comma may appear in ruby:
     #multiple lhs (terminated by assign op)
@@ -445,23 +529,23 @@
    #a comma has been seen. are we in an
    #lvalue list or some other construct that uses commas?
    def comma_in_lvalue_list?
      @parsestack.last.lhs=
        case l=@parsestack.last
-       when ListContext:
-       when DefContext: l.in_body
+       when ListContext;  #empty branch: the case yields nil, so lhs is set to nil
+       when DefContext; l.in_body  #for a def context, lhs takes the value of its in_body flag
        else true
        end
    end
    
    #-----------------------------------
    def in_lvar_define_state lasttok=@last_operative_token
      #@defining_lvar is a hack
      @defining_lvar or case ctx=@parsestack.last
        #when ForSMContext; ctx.state==:for
        when RescueSMContext
-         lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then[^a-zA-Z0-9_])/m )
+         lasttok.ident=="=>" and @file.match?( /\A[\s\v]*([:;#\n]|then(?!#@@LETTER_DIGIT))/om )  #last token was "=>" and, after optional whitespace, the input continues with ':', ';', '#', newline, or the word 'then'
        #when BlockParamListLhsContext; true
      end 
    end
 
    IMPLICIT_PARENS_BEFORE_ACCESSOR_ASSIGNMENT=2
@@ -485,17 +569,17 @@
      assert String===name
      
      was_in_lvar_define_state=in_lvar_define_state(lasttok)
      #maybe_local really means 'maybe local or constant'
      maybe_local=case name
-       when /[^a-z_0-9]$/i #do nothing
-       when /^[a-z_]/  
+       when /(?!#@@LETTER_DIGIT).$/o #do nothing
+       when /^#@@LCLETTER/o  
          (localvars===name or 
           VARLIKE_KEYWORDS===name or 
           was_in_lvar_define_state
          ) and not lasttok===/^(\.|::)$/
-       when /^[A-Z]/
+       when /^#@@UCLETTER/o
          is_const=true
          not lasttok==='.'  #this is the right algorithm for constants... 
      end 
           
      assert(@moretokens.empty?)
@@ -507,11 +591,11 @@
      oldpos= input_position
      sawnl=false
      result=ws_toks=ignored_tokens(true) {|nl| sawnl=true }
      if sawnl || eof? 
          if was_in_lvar_define_state
-           if /^[a-z_][a-zA-Z_0-9]*$/===name 
+           if /^#@@LCLETTER#@@LETTER_DIGIT*$/o===name 
              assert !(lasttok===/^(\.|::)$/)
              localvars[name]=true
            end
            return result.unshift(tok)
          elsif maybe_local
@@ -529,11 +613,11 @@
      #then omit implicit parens
      assignment_coming=case nc=nextchar
        when ?=;  not /^=[>=~]$/===readahead(2)
        when ?,; comma_in_lvalue_list? 
        when ?); last_context_not_implicit.lhs
-       when ?i; /^in[^a-zA-Z_0-9]/===readahead(3) and 
+       when ?i; /^in(?!#@@LETTER_DIGIT)/o===readahead(3) and 
                   ForSMContext===last_context_not_implicit
        when ?>,?<; /^(.)\1=$/===readahead(3)
        when ?*,?&; /^(.)\1?=/===readahead(3)
        when ?|; /^\|\|?=/===readahead(3) or
                 #is it a goalpost?
@@ -541,12 +625,12 @@
                 readahead(2)[1] != ?|
        when ?%,?/,?-,?+,?^; readahead(2)[1]== ?=
      end 
      if (assignment_coming && !(lasttok===/^(\.|::)$/) or was_in_lvar_define_state)
         tok=assign_lvar_type! VarNameToken.new(name,pos)
-        if /[^a-z_0-9]$/i===name 
-        elsif /^[a-z_]/===name and !(lasttok===/^(\.|::)$/)
+        if /(?!#@@LETTER_DIGIT).$/o===name 
+        elsif /^#@@LCLETTER/o===name and !(lasttok===/^(\.|::)$/)
           localvars[name]=true
         end
         return result.unshift(tok)
      end
      
@@ -557,11 +641,11 @@
      else
      case nc
        when nil: 2
        when ?!; /^![=~]$/===readahead(2) ? 2 : 1
        when ?d; 
-         if /^do([^a-zA-Z0-9_]|$)/===readahead(3)
+         if /^do((?!#@@LETTER_DIGIT)|$)/o===readahead(3)
            if maybe_local and expecting_do?
              ty=VarNameToken 
              0
            else 
              maybe_local=false
@@ -570,11 +654,11 @@
          else 
            1
          end
        when NEVERSTARTPARAMLISTFIRST
          (NEVERSTARTPARAMLISTWORDS===readahead(NEVERSTARTPARAMLISTMAXLEN)) ? 2 : 1
-       when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~; 1 #"
+       when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~,NONASCII; 1 #"
        when ?{
          maybe_local=false
          1
 =begin
          x=2
@@ -631,14 +715,16 @@
          if /^:(?:[#{WHSPLF}]|(:))$/o===next2 then 
            $1 && !ws_toks.empty?   ? 3 : 2 
          else 
            3 
          end
-       when ??; next3=readahead(3);
-                   /^\?([#{WHSPLF}]|[a-z_][a-z_0-9])/io===next3 ? 2 : 3
+       when ??; next3=readahead(3)
+                #? never begins a char constant if immediately followed 
+                #by 2 or more letters or digits
+                   /^\?([#{WHSPLF}]|#@@LETTER_DIGIT{2})/o===next3 ? 2 : 3
 #       when ?:,??; (readahead(2)[/^.[#{WHSPLF}]/o]) ? 2 : 3
-       when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?["'`a-zA-Z_0-9]/]) ? 3 : 2 
+       when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?(?:["'`]|#@@LETTER_DIGIT)/o]) ? 3 : 2 
        when ?[; 
            if ws_toks.empty? 
              (KeywordToken===oldlast and /^(return|break|next)$/===oldlast.ident) ? 3 : 2
            else
              3
@@ -705,11 +791,11 @@
          result << tok
          if @parsestack.size==basesize
            break false
          elsif ','==tok.to_s and @parsestack.size==basesize+1
            break true 
-         elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.unary and @parsestack.size==basesize+1
+         elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.tag and @parsestack.size==basesize+1
            break true 
          elsif EoiToken===tok
            lexerror tok, "unexpected eof in parameter list"
          end
        }
@@ -888,11 +974,11 @@
          fail if all.empty?
          @moretokens.concat divide_ws(ws,offset) if ws
          @moretokens.push KeywordToken.new('::',offset+md.end(0)-2) if dc
          loop do
            offset=input_position
-           @file.scan(/\A(#@@WSTOKS)?([A-Z][a-zA-Z_0-9]*)(::)?/o)
+           @file.scan(/\A(#@@WSTOKS)?(#@@UCLETTER#@@LETTER_DIGIT*)(::)?/o)
            #this regexp---^ will need to change in order to support utf8 properly.
            md=@file.last_match
            all,ws,name,dc=*md
            if ws
              @moretokens.concat divide_ws(ws,offset)
@@ -1011,15 +1097,15 @@
               name=tok.to_s
               assert !in_lvar_define_state
      
               #maybe_local really means 'maybe local or constant'
               maybe_local=case name
-                when /[^a-z_0-9]$/i; #do nothing
+                when /(?!#@@LETTER_DIGIT).$/o; #do nothing
                 when /^[@$]/; true
                 when VARLIKE_KEYWORDS,FUNCLIKE_KEYWORDS; ty=KeywordToken
-                when /^[a-z_]/;  localvars===name 
-                when /^[A-Z]/; is_const=true  #this is the right algorithm for constants... 
+                when /^#@@LCLETTER/o;  localvars===name 
+                when /^#@@UCLETTER/o; is_const=true  #this is the right algorithm for constants... 
               end
               result.push(  *ignored_tokens(false,false)  )
               nc=nextchar
               if !ty and maybe_local
                 if nc==?: || nc==?.
@@ -1057,11 +1143,11 @@
             @in_def_name=true
             while true
 
                #look for start of parameter list
                nc=(@moretokens.empty? ? nextchar.chr : @moretokens.first.to_s[0,1])
-               if state==:expect_op and /^[a-z_(&*]/i===nc
+               if state==:expect_op and /^(?:#@@LETTER|[(&*])/o===nc
                   ctx.state=:def_param_list
                   list,listend=def_param_list
                   result.concat list
                   end_index=result.index(listend)
                   ofs=listend.offset
@@ -1078,11 +1164,11 @@
                result<< tok
                case tok
                when EoiToken
                   lexerror tok,'unexpected eof in def header'
                when StillIgnoreToken
-               when MethNameToken ,VarNameToken # /^[a-z_]/i.token_pat
+               when MethNameToken ,VarNameToken # /^#@@LETTER/o.token_pat
                   lexerror tok,'expected . or ::' unless state==:expect_name
                   state=:expect_op
                when /^(\.|::)$/.token_pat
                   lexerror tok,'expected ident' unless state==:expect_op
                   if endofs
@@ -1414,11 +1500,11 @@
             #next token is a local var name
             #(or the one after that if unary ops present)
             #result.concat ignored_tokens
             if expect_name 
               case tok
-                when IgnoreToken #, /^[A-Z]/ #do nothing
+                when IgnoreToken #, /^#@@UCLETTER/o #do nothing
                 when /^,$/.token_pat #hack
                               
                 when VarNameToken
                   assert@defining_lvar
                   @defining_lvar=false
@@ -1496,16 +1582,24 @@
        (@last_operative_token===/^(return|next|break)$/ and KeywordToken===@last_operative_token)
      result=quadriop(ch)
      if want_unary
        #readahead(2)[1..1][/[\s\v#\\]/] or #not needed?
        assert OperatorToken===result
-       result.unary=true         #result should distinguish unary+binary *&
+       result.tag=:unary         #result should distinguish unary+binary *&
        WHSPLF[nextchar.chr] or
          @moretokens << NoWsToken.new(input_position)
-       comma_in_lvalue_list?
+       cill=comma_in_lvalue_list?
        if ch=='*'
          @parsestack.last.see self, :splat
+         case @parsestack[-1]
+         when AssignmentRhsContext; result.tag= :rhs
+         when ParamListContext,ParamListContextNoParen; #:call
+         when ListImmedContext; #:array
+         when BlockParamListLhsContext; #:block
+         when KnownNestedLhsParenContext; #:nested
+         else          result.tag=     :lhs if cill
+         end
        end
      end
      result
    end
 
@@ -1551,14 +1645,14 @@
    def is_var_name?
      (tok=@last_operative_token)
 
      s=tok.to_s
      case s
-     when /[^a-z_0-9]$/i; false
-#     when /^[a-z_]/; localvars===s or VARLIKE_KEYWORDS===s
-     when /^[A-Z_]/i; VarNameToken===tok
      when /^[@$<]/; true
+     when /(?!#@@LETTER_DIGIT).$/o; false  #text whose last char is not an identifier char is never a var name
+#     when /^#@@LCLETTER/o; localvars===s or VARLIKE_KEYWORDS===s
+     when /^#@@LETTER/o; VarNameToken===tok  #identifier-like text counts as a var name only when the token is a VarNameToken
      else raise "not var or method name: #{s}"
      end   
    end
    
    #-----------------------------------
@@ -1571,11 +1665,11 @@
 
        not is_var_name? and
          if ch==':'
            not TernaryContext===@parsestack.last
          else
-           !readahead(3)[/^\?[a-z0-9_]{2}/i]
+           !readahead(3)[/^\?#@@LETTER_DIGIT{2}/o]
          end
      }
    end
 
    #-----------------------------------
@@ -1601,25 +1695,29 @@
         #cancel implicit contexts...
         @moretokens.push(*abort_noparens!(':'))
         @moretokens.push tok=KeywordToken.new(':',startpos)
         
         case @parsestack.last
-        when TernaryContext: 
+        when TernaryContext 
           tok.ternary=true
           @parsestack.pop #should be in the context's see handler
-        when ExpectDoOrNlContext: #should be in the context's see handler
-          @parsestack.pop
-          assert @parsestack.last.starter[/^(while|until|for)$/]
+        when ExpectDoOrNlContext #should be in the context's see handler
+          if @rubyversion<1.9
+            @parsestack.pop
+            assert @parsestack.last.starter[/^(while|until|for)$/]
+            tok.as=";"
+          end
+        when ExpectThenOrNlContext,WhenParamListContext
+          if @rubyversion<1.9
+            #should be in the context's see handler
+            @parsestack.pop
+            tok.as="then"
+          end
+        when RescueSMContext
           tok.as=";"
-        when ExpectThenOrNlContext,WhenParamListContext: 
-          #should be in the context's see handler
-          @parsestack.pop
-          tok.as="then"
-        when RescueSMContext:
-          tok.as=";"
-        else fail ": not expected in #{@parsestack.last.class}->#{@parsestack.last.starter}"
-        end
+        end or 
+          fail ": not expected in #{@parsestack.last.class}->#{@parsestack.last.starter}"
         
         #end ternary context, if any
         @parsestack.last.see self,:colon
         
         return @moretokens.shift
@@ -1629,11 +1727,11 @@
 
       colon2=KeywordToken.new( '::',startpos)
       lasttok=@last_operative_token
       assert !(String===lasttok)
       if (VarNameToken===lasttok or MethNameToken===lasttok) and
-          lasttok===/^[$@a-zA-Z_]/ and !WHSPCHARS[lastchar]
+          lasttok===/^(?:[$@]|#@@LETTER)/o and !WHSPCHARS[lastchar]
       then
          @moretokens << colon2
          result= NoWsToken.new(startpos)
       else
          result=colon2
@@ -1662,16 +1760,16 @@
            open=":'"; close="'"
            single_quote("'")
          when ?` then read(1) #`
          when ?@ then at_identifier.to_s
          when ?$ then dollar_identifier.to_s
-         when ?_,?a..?z then identifier_as_string(?:)
+         when ?_,?a..?z,NONASCII then identifier_as_string(?:)
          when ?A..?Z then 
            result=identifier_as_string(?:)
            if @last_operative_token==='::' 
              assert klass==MethNameToken
-             /[A-Z_0-9]$/i===result and klass=VarNameToken
+             /#@@LETTER_DIGIT$/o===result and klass=VarNameToken
            end
            result
          else 
            error= "unexpected char starting symbol: #{nc.chr}"
            '_'
@@ -1694,11 +1792,11 @@
      #look for operators
      opmatches=readahead(3)[RUBYSYMOPERATORREX]
      return [opmatches ? read(opmatches.size) :
        case nc=nextchar
          when ?` then read(1) #`
-         when ?_,?a..?z,?A..?Z then 
+         when ?_,?a..?z,?A..?Z,NONASCII then 
            context=merge_assignment_op_in_setter_callsites? ? ?: : nc
            identifier_as_string(context)
          else 
            set_last_token KeywordToken.new(';')
            lexerror(tok_to_errify,"unexpected char starting callsite symbol: #{nc.chr}, tok=#{tok_to_errify.inspect}")
@@ -1718,11 +1816,11 @@
         (quote==getchar) or 
           return lexerror(HerePlaceholderToken.new( dash, quote, ender ), "mismatched quotes in here doc")
         quote_real=true
       else
         quote='"'
-        ender=til_charset(/[^a-zA-Z0-9_]/)
+        ender=@file.scan(/#@@LETTER_DIGIT+/o)
         ender.length >= 1  or 
           return lexerror(HerePlaceholderToken.new( dash, quote, ender, nil ), "invalid here header")
       end
 
       res= HerePlaceholderToken.new( dash, quote, ender, quote_real )
@@ -1737,19 +1835,22 @@
       #actually delete procrastinated from input
       @file.delete(input_position_raw-procrastinated.size...input_position_raw) 
       
       nl=readnl or return lexerror(res, "here header without body (at eof)")
 
+      res.string.startline=linenum
       @moretokens<< res
       bodystart=input_position
       @offset_adjust = @min_offset_adjust+procrastinated.size
       #was: @offset_adjust += procrastinated.size
       body=here_body(res)
       res.close=body.close
       @offset_adjust = @min_offset_adjust
       #was: @offset_adjust -= procrastinated.size
       bodysize=input_position-bodystart
+      res.string.line=linenum-1
+      lexerror res,res.string.error
 
       #one or two already read characters are overwritten here,
       #in order to keep offsets correct in the long term
       #(at present, offsets and line numbers between 
       #here header and its body will be wrong. but they should re-sync thereafter.)
@@ -1812,11 +1913,11 @@
    end
 
    #-----------------------------------
    def lessthan(ch) #match quadriop('<') or here doc or spaceship op
       case readahead(3)
-        when /^<<['"`\-a-z0-9_]$/i #'
+        when /^<<(?:['"`\-]|#@@LETTER_DIGIT)$/o #'
            if quote_expected?(ch) and not @last_operative_token==='class'
               here_header
            else
               operator_or_methname_token read(2)
            end
@@ -1899,11 +2000,15 @@
             #here body terminator?
             oldpos= input_position_raw
             if tofill.dash
               close+=til_charset(/[^#{WHSP}]/o)
             end
-            break if eof? #this is an error, should be handled better
+            if eof? #this is an error, should be handled better
+              lexerror tofill, "unterminated here body"
+              lexerror tofill.string, "unterminated here body"
+              break
+            end
             if read(tofill.ender.size)==tofill.ender
               crs=til_charset(/[^\r]/)||''
               if nl=readnl
                 close+=tofill.ender+crs+nl
                 break
@@ -1915,10 +2020,12 @@
 
             if tofill.quote=="'" 
               line=til_charset(/[\n]/)
               unless nl=readnl
                 assert eof?
+                lexerror tofill, "unterminated here body"
+                lexerror tofill.string, "unterminated here body"
                 break  #this is an error, should be handled better
               end
               line.chomp!("\r")
               line<< "\n"
               assert("\n"==prevchar)
@@ -2116,11 +2223,11 @@
 
   #-----------------------------------
   #used to resolve the ambiguity of
   # unary ops (+, -, *, &, ~ !) in ruby
   #returns whether current token is to be the start of a literal
-  IDBEGINCHAR=/^[a-zA-Z_$@]/
+  IDBEGINCHAR=/^(?:#@@LETTER|[$@])/o
   def unary_op_expected?(ch) #yukko hack
     '*&='[readahead(2)[1..1]] and return false
 
     return true if KeywordToken===@last_operative_token and @last_operative_token==='for'
  
@@ -2137,12 +2244,12 @@
    # <<, %, ? in ruby
    #returns whether current token is to be the start of a literal
    def quote_expected?(ch) #yukko hack
      case ch[0]
           when ?? then readahead(2)[/^\?[#{WHSPLF}]$/o] #not needed?
-          when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx][a-zA-Z0-9])/]
-          when ?< then !readahead(4)[/^<<-?['"`a-z0-9_]/i]
+          when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx]#{@@LETTER_DIGIT.gsub('_','')})/o]
+          when ?< then !readahead(4)[/^<<-?(?:['"`]|#@@LETTER_DIGIT)/o]
           else raise 'unexpected ch (#{ch}) in quote_expected?'
      #     when ?+,?-,?&,?*,?~,?! then '*&='[readahead(2)[1..1]]
      end and return false
 
      after_nonid_op? {
@@ -2320,29 +2427,38 @@
       assert str=='='
       c=(eat_next_if(/[~=>]/)or'')
       str << c
       result= operator_or_methname_token( str,offset)
       case c
-      when '=': #===,==
+      when '=' #===,==
         str<< (eat_next_if(?=)or'')
       
-      when '>': #=>
+      when '>' #=>
         unless ParamListContextNoParen===@parsestack.last
           @moretokens.unshift result
           @moretokens.unshift( *abort_noparens!("=>"))
           result=@moretokens.shift
         end
         @parsestack.last.see self,:arrow
-      when '': #plain assignment: record local variable definitions
+      when '~' # =~... after regex, maybe?
+        last=last_operative_token
+        
+        if @rubyversion>=1.9 and StringToken===last and last.lvars
+          #ruby delays adding lvars from regexps to known lvars table
+          #for several tokens in some cases. not sure why or if on purpose
+          #i'm just going to add them right away
+          localvars.concat last.lvars
+        end
+      when '' #plain assignment: record local variable definitions
         last_context_not_implicit.lhs=false
         @moretokens.push( *ignored_tokens(true).map{|x| 
           NewlineToken===x ? EscNlToken.new(@filename,@linenum,x.ident,x.offset) : x 
         } )
         @parsestack.push AssignmentRhsContext.new(@linenum)
         if eat_next_if ?* 
           tok=OperatorToken.new('*', input_position-1)
-          tok.unary=true
+          tok.tag=:unary
           @moretokens.push tok
           WHSPLF[nextchar.chr] or
             @moretokens << NoWsToken.new(input_position)
           comma_in_lvalue_list? #is this needed?
         end
@@ -2448,27 +2564,38 @@
         #but control never comes this way in those cases... goes 
         #to custom parsers for alias, undef, and def in #parse_keywords
         tokch.set_infix! unless after_nonid_op?{WHSPLF[lastchar]}
         @parsestack.push ListImmedContext.new(ch,@linenum)
         lasttok=last_operative_token
-        #could be: lasttok===/^[a-z_]/i
-        if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or MethNameToken===lasttok) and !WHSPCHARS[lastchar]
+        #could be: lasttok===/^#@@LETTER/o
+        if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or 
+            MethNameToken===lasttok or lasttok===FUNCLIKE_KEYWORDS) and !WHSPCHARS[lastchar]
                @moretokens << (tokch)
                tokch= NoWsToken.new(input_position-1)
         end
       when '('
         lasttok=last_token_maybe_implicit #last_operative_token
-        #could be: lasttok===/^[a-z_]/i
+        #could be: lasttok===/^#@@LETTER/o
         if (VarNameToken===lasttok or MethNameToken===lasttok or
             lasttok===FUNCLIKE_KEYWORDS)
           unless WHSPCHARS[lastchar]
                @moretokens << tokch
                tokch= NoWsToken.new(input_position-1)
           end
           @parsestack.push ParamListContext.new(@linenum)
         else
-          @parsestack.push ParenContext.new(@linenum)
+          ctx=@parsestack.last
+          lasttok=last_operative_token
+          maybe_def=DefContext===ctx && !ctx.in_body &&
+            !(KeywordToken===lasttok && lasttok.ident=="def")
+          if maybe_def or 
+             BlockParamListLhsContext===ctx or 
+             ParenContext===ctx && ctx.lhs
+            @parsestack.push KnownNestedLhsParenContext.new(@linenum)
+          else
+            @parsestack.push ParenContext.new(@linenum)
+          end
         end
 
       when '{'
       #check if we are in a hash literal or string inclusion (#{}),
       #in which case below would be bad.
@@ -2572,16 +2699,17 @@
        (RescueSMContext===@parsestack[-2] && @parsestack[-2].state==:rescue) ||
        (DefContext===@parsestack[-2] && !@parsestack[-2].in_body)
          @parsestack.pop
          @moretokens.unshift AssignmentRhsListEndToken.new(input_position)
     end
-    token.comma_type=
     case @parsestack[-1]
-    when AssignmentRhsContext; :rhs
-    when ParamListContext,ParamListContextNoParen; :call
-    when ListImmedContext; :array
+    when AssignmentRhsContext; token.tag=:rhs
+    when ParamListContext,ParamListContextNoParen; #:call
+    when ListImmedContext; #:array
+    when BlockParamListLhsContext; #:block
+    when KnownNestedLhsParenContext; #:nested
     else
-      :lhs if comma_in_lvalue_list? 
+      token.tag=:lhs if comma_in_lvalue_list? 
     end
     @parsestack.last.see self,:comma
     return @moretokens.shift
   end