lib/rubylexer.rb in rubylexer-0.7.0 vs lib/rubylexer.rb in rubylexer-0.7.1

- old
+ new

@@ -1,8 +1,8 @@ -=begin copyright +=begin legal crap rubylexer - a ruby lexer written in ruby - Copyright (C) 2004,2005 Caleb Clausen + Copyright (C) 2004,2005,2008 Caleb Clausen This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. @@ -16,11 +16,10 @@ License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA =end - require 'rubylexer/rulexer' #must be 1st!!! require 'rubylexer/version' require 'rubylexer/token' require 'rubylexer/charhandler' require 'rubylexer/symboltable' @@ -30,37 +29,41 @@ #----------------------------------- class RubyLexer include NestedContexts + + RUBYSYMOPERATORREX= - %r{^([&|^/%~]|=(==?|~)|>[=>]?|<(<|=>?)?|[+\-]@?|\*\*?|\[\]=?)} + %r{^([&|^/%]|=(==?)|=~|>[=>]?|<(<|=>?)?|[+~\-]@?|\*\*?|\[\]=?)} # (nasty beastie, eh?) #these are the overridable operators #does not match flow-control operators like: || && ! or and if not #or op= ops like: += -= ||= #or .. ... ?: #for that use: RUBYNONSYMOPERATORREX= - %r{^([%^~/\-+|&]=|(\|\||&&)=?|(<<|>>|\*\*?)=|\.{1,3}|[?:,;]|=>?|![=~]?)$} + %r{^([%^/\-+|&]=|(\|\||&&)=?|(<<|>>|\*\*?)=|\.{1,3}|[?:,;]|::|=>?|![=~]?)$} RUBYOPERATORREX=/#{RUBYSYMOPERATORREX}|#{RUBYNONSYMOPERATORREX}/o UNSYMOPS=/^[~!]$/ #always unary UBSYMOPS=/^([*&+-]|::)$/ #ops that could be unary or binary WHSPCHARS=WHSPLF+"\\#" - OPORBEGINWORDS="(if|unless|while|until)" - BEGINWORDS=/^(def|class|module|begin|for|case|do|#{OPORBEGINWORDS})$/o - FUNCLIKE_KEYWORDS=/^(break|next|redo|return|raise|yield|defined\?|retry|super|BEGIN|END)$/ + OPORBEGINWORDLIST=%w(if unless while until) + BEGINWORDLIST=%w(def class module begin for case do)+OPORBEGINWORDLIST + OPORBEGINWORDS="(#{OPORBEGINWORDLIST.join '|'})" + BEGINWORDS=/^(#{BEGINWORDLIST.join '|'})$/o + FUNCLIKE_KEYWORDS=/^(break|next|redo|return|yield|retry|super|BEGIN|END)$/ VARLIKE_KEYWORDS=/^(__FILE__|__LINE__|false|nil|self|true)$/ INNERBOUNDINGWORDS="(else|elsif|ensure|in|then|rescue|when)" BINOPWORDS="(and|or)" - NEVERSTARTPARAMLISTWORDS=/^(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)([^a-zA-Z0-9_!?=]|\Z)/o + NEVERSTARTPARAMLISTWORDS=/\A(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)([^a-zA-Z0-9_!?=]|\Z)/o NEVERSTARTPARAMLISTFIRST=CharSet['aoeitrwu'] #chars that begin NEVERSTARTPARAMLIST NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST RUBYKEYWORDS=%r{ - ^(alias|#{BINOPWORDS}|not|undef|end| + ^(alias|#{BINOPWORDS}|defined\?|not|undef|end| #{VARLIKE_KEYWORDS}|#{FUNCLIKE_KEYWORDS}| #{INNERBOUNDINGWORDS}|#{BEGINWORDS} )$ }xo #__END__ should not be in this set... its handled in start_of_line_directives @@ -70,12 +73,13 @@ ?@ => :at_identifier, ?a..?z => :identifier, ?A..?Z => :identifier, ?_ => :identifier, ?0..?9 => :number, - %{"'} => :double_quote, - ?` => :back_quote, + ?" => :double_quote, #" + ?' => :single_quote, #' + ?` => :back_quote, #` WHSP => :whitespace, #includes \r ?, => :comma, ?; => :semicolon, @@ -97,63 +101,128 @@ ?: => :symbol_or_op, ?\n => :newline, #implicitly escaped after op #?\r => :newline, #implicitly escaped after op ?\\ => :escnewline, - ?\0 => :eof, + ?\x00 => :eof, + ?\x04 => :eof, + ?\x1a => :eof, "[({" => :open_brace, "])}" => :close_brace, ?# => :comment } - attr_reader :incomplete_here_tokens, :parsestack + attr_reader :incomplete_here_tokens, :parsestack, :last_token_maybe_implicit #----------------------------------- - def initialize(filename,file,linenum=1) - super(filename,file, linenum) + def initialize(filename,file,linenum=1,offset_adjust=0) + @offset_adjust=0 #set again in next line + super(filename,file, linenum,offset_adjust) @start_linenum=linenum @parsestack=[TopLevelContext.new] - @incomplete_here_tokens=[] + @incomplete_here_tokens=[] #not used anymore + @pending_here_bodies=[] @localvars_stack=[SymbolTable.new] @defining_lvar=nil @in_def_name=false + @last_operative_token=nil + @last_token_maybe_implicit=nil @toptable=CharHandler.new(self, :illegal_char, CHARMAPPINGS) start_of_line_directives + progress_printer end + + def progress_printer + return unless ENV['RL_PROGRESS'] + $stderr.puts 'printing progresses' + @progress_thread=Thread.new do + until EoiToken===@last_operative_token + sleep 10 + $stderr.puts @file.pos + end + end + end def localvars; @localvars_stack.last end + attr :localvars_stack + attr :offset_adjust + attr_writer :pending_here_bodies + #----------------------------------- + def set_last_token(tok) + @last_operative_token=@last_token_maybe_implicit=tok + end + + #----------------------------------- def get1token result=super #most of the action's here + if ENV['PROGRESS'] + @last_cp_pos||=0 + @start_time||=Time.now + if result.offset-@last_cp_pos>100000 + $stderr.puts "#{result.offset} #{Time.now-@start_time}" + @last_cp_pos=result.offset + end + end + #now cleanup and housekeeping #check for bizarre token types case result + when ImplicitParamListStartToken, ImplicitParamListEndToken + @last_token_maybe_implicit=result + result when StillIgnoreToken#,nil result + when StringToken + set_last_token result + assert !(IgnoreToken===@last_operative_token) + result.elems.map!{|frag| + if String===frag + result.translate_escapes(frag) + else + frag + end + } if AUTO_UNESCAPE_STRINGS + result + when Token#,String - @last_operative_token=result + set_last_token result assert !(IgnoreToken===@last_operative_token) result else raise "#{@filename}:#{linenum}:token is a #{result.class}, last is #{@last_operative_token}" end end + #----------------------------------- + def eof? + super or EoiToken===@last_operative_token + end #----------------------------------- + def input_position + super+@offset_adjust + end + + #----------------------------------- + def input_position_raw + @file.pos + end + + #----------------------------------- def balanced_braces? #@parsestack.empty? @parsestack.size==1 and TopLevelContext===@parsestack.first end @@ -161,31 +230,41 @@ #----------------------------------- def dollar_identifier(ch=nil) s=eat_next_if(?$) or return nil if t=((identifier_as_string(?$) or special_global)) - s<<t + s << t else error= "missing $id name" end return lexerror(VarNameToken.new(s),error) end #----------------------------------- def at_identifier(ch=nil) result = (eat_next_if(?@) or return nil) - result << (eat_next_if(?@)or'') + result << (eat_next_if(?@) or '') if t=identifier_as_string(?@) - result<<t + result << t else error= "missing @id name" end - return lexerror(VarNameToken.new(result),error) + result=VarNameToken.new(result) + result.in_def=true if inside_method_def? + return lexerror(result,error) end private #----------------------------------- - def here_spread_over_ruby_code(rl,tok) + def inside_method_def? + @parsestack.reverse_each{|ctx| + ctx.starter=='def' and ctx.state!=:saw_def and return true + } + return false + end + + #----------------------------------- + def here_spread_over_ruby_code(rl,tok) #not used anymore assert(!rl.incomplete_here_tokens.empty?) @incomplete_here_tokens += rl.incomplete_here_tokens end #----------------------------------- @@ -205,14 +284,14 @@ @moretokens.unshift tok return result end #----------------------------------- - WSCHARSET=/[#\\\n\s\t\v\r\f]/ + WSCHARSET=/[#\\\n\s\t\v\r\f\x00\x04\x1a]/ def ignored_tokens(allow_eof=false,allow_eol=true) result=[] - result<<@moretokens.shift while StillIgnoreToken===@moretokens.first + result << @moretokens.shift while StillIgnoreToken===@moretokens.first @moretokens.empty? or return result loop do unless @moretokens.empty? case @moretokens.first when StillIgnoreToken @@ -271,12 +350,12 @@ def special_global #handle $-a and friends assert prevchar=='$' result = (( #order matters here, but it shouldn't #(but til_charset must be last) - eat_next_if(/[!@&+`'=~\/\\,.;<>*"$?:]/) or - (eat_next_if('-') and ("-"+getchar)) or + eat_if(/-[a-z0-9_]/i,2) or + eat_next_if(/[!@&+`'=~\-\/\\,.;<>*"$?:]/) or (?0..?9)===nextchar ? til_charset(/[^\d]/) : nil )) end #----------------------------------- @@ -287,75 +366,49 @@ #skip keyword processing if 'escaped' as it were, by def, . or :: #or if in a non-bare context #just asserts because those contexts are never encountered. #control goes through symbol(<...>,nil) assert( /^[a-z_]$/i===context) - assert !(@last_operative_token===/^(\.|::|(un)?def|alias)$/) + assert MethNameToken===@last_operative_token || !(@last_operative_token===/^(\.|::|(un)?def|alias)$/) - @moretokens.unshift(*parse_keywords(str,oldpos) do + @moretokens.unshift(*parse_keywords(str,oldpos) do |tok| #if not a keyword, case str when FUNCLIKE_KEYWORDS; #do nothing when VARLIKE_KEYWORDS,RUBYKEYWORDS; raise "shouldnt see keywords here, now" end - safe_recurse { |a| var_or_meth_name(str,@last_operative_token,oldpos) } + was_last=@last_operative_token + @last_operative_token=tok if tok + safe_recurse { |a| var_or_meth_name(str,was_last,oldpos,after_nonid_op?{true}) } end) return @moretokens.shift end #----------------------------------- + IDENTREX={} def identifier_as_string(context) #must begin w/ letter or underscore - str=eat_next_if(/[_a-z]/i) or return nil + /[_a-z]/i===nextchar.chr or return #equals, question mark, and exclamation mark #might be allowed at the end in some contexts. #(in def headers and symbols) #otherwise, =,?, and ! are to be considered #separate tokens. confusing, eh? #i hope i've captured all right conditions.... #context should always be ?: right after def, ., and :: now - maybe_eq,maybe_qm,maybe_ex = case context - when ?@,?$ then [nil,nil,nil] - when ?: then [?=, ??, ?!] - else [nil,??, ?!] - end - - @in_def_name and maybe_eq= ?= + #= and ! only match if not part of a larger operator + trailers = + case context + when ?@,?$ then "" +# when ?: then "!(?![=])|\\?|=(?![=~>])" + else "!(?![=])|\\?" + end + @in_def_name||context==?: and trailers<<"|=(?![=~>])" - str<<til_charset(/[^a-z0-9_]/i) - - #look for ?, !, or =, if allowed - case b=getc - when nil #means we're at eof - #handling nil here prevents b from ever matching - #a nil value of maybe_qm, maybe_ex or maybe_eq - when maybe_qm - str << b - when maybe_ex - nc=(nextchar unless eof?) - #does ex appear to be part of a larger operator? - if nc==?= #or nc==?~ - back1char - else - str << b - end - when maybe_eq - nc=(nextchar unless eof?) - #does eq appear to be part of a larger operator? - if nc==?= or nc==?~ or nc==?> - back1char - else - str << b - end - else - back1char - end - - - return str + @file.scan(IDENTREX[trailers]||=/^[_a-z][a-z0-9_]*(?:#{trailers})?/i) end #----------------------------------- #contexts in which comma may appear in ruby: #multiple lhs (terminated by assign op) @@ -378,69 +431,82 @@ #----------------------------------- #a comma has been seen. are we in an #lvalue list or some other construct that uses commas? def comma_in_lvalue_list? - @parsestack.last.lhs= (not ListContext===@parsestack.last) + @parsestack.last.lhs= + case l=@parsestack.last + when ListContext: + when DefContext: l.in_body + else true + end end #----------------------------------- def in_lvar_define_state #@defining_lvar is a hack @defining_lvar or case ctx=@parsestack.last - when ForSMContext; ctx.state==:for - when RescueSMContext; ctx.state==:arrow + #when ForSMContext; ctx.state==:for + when RescueSMContext + @last_operative_token.ident=="=>" and @file.match? /\A[\s\v]*([:;#\n]|then[^a-zA-Z0-9_])/m #when BlockParamListLhsContext; true end end + + IMPLICIT_PARENS_BEFORE_ACCESSOR_ASSIGNMENT=2 #----------------------------------- #determine if an alphabetic identifier refers to a variable #or method name. generates implicit parenthes(es) if it is a #call site and no explicit parens are present. starts an implicit param list #if appropriate. adds tok to the #local var table if its a local var being defined for the first time. - #note: what we here call variables (rather, constants) following :: - #might actually be methods at runtime, but that's immaterial to tokenization. - - #note: this routine should determine the correct token type for name and - #create the appropriate token. currently this is not done because callers - #typically have done it (perhaps incorrectly) already. - def var_or_meth_name(name,lasttok,pos) + #in general, operators in ruby are disambuated by the before-but-not-after rule. + #an otherwise ambiguous operator is disambiguated by the surrounding whitespace: + #whitespace before but not after the 'operator' indicates it is to be considered a + #value token instead. otherwise it is a binary operator. (unary (prefix) ops count + #as 'values' here.) + def var_or_meth_name(name,lasttok,pos,was_after_nonid_op) #look for call site if not a keyword or keyword is function-like #look for and ignore local variable names assert String===name + was_in_lvar_define_state=in_lvar_define_state #maybe_local really means 'maybe local or constant' maybe_local=case name - when /[^a-z_0-9]$/i; #do nothing - when /^[a-z_]/; (localvars===name or VARLIKE_KEYWORDS===name or in_lvar_define_state) and not lasttok===/^(\.|::)$/ - when /^[A-Z]/; is_const=true;not lasttok==='.' #this is the right algorithm for constants... + when /[^a-z_0-9]$/i #do nothing + when /^[a-z_]/ + (localvars===name or + VARLIKE_KEYWORDS===name or + was_in_lvar_define_state + ) and not lasttok===/^(\.|::)$/ + when /^[A-Z]/ + is_const=true + not lasttok==='.' #this is the right algorithm for constants... end assert(@moretokens.empty?) oldlast=@last_operative_token - tok=@last_operative_token=VarNameToken.new(name,pos) + tok=set_last_token assign_lvar_type!(VarNameToken.new(name,pos)) oldpos= input_position sawnl=false result=ws_toks=ignored_tokens(true) {|nl| sawnl=true } if sawnl || eof? - if maybe_local then - if in_lvar_define_state - if /^[a-z_][a-zA-Z_0-9]*$/===name - assert !(lasttok===/^(\.|::)$/) - localvars[name]=true - else - lexerror tok,"not a valid variable name: #{name}" - end - return result.unshift(tok) + if was_in_lvar_define_state + if /^[a-z_][a-zA-Z_0-9]*$/===name + assert !(lasttok===/^(\.|::)$/) + localvars[name]=true + else + lexerror tok,"not a valid variable name: #{name}" end + return result.unshift(tok) + elsif maybe_local return result.unshift(tok) #if is_const else return result.unshift( MethNameToken.new(name,pos), #insert implicit parens right after tok ImplicitParamListStartToken.new( oldpos), @@ -453,20 +519,22 @@ #then omit implicit parens assignment_coming=case nc=nextchar when ?=; not /^=[>=~]$/===readahead(2) when ?,; comma_in_lvalue_list? when ?); last_context_not_implicit.lhs + when ?i; /^in[^a-zA-Z_0-9]/===readahead(3) and + ForSMContext===last_context_not_implicit when ?>,?<; /^(.)\1=$/===readahead(3) when ?*,?&; /^(.)\1?=/===readahead(3) when ?|; /^\|\|?=/===readahead(3) or #is it a goalpost? BlockParamListLhsContext===last_context_not_implicit && readahead(2)[1] != ?| when ?%,?/,?-,?+,?^; readahead(2)[1]== ?= end - if (assignment_coming && !(lasttok===/^(\.|::)$/) or in_lvar_define_state) - tok=VarNameToken.new(name,pos) + if (assignment_coming && !(lasttok===/^(\.|::)$/) or was_in_lvar_define_state) + tok=assign_lvar_type! VarNameToken.new(name,pos) if /[^a-z_0-9]$/i===name lexerror tok,"not a valid variable name: #{name}" elsif /^[a-z_]/===name and !(lasttok===/^(\.|::)$/) localvars[name]=true end @@ -474,59 +542,130 @@ end implicit_parens_to_emit= if assignment_coming @parsestack.push AssignmentContext.new(nil) if nc==?% or nc==?/ - 0 + IMPLICIT_PARENS_BEFORE_ACCESSOR_ASSIGNMENT else case nc when nil: 2 - when ?!; readahead(2)=='!=' ? 2 : 1 + when ?!; /^![=~]$/===readahead(2) ? 2 : 1 + when ?d; + if /^do([^a-zA-Z0-9_]|$)/===readahead(3) + if maybe_local and expecting_do? + ty=VarNameToken + 0 + else + maybe_local=false + 2 + end + else + 1 + end when NEVERSTARTPARAMLISTFIRST (NEVERSTARTPARAMLISTWORDS===readahead(NEVERSTARTPARAMLISTMAXLEN)) ? 2 : 1 - when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~; 1 + when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~; 1 #" when ?{ maybe_local=false + 1 +=begin x=2 x-=1 if /\A(return|break|next)\Z/===name and !(KeywordToken===oldlast and oldlast===/\A(\.|::)\Z/) x +=end when ?(; - maybe_local=false; !(ws_toks.empty? or lasttok===/^(\.|::)$/)? 1 : 0 + maybe_local=false + lastid=lasttok&&lasttok.ident + case lastid + when /\A[;(]|do\Z/: was_after_nonid_op=false + when '|': was_after_nonid_op=false unless BlockParamListLhsContext===@parsestack.last + when '{': was_after_nonid_op=false if BlockContext===@parsestack.last or BeginEndContext===@parsestack.last + end if KeywordToken===lasttok + was_after_nonid_op=false if NewlineToken===lasttok or lasttok.nil? + want_parens=!(ws_toks.empty? or was_after_nonid_op) #or +# /^(::|rescue|yield|else|case|when|if|unless|until|while|and|or|&&|\|\||[?:]|\.\.?\.?|=>)$/===lastid or +# MethNameToken===lasttok or +# RUBYNONSYMOPERATORREX===lastid && /=$/===lastid && '!='!=lastid +# ) + + #look ahead for closing paren (after some whitespace...) + want_parens=false if @file.match? /\A.(?:\s|\v|\#.*\n)*\)/ +# afterparen=@file.pos +# getchar +# ignored_tokens(true) +# want_parens=false if nextchar==?) +# @file.pos=afterparen + + want_parens ? 1 : 0 when ?},?],?),?;,?^, ?|, ?>, ?,, ?., ?=; 2 - when ?+, ?-, ?*, ?&, ?%, ?/; (ws_toks.empty? || readahead(2)[/^.[#{WHSPLF}]/]) ? 2 : 3 - when ?:,??; next2=readahead(2); - WHSPLF[next2[1].chr] || next2=='::' ? 2 : 3 -# when ?:,??; (readahead(2)[/^.[#{WHSPLF}]/]) ? 2 : 3 - when ?<; (ws_toks.empty? || readahead(3)[/^<<["'`a-zA-Z_0-9-]/]) ? 3 : 2 - when ?[; ws_toks.empty? ? 2 : 3 + when ?+, ?-, ?%, ?/ + if /^(return|break|next)$/===@last_operative_token.ident and not( + KeywordToken===lasttok and /^(.|::)$/===lasttok.ident + ) + 1 + else + (ws_toks.empty? || readahead(2)[/^.[#{WHSPLF}]/o]) ? 2 : 3 + end + when ?*, ?& + lasttok=@last_operative_token + if /^(return|break|next)$/===@last_operative_token.ident and not( + KeywordToken===lasttok and /^(.|::)$/===lasttok.ident + ) + 1 + else + (ws_toks.empty? || readahead(2)[/^.[#{WHSPLF}*&]/o]) ? 2 : 3 + end + when ?: + next2=readahead(2) + if /^:(?:[#{WHSPLF}]|(:))$/o===next2 then + $1 && !ws_toks.empty? ? 3 : 2 + else + 3 + end + when ??; next3=readahead(3); + /^\?([#{WHSPLF}]|[a-z_][a-z_0-9])/io===next3 ? 2 : 3 +# when ?:,??; (readahead(2)[/^.[#{WHSPLF}]/o]) ? 2 : 3 + when ?<; (!ws_toks.empty? && readahead(4)[/^<<-?["'`a-zA-Z_0-9]/]) ? 3 : 2 + when ?[; ws_toks.empty?&&!(KeywordToken===oldlast and /^(return|break|next)$/===oldlast.ident) ? 2 : 3 when ?\\, ?\s, ?\t, ?\n, ?\r, ?\v, ?#; raise 'failure' else raise "unknown char after ident: #{nc=nextchar ? nc.chr : "<<EOF>>"}" end end - if is_const and implicit_parens_to_emit==3 then + if is_const and implicit_parens_to_emit==3 then #needed? implicit_parens_to_emit=1 end - tok=if maybe_local and implicit_parens_to_emit>=2 + if maybe_local and implicit_parens_to_emit>=2 implicit_parens_to_emit=0 - VarNameToken + ty=VarNameToken else - MethNameToken - end.new(name,pos) + ty||=MethNameToken + end + tok=assign_lvar_type!(ty.new(name,pos)) + case implicit_parens_to_emit when 2; result.unshift ImplicitParamListStartToken.new(oldpos), ImplicitParamListEndToken.new(oldpos) when 1,3; arr,pass=*param_list_coming_with_2_or_more_params? result.push( *arr ) unless pass + #only 1 param in list result.unshift ImplicitParamListStartToken.new(oldpos) - @parsestack.push ParamListContextNoParen.new(@linenum) + last=result.last + last.set_callsite! false if last.respond_to? :callsite? and last.callsite? #KeywordToken===last and last.ident==')' + if /^(break|next|return)$/===name and + !(KeywordToken===lasttok and /^(.|::)$/===lasttok.ident) + ty=KWParamListContextNoParen + else + ty=ParamListContextNoParen + end + @parsestack.push ty.new(@linenum) end when 0; #do nothing else raise 'invalid value of implicit_parens_to_emit' end return result.unshift(tok) @@ -545,28 +684,32 @@ WHSPCHARS[prevchar] && (?(==nextchar) or return [[],false] basesize=@parsestack.size result=[get1token] pass=loop{ tok=get1token - result<<tok + result << tok if @parsestack.size==basesize break false elsif ','==tok.to_s and @parsestack.size==basesize+1 break true + elsif OperatorToken===tok and /^[&*]$/===tok.ident and tok.unary and @parsestack.size==basesize+1 + break true elsif EoiToken===tok lexerror tok, "unexpected eof in parameter list" end } return [result,pass] end #----------------------------------- - CONTEXT2ENDTOK={AssignmentRhsContext=>AssignmentRhsListEndToken, - ParamListContextNoParen=>ImplicitParamListEndToken, - WhenParamListContext=>KwParamListEndToken, - RescueSMContext=>KwParamListEndToken - } + CONTEXT2ENDTOK={ + AssignmentRhsContext=>AssignmentRhsListEndToken, + ParamListContextNoParen=>ImplicitParamListEndToken, + KWParamListContextNoParen=>ImplicitParamListEndToken, + WhenParamListContext=>KwParamListEndToken, + RescueSMContext=>KwParamListEndToken + } def abort_noparens!(str='') #assert @moretokens.empty? result=[] while klass=CONTEXT2ENDTOK[@parsestack.last.class] result << klass.new(input_position-str.length) @@ -574,32 +717,89 @@ @parsestack.pop end return result end -if false #no longer used + #----------------------------------- + CONTEXT2ENDTOK_FOR_RESCUE={ + AssignmentRhsContext=>AssignmentRhsListEndToken, + ParamListContextNoParen=>ImplicitParamListEndToken, + KWParamListContextNoParen=>ImplicitParamListEndToken, + WhenParamListContext=>KwParamListEndToken, + RescueSMContext=>KwParamListEndToken + } + def abort_noparens_for_rescue!(str='') + #assert @moretokens.empty? + result=[] + ctx=@parsestack.last + while klass=CONTEXT2ENDTOK_FOR_RESCUE[ctx.class] + break if AssignmentRhsContext===ctx && !ctx.multi_assign? + if ParamListContextNoParen===ctx && AssignmentRhsContext===@parsestack[-2] + result.push ImplicitParamListEndToken.new(input_position-str.length), + AssignmentRhsListEndToken.new(input_position-str.length) + @parsestack.pop + @parsestack.pop + break + end + result << klass.new(input_position-str.length) #unless AssignmentRhsContext===ctx and !ctx.multi_assign? + break if RescueSMContext===ctx #why is this here? + @parsestack.pop + ctx=@parsestack.last + end + return result + end + + #----------------------------------- + CONTEXT2ENDTOK_FOR_DO={ + AssignmentRhsContext=>AssignmentRhsListEndToken, + ParamListContextNoParen=>ImplicitParamListEndToken, + ExpectDoOrNlContext=>1, + #WhenParamListContext=>KwParamListEndToken, + #RescueSMContext=>KwParamListEndToken + } + def abort_noparens_for_do!(str='') + #assert @moretokens.empty? + result=[] + while klass=CONTEXT2ENDTOK_FOR_DO[@parsestack.last.class] + break if klass==1 + result << klass.new(input_position-str.length) + @parsestack.pop + end + return result + end + + #----------------------------------- + def expecting_do? + @parsestack.reverse_each{|ctx| + next if AssignmentRhsContext===ctx + return !!CONTEXT2ENDTOK_FOR_DO[ctx.class] + } + return false + end + #----------------------------------- def abort_1_noparen!(offs=0) assert @moretokens.empty? result=[] while AssignmentRhsContext===@parsestack.last @parsestack.pop result << AssignmentRhsListEndToken.new(input_position-offs) end - ParamListContextNoParen===@parsestack.last or lexerror huh,'{} with no matching callsite' + if ParamListContextNoParen===@parsestack.last #or lexerror huh,'{} with no matching callsite' @parsestack.pop result << ImplicitParamListEndToken.new(input_position-offs) + end return result end -end #----------------------------------- #parse keywords now, to prevent confusion over bare symbols #and match end with corresponding preceding def or class or whatever. #if arg is not a keyword, the block is called def parse_keywords(str,offset) assert @moretokens.empty? + assert !(KeywordToken===@last_operative_token and /A(.|::|def)\Z/===@last_operative_token.ident) result=[KeywordToken.new(str,offset)] case str when "end" result.unshift(*abort_noparens!(str)) @@ -617,15 +817,19 @@ start,line=ctx.starter,ctx.linenum BEGINWORDS===start or lexerror result.last, "end does not match #{start or "nil"}" /^(do)$/===start and localvars.end_block /^(class|module|def)$/===start and @localvars_stack.pop - when "class","module" + when "module" result.first.has_end! @parsestack.push WantsEndContext.new(str,@linenum) @localvars_stack.push SymbolTable.new + when "class" + result.first.has_end! + @parsestack.push ClassContext.new(str,@linenum) + when "if","unless" #could be infix form without end if after_nonid_op?{false} #prefix form result.first.has_end! @parsestack.push WantsEndContext.new(str,@linenum) @@ -651,26 +855,27 @@ # corresponding EndToken emitted leaving ForContext ("in" branch, below) @parsestack.push WantsEndContext.new(str,@linenum) #expect_do_or_end_or_nl! str #handled by ForSMContext now @parsestack.push ForSMContext.new(@linenum) when "do" - result.unshift(*abort_noparens!(str)) + result.unshift(*abort_noparens_for_do!(str)) if ExpectDoOrNlContext===@parsestack.last @parsestack.pop assert WantsEndContext===@parsestack.last + result.last.as=";" else result.last.has_end! @parsestack.push WantsEndContext.new(str,@linenum) localvars.start_block block_param_list_lookahead end when "def" result.first.has_end! - @parsestack.push WantsEndContext.new("def",@linenum) - @localvars_stack.push SymbolTable.new + @parsestack.push ctx=DefContext.new(@linenum) + ctx.state=:saw_def safe_recurse { |aa| - @last_operative_token=KeywordToken.new "def" #hack + set_last_token KeywordToken.new "def" #hack result.concat ignored_tokens #read an expr like a.b.c or a::b::c #or (expr).b.c if nextchar==?( #look for optional parenthesised head @@ -681,14 +886,15 @@ case tok when/^\($/.token_pat then parencount+=1 when/^\)$/.token_pat then parencount-=1 end EoiToken===tok and lexerror tok, "eof in def header" - result<<tok + result << tok end until parencount==0 #@parsestack.size==old_size - else #no parentheses, all tail - @last_operative_token=KeywordToken.new "." #hack hack + @localvars_stack.push SymbolTable.new + else #no parentheses, all tail + set_last_token KeywordToken.new "." #hack hack tokindex=result.size result << tok=symbol(false,false) name=tok.to_s assert !in_lvar_define_state @@ -698,93 +904,126 @@ when /^[@$]/; true when VARLIKE_KEYWORDS,FUNCLIKE_KEYWORDS; ty=KeywordToken when /^[a-z_]/; localvars===name when /^[A-Z]/; is_const=true #this is the right algorithm for constants... end - if !ty and maybe_local - result.push( *ignored_tokens(false,false) ) - nc=nextchar + result.push( *ignored_tokens(false,false) ) + nc=nextchar + if !ty and maybe_local if nc==?: || nc==?. ty=VarNameToken end end - unless ty - ty=MethNameToken - endofs=tok.offset+tok.to_s.length - result[tokindex+1...tokindex+1]= - [ImplicitParamListStartToken.new(endofs),ImplicitParamListEndToken.new(endofs)] + if ty.nil? or (ty==KeywordToken and nc!=?: and nc!=?.) + ty=MethNameToken + if nc != ?( + endofs=tok.offset+tok.to_s.length + newtok=ImplicitParamListStartToken.new(endofs) + result.insert tokindex+1, newtok + end end assert result[tokindex].equal?(tok) - result[tokindex]=ty.new(tok.to_s,tok.offset) + var=assign_lvar_type! ty.new(tok.to_s,tok.offset) + @localvars_stack.push SymbolTable.new + var.in_def=true if inside_method_def? and var.respond_to? :in_def= + result[tokindex]=var - #if a.b.c.d is seen, a, b, and c + #if a.b.c.d is seen, a, b and c #should be considered maybe varname instead of methnames. #the last (d in the example) is always considered a methname; #it's what's being defined. #b and c should be considered varnames only if #they are capitalized and preceded by :: . #a could even be a keyword (eg self or block_given?). end #read tail: .b.c.d etc - result.reverse_each{|res| break @last_operative_token=res unless StillIgnoreToken===res} - ###@last_operative_token=result.last #naive + result.reverse_each{|res| break set_last_token res unless StillIgnoreToken===res} assert !(IgnoreToken===@last_operative_token) state=:expect_op @in_def_name=true loop do #look for start of parameter list nc=(@moretokens.empty? ? nextchar.chr : @moretokens.first.to_s[0,1]) if state==:expect_op and /^[a-z_(&*]/i===nc - result.concat def_param_list + ctx.state=:def_param_list + list,listend=def_param_list + result.concat list + end_index=result.index(listend) + ofs=listend.offset + if endofs + result.insert end_index,ImplicitParamListEndToken.new(ofs) + else + ofs+=listend.to_s.size + end + result.insert end_index+1,EndDefHeaderToken.new(ofs) break end tok=get1token - result<<tok + result<< tok case tok when EoiToken lexerror tok,'unexpected eof in def header' when StillIgnoreToken when MethNameToken ,VarNameToken # /^[a-z_]/i.token_pat lexerror tok,'expected . or ::' unless state==:expect_name state=:expect_op when /^(\.|::)$/.token_pat lexerror tok,'expected ident' unless state==:expect_op + if endofs + result.insert -2, ImplicitParamListEndToken.new(endofs) + endofs=nil + end state=:expect_name when /^(;|end)$/.token_pat, NewlineToken #are we done with def name? + ctx.state=:def_body state==:expect_op or lexerror tok,'expected identifier' + if endofs + result.insert -2,ImplicitParamListEndToken.new(tok.offset) + end + result.insert -2, EndDefHeaderToken.new(tok.offset) break else lexerror(tok, "bizarre token in def name: " + "#{tok}:#{tok.class}") end end @in_def_name=false } when "alias" safe_recurse { |a| - @last_operative_token=KeywordToken.new "alias" #hack + set_last_token KeywordToken.new "alias" #hack result.concat ignored_tokens res=symbol(eat_next_if(?:),false) - res ? result<<res : lexerror(result.first,"bad symbol in alias") - @last_operative_token=KeywordToken.new "alias" #hack - result.concat ignored_tokens - res=symbol(eat_next_if(?:),false) - res ? result<<res : lexerror(result.first,"bad symbol in alias") + unless res + lexerror(result.first,"bad symbol in alias") + else + res.ident[0]==?$ and res=VarNameToken.new(res.ident,res.offset) + result<< res + set_last_token KeywordToken.new "alias" #hack + result.concat ignored_tokens + res=symbol(eat_next_if(?:),false) + unless res + lexerror(result.first,"bad symbol in alias") + else + res.ident[0]==?$ and res=VarNameToken.new(res.ident,res.offset) + result<< res + end + end } when "undef" safe_recurse { |a| loop do - @last_operative_token=KeywordToken.new "," #hack + set_last_token KeywordToken.new "," #hack result.concat ignored_tokens tok=symbol(eat_next_if(?:),false) tok or lexerror(result.first,"bad symbol in undef") result<< tok - @last_operative_token=tok + set_last_token tok assert !(IgnoreToken===@last_operative_token) sawnl=false result.concat ignored_tokens(true){|nl| sawnl=true} @@ -807,17 +1046,17 @@ when "rescue" unless after_nonid_op? {false} #rescue needs to be treated differently when in operator context... #i think no RescueSMContext should be pushed on the stack... - #plus, the rescue token should be marked as infix - result.first.set_infix! + result.first.set_infix! #plus, the rescue token should be marked as infix + result.unshift(*abort_noparens_for_rescue!(str)) else result.push KwParamListStartToken.new(offset+str.length) #corresponding EndToken emitted by abort_noparens! on leaving rescue context - result.unshift(*abort_noparens!(str)) @parsestack.push RescueSMContext.new(@linenum) + result.unshift(*abort_noparens!(str)) end when "then" result.unshift(*abort_noparens!(str)) @parsestack.last.see self,:then @@ -829,20 +1068,47 @@ when /\A(#{BINOPWORDS}|#{INNERBOUNDINGWORDS})\Z/o result.unshift(*abort_noparens!(str)) when /\A(return|break|next)\Z/ - result=yield - result.first.has_no_block! unless KeywordToken===@last_operative_token and @last_operative_token===/\A(\.|::)\Z/ + fail if KeywordToken===@last_operative_token and @last_operative_token===/\A(\.|::)\Z/ + tok=KeywordToken.new(str,offset) + result=yield tok + result[0]=tok + tok.has_no_block! + + + when 'END' + #END could be treated, lexically, just as if it is an + #ordinary method, except that local vars created in + #END blocks are visible to subsequent code. (Why??) + #That difference forces a custom parsing. + if @last_operative_token===/^(\.|::)$/ + result=yield nil #should pass a keyword token here + else + safe_recurse{ + old=result.first + result=[ + MethNameToken.new(old.ident,old.offset), + ImplicitParamListStartToken.new(input_position), + ImplicitParamListEndToken.new(input_position), + *ignored_tokens + ] + getchar=='{' or lexerror(result.first,"expected { after #{str}") + result.push KeywordToken.new('{',input_position-1) + result.last.set_infix! + @parsestack.push BeginEndContext.new(str,offset) + } + end when FUNCLIKE_KEYWORDS - result=yield + result=yield nil #should be a keyword token when RUBYKEYWORDS #do nothing - else result=yield + else result=yield nil end return result end @@ -879,15 +1145,15 @@ #----------------------------------- def block_param_list_lookahead safe_recurse{ |la| - @last_operative_token=KeywordToken.new ';' + set_last_token KeywordToken.new ';' a=ignored_tokens if eat_next_if(?|) - a<<KeywordToken.new("|", input_position-1) + a<< KeywordToken.new("|", input_position-1) if true @parsestack.push mycontext=BlockParamListLhsContext.new(@linenum) nextchar==?| and a.push NoWsToken.new(input_position) else if eat_next_if(?|) @@ -907,29 +1173,29 @@ when AssignmentRhsListStartToken; @defining_lvar=false when AssignmentRhsListEndToken; parsestack_lastnonassign_is?(mycontext) and @defining_lvar=true end tok==='|' and parsestack_lastnonassign_is?(mycontext) and break - a<<tok + a<< tok end assert@defining_lvar || AssignmentRhsContext===@parsestack.last @defining_lvar=false while AssignmentRhsContext===@parsestack.last a.push( *abort_noparens!('|') ) end @parsestack.last.object_id==mycontext.object_id or raise 'expected my BlockParamListLhsContext atop @parsestack' @parsestack.pop - a<<KeywordToken.new('|',tok.offset) + a<< KeywordToken.new('|',tok.offset) @moretokens.empty? or fixme %#moretokens might be set from get1token call above...might be bad# end end end - @last_operative_token=KeywordToken.new ';' + set_last_token KeywordToken.new ';' #a.concat ignored_tokens #assert @last_operative_token===';' #a<<get1token @@ -946,10 +1212,11 @@ #the matching endbrace is found def def_param_list @in_def_name=false result=[] normal_comma_level=old_parsestack_size=@parsestack.size + listend=nil safe_recurse { |a| assert(@moretokens.empty?) assert((not IgnoreToken===@moretokens[0])) assert((@moretokens[0] or not nextchar.chr[WHSPCHARS])) @@ -970,100 +1237,123 @@ end class << endingblock alias === call end - @last_operative_token=KeywordToken.new ',' #hack + set_last_token KeywordToken.new ',' #hack #read local parameter names + nextvar=nil loop do expect_name=(@last_operative_token===',' and normal_comma_level==@parsestack.size) expect_name and @defining_lvar||=true result << tok=get1token - lexerror tok, "unexpected eof in def header" if EoiToken===tok + break lexerror(tok, "unexpected eof in def header") if EoiToken===tok #break if at end of param list - endingblock===tok and - old_parsestack_size>=@parsestack.size and break + if endingblock===tok and old_parsestack_size>=@parsestack.size + nextvar and localvars[nextvar]=true #add nextvar to local vars + listend=tok + break + end #next token is a local var name #(or the one after that if unary ops present) #result.concat ignored_tokens if expect_name case tok when IgnoreToken #, /^[A-Z]/ #do nothing when /^,$/.token_pat #hack - - + when VarNameToken assert@defining_lvar @defining_lvar=false assert((not @last_operative_token===',')) +# assert !nextvar + nextvar=tok.ident + localvars[nextvar]=false #remove nextvar from list of local vars for now when /^[&*]$/.token_pat #unary form... #a NoWsToken is also expected... read it now result.concat maybe_no_ws_token #not needed? - @last_operative_token=KeywordToken.new ',' + set_last_token KeywordToken.new ',' else lexerror tok,"unfamiliar var name '#{tok}'" end - elsif /^,$/.token_pat===tok and - normal_comma_level+1==@parsestack.size and - AssignmentRhsContext===@parsestack.last - #seeing comma here should end implicit rhs started within the param list - result[-1,0]=AssignmentRhsListEndToken.new(tok.offset) - @parsestack.pop + elsif /^,$/.token_pat===tok + if normal_comma_level+1==@parsestack.size and + AssignmentRhsContext===@parsestack.last + #seeing comma here should end implicit rhs started within the param list + result << AssignmentRhsListEndToken.new(tok.offset) + @parsestack.pop + end + if nextvar and normal_comma_level==@parsestack.size + localvars[nextvar]=true #now, finally add nextvar back to local vars + nextvar + end end end @defining_lvar=false - + @parsestack.last.see self,:semi assert(@parsestack.size <= old_parsestack_size) - assert(endingblock[tok]) + assert(endingblock[tok] || ErrorToken===tok) #hack: force next token to look like start of a #new stmt, if the last ignored_tokens #call above did not find a newline #(just in case the next token parsed #happens to call quote_expected? or after_nonid_op) result.concat ignored_tokens - if nextchar.chr[/[iuw\/<|>+\-*&%?:]/] and - !(NewlineToken===@last_operative_token) and - !(/^(end|;)$/===@last_operative_token) - @last_operative_token=KeywordToken.new ';' +# if !eof? and nextchar.chr[/[iuw\/<|>+\-*&%?:({]/] and +# !(NewlineToken===@last_operative_token) and +# !(/^(end|;)$/===@last_operative_token) + #result<<EndDefHeaderToken.new(result.last.offset+result.last.to_s.size) + set_last_token KeywordToken.new ';' result<< get1token - end +# end } - return result + return result,listend end #----------------------------------- #handle % in ruby code. is it part of fancy quote or a modulo operator? def percent(ch) - if quote_expected? ch + if AssignmentContext===@parsestack.last + @parsestack.pop + op=true + end + + if !op and quote_expected?(ch) || + (@last_operative_token===/^(return|next|break)$/ and KeywordToken===@last_operative_token) fancy_quote ch - else + else biop ch - end + end end #----------------------------------- #handle * & in ruby code. is unary or binary operator? def star_or_amp(ch) - assert('*&'[ch]) - want_unary=unary_op_expected? ch - result=(quadriop ch) - if want_unary - #readahead(2)[1..1][/[\s\v#\\]/] or #not needed? - assert OperatorToken===result - result.unary=true #result should distinguish unary+binary *& - WHSPLF[nextchar.chr] or - @moretokens << NoWsToken.new(input_position) - end - result + assert('*&'[ch]) + want_unary=unary_op_expected?(ch) || + (@last_operative_token===/^(return|next|break)$/ and KeywordToken===@last_operative_token) + result=quadriop(ch) + if want_unary + #readahead(2)[1..1][/[\s\v#\\]/] or #not needed? + assert OperatorToken===result + result.unary=true #result should distinguish unary+binary *& + WHSPLF[nextchar.chr] or + @moretokens << NoWsToken.new(input_position) + comma_in_lvalue_list? + if ch=='*' + @parsestack.last.see self, :splat + end + end + result end #----------------------------------- #handle ? in ruby code. is it part of ?..: or a character literal? def char_literal_or_op(ch) @@ -1077,19 +1367,27 @@ end #----------------------------------- def regex_or_div(ch) #space after slash always means / operator, rather than regex start - if after_nonid_op?{ !is_var_name? and WHSPLF[prevchar] and !readahead(2)[%r{^/\s}] } - return regex(ch) - else #/ is operator - result=getchar - if eat_next_if(?=) - result << '=' - end - return(operator_or_methname_token result) - end + #= after slash always means /= operator, rather than regex start + if AssignmentContext===@parsestack.last + @parsestack.pop + op=true + end + + if !op and after_nonid_op?{ + !is_var_name? and WHSPLF[prevchar] and !readahead(2)[%r{^/[\s\v=]}] + } || (KeywordToken===@last_token_maybe_implicit and @last_token_maybe_implicit.ident=="(") + return regex(ch) + else #/ is operator + result=getchar + if eat_next_if(?=) + result << '=' + end + return(operator_or_methname_token result) + end end #----------------------------------- #return true if last tok corresponds to a variable or constant, #false if its for a method, nil for something else @@ -1099,12 +1397,12 @@ (tok=@last_operative_token) s=tok.to_s case s when /[^a-z_0-9]$/i; false - when /^[a-z_]/; localvars===s or VARLIKE_KEYWORDS===s - when /^[A-Z]/; VarNameToken===tok +# when /^[a-z_]/; localvars===s or VARLIKE_KEYWORDS===s + when /^[A-Z_]/i; VarNameToken===tok when /^[@$<]/; true else raise "not var or method name: #{s}" end end @@ -1137,22 +1435,26 @@ #look for another colon; return single : if not found unless eat_next_if(?:) #cancel implicit contexts... @moretokens.push(*abort_noparens!(':')) + @moretokens.push KeywordToken.new(':',startpos) - #end ternary context, if any - @parsestack.last.see self,:colon - - TernaryContext===@parsestack.last and @parsestack.pop #should be in the context's see handler - - if ExpectDoOrNlContext===@parsestack.last #should be in the context's see handler + case @parsestack.last + when TernaryContext: @parsestack.pop #should be in the context's see handler + when ExpectDoOrNlContext: #should be in the context's see handler @parsestack.pop assert @parsestack.last.starter[/^(while|until|for)$/] + @moretokens.last.as=";" + when RescueSMContext: + @moretokens.last.as=";" + else @moretokens.last.as="then" end - @moretokens.push KeywordToken.new(':',startpos) + #end ternary context, if any + @parsestack.last.see self,:colon + return @moretokens.shift end #we definately found a :: @@ -1180,13 +1482,19 @@ #look for operators opmatches=readahead(3)[RUBYSYMOPERATORREX] result= opmatches ? read(opmatches.size) : case nc=nextchar - when ?" then assert notbare;double_quote('"') - when ?' then assert notbare;double_quote("'") - when ?` then read(1) + when ?" #" + assert notbare + open=':"'; close='"' + double_quote('"') + when ?' #' + assert notbare + open=":'"; close="'" + single_quote("'") + when ?` then read(1) #` when ?@ then at_identifier.to_s when ?$ then dollar_identifier.to_s when ?_,?a..?z then identifier_as_string(?:) when ?A..?Z then result=identifier_as_string(?:) @@ -1195,11 +1503,16 @@ /[A-Z_0-9]$/i===result and klass=VarNameToken end result else error= "unexpected char starting symbol: #{nc.chr}" end - return lexerror(klass.new(result,start),error) + result= lexerror(klass.new(result,start,notbare ? ':' : ''),error) + if open + result.open=open + result.close=close + end + return result end def merge_assignment_op_in_setter_callsites? false end @@ -1209,16 +1522,16 @@ #look for operators opmatches=readahead(3)[RUBYSYMOPERATORREX] return [opmatches ? read(opmatches.size) : case nc=nextchar - when ?` then read(1) + when ?` then read(1) #` when ?_,?a..?z,?A..?Z then context=merge_assignment_op_in_setter_callsites? ? ?: : nc identifier_as_string(context) else - @last_operative_token=KeywordToken.new(';') + set_last_token KeywordToken.new(';') lexerror(tok_to_errify,"unexpected char starting callsite symbol: #{nc.chr}, tok=#{tok_to_errify.inspect}") nil end, start ] end @@ -1231,24 +1544,67 @@ quote=eat_next_if( /['"`]/) if quote ender=til_charset(/[#{quote}]/) (quote==getchar) or return lexerror(HerePlaceholderToken.new( dash, quote, ender ), "mismatched quotes in here doc") + quote_real=true else quote='"' ender=til_charset(/[^a-zA-Z0-9_]/) ender.length >= 1 or - return lexerror(HerePlaceholderToken.new( dash, quote, ender ), "invalid here header") + return lexerror(HerePlaceholderToken.new( dash, quote, ender, nil ), "invalid here header") end - res= HerePlaceholderToken.new( dash, quote, ender ) + res= HerePlaceholderToken.new( dash, quote, ender, quote_real ) +if true + res.open=["<<",dash,quote,ender,quote].to_s + procrastinated=til_charset(/[\n]/)#+readnl + unless @base_file + @base_file=@file + @file=Sequence::List.new([@file]) + @file.pos=@base_file.pos + end + #actually delete procrastinated from input + @file.delete(input_position_raw-procrastinated.size...input_position_raw) + + nl=readnl or return lexerror(res, "here header without body (at eof)") + + @moretokens<< res + bodystart=input_position + @offset_adjust = @min_offset_adjust+procrastinated.size + #was: @offset_adjust += procrastinated.size + body=here_body(res) + res.close=body.close + @offset_adjust = @min_offset_adjust + #was: @offset_adjust -= procrastinated.size + bodysize=input_position-bodystart + + #one or two already read characters are overwritten here, + #in order to keep offsets correct in the long term + #(at present, offsets and line numbers between + #here header and its body will be wrong. but they should re-sync thereafter.) + newpos=input_position_raw-nl.size + #unless procrastinated.empty? + @file.modify(newpos,nl.size,procrastinated+nl) #vomit procrastinated text back onto input + #end + input_position_set newpos + + #line numbers would be wrong within the procrastinated section + @linenum-=1 + + #be nice to get the here body token at the right place in input, too... + @pending_here_bodies<< body + @offset_adjust-=bodysize#+nl.size + + return @moretokens.shift +else @incomplete_here_tokens.push res #hack: normally this should just be in get1token #this fixup is necessary because the call the get1token below #makes a recursion. - @last_operative_token=res + set_last_token res safe_recurse { |a| assert(a.object_id==@moretokens.object_id) toks=[] begin @@ -1267,11 +1623,11 @@ break end tok=get1token assert(a.equal?( @moretokens)) - toks<<tok + toks<< tok EoiToken===tok and lexerror tok, "here body expected before eof" end while res.unsafe_to_use assert(a.equal?( @moretokens)) a[0,0]= toks #same as a=toks+a, but keeps a's id } @@ -1279,17 +1635,18 @@ return res #the action continues in newline, where #the rest of the here token is read after a #newline has been seen and res.affix is eventually called +end end #----------------------------------- def lessthan(ch) #match quadriop('<') or here doc or spaceship op case readahead(3) - when /^<<['"`\-a-z0-9_]$/i - if quote_expected?(ch) #and @last_operative_token!='class' #not needed? + when /^<<['"`\-a-z0-9_]$/i #' + if quote_expected?(ch) and not @last_operative_token==='class' here_header else operator_or_methname_token read(2) end when "<=>" then operator_or_methname_token read(3) @@ -1307,105 +1664,235 @@ result+=nl else error='illegal escape sequence' end - @moretokens.unshift FileAndLineToken.new(@filename,ln=@linenum,input_position) - optional_here_bodies + #optimization: when thru with regurgitated text from a here document, + #revert back to original unadorned Sequence instead of staying in the List. + if @base_file and indices=@file.instance_eval{@start_pos} and + (indices[-2]..indices[-1])===@file.pos + @base_file.pos=@file.pos + @file=@base_file + @base_file=nil + result="\n" + end + + @offset_adjust=@min_offset_adjust + @moretokens.push *optional_here_bodies + ln=@linenum + @moretokens.push lexerror(EscNlToken.new(@filename,ln-1,result,input_position-result.size), error), + FileAndLineToken.new(@filename,ln,input_position) - lexerror EscNlToken.new(@filename,ln-1,result,pos), error + start_of_line_directives + + return @moretokens.shift end #----------------------------------- def optional_here_bodies - + result=[] +if true #handle here bodies queued up by previous line - #(we should be more compatible with dos/mac style newlines...) + pos=input_position + while body=@pending_here_bodies.shift + #body.offset=pos + result.push EscNlToken.new(@filename,nil,"\n",body.offset-1) + result.push FileAndLineToken.new(@filename,body.ident.line,body.offset) + result.push body + #result.push NoWsToken.new @pending_here_bodies.empty? ? input_position : @pending_here_bodies.first + #result.push FileAndLineToken.new(@filename,@linenum,pos) #position and line num are off + body.headtok.line=@linenum-1 + end +else + #...(we should be more compatible with dos/mac style newlines...) while tofill=@incomplete_here_tokens.shift + result.push( + here_body(tofill), + FileAndLineToken.new(@filename,@linenum,input_position) + ) + assert(eof? || "\r\n"[prevchar]) + tofill.line=@linenum-1 + end +end + return result + end + + #----------------------------------- + def here_body(tofill) + close="\n" tofill.string.offset= input_position + linecount=1 #for terminator + assert("\n"==prevchar) loop { - assert("\r\n"[prevchar]) + assert("\n"==prevchar) #here body terminator? - oldpos= input_position + oldpos= input_position_raw if tofill.dash - til_charset(/[^#{WHSP}]/o) + close+=til_charset(/[^#{WHSP}]/o) end - break if eof? - break if read(tofill.ender.size)==tofill.ender and readnl + break if eof? #this is an error, should be handled better + if read(tofill.ender.size)==tofill.ender + crs=til_charset(/[^\r]/)||'' + if nl=readnl + close+=tofill.ender+crs+nl + break + end + end input_position_set oldpos + assert("\n"==prevchar) + if tofill.quote=="'" - line=til_charset(/[\r\n]/)+readnl - line.gsub! "\\\\", "\\" + line=til_charset(/[\n]/) + unless nl=readnl + assert eof? + break #this is an error, should be handled better + end + line.chomp!("\r") + line<< "\n" + assert("\n"==prevchar) + #line.gsub! "\\\\", "\\" tofill.append line - assert(line[-1..-1][/[\r\n]/]) + tofill.string.bs_handler=:squote_heredoc_esc_seq + linecount+=1 + assert("\n"==line[-1,1]) + assert("\n"==prevchar) else + assert("\n"==prevchar) + back1char #-1 to make newline char the next to read @linenum-=1 + assert /[\r\n]/===nextchar.chr + #retr evrything til next nl +if FASTER_STRING_ESCAPES + line=all_quote("\r\n", tofill.quote, "\r\n") +else line=all_quote(INET_NL_REX, tofill.quote, INET_NL_REX) +end + linecount+=1 #(you didn't know all_quote could take a regex, did you?) + assert("\n"==prevchar) + #get rid of fals that otherwise appear to be in the middle of #a string (and are emitted out of order) fal=@moretokens.pop assert FileAndLineToken===fal || fal.nil? + assert line.bs_handler + tofill.string.bs_handler||=line.bs_handler + + tofill.append_token line + tofill.string.elems<<'' unless String===tofill.string.elems.last + + assert("\n"==prevchar) + back1char @linenum-=1 assert("\r\n"[nextchar.chr]) - tofill.append_token line tofill.append readnl + + assert("\n"==prevchar) end + + assert("\n"==prevchar) } + - assert(eof? || "\r\n"[prevchar]) + str=tofill.string + str.bs_handler||=:dquote_esc_seq if str.elems.size==1 and str.elems.first=='' tofill.unsafe_to_use=false - tofill.line=@linenum-1 + assert str.bs_handler + #?? or tofill.string.elems==[] + + + tofill.string.instance_eval{@char="`"} if tofill.quote=="`" + #special cased, but I think that's all that's necessary... - @moretokens.push \ - tofill.bodyclass.new(tofill), - FileAndLineToken.new(@filename,@linenum,input_position) - end - + result=tofill.bodyclass.new(tofill,linecount) + result.open=str.open="" + tofill.close=close + result.close=str.close=close[1..-1] + result.offset=str.offset + assert str.open + assert str.close + return result end #----------------------------------- def newline(ch) assert("\r\n"[nextchar.chr]) - - #ordinary newline handling (possibly implicitly escaped) assert("\r\n"[nextchar.chr]) assert !@parsestack.empty? assert @moretokens.empty? - result=if NewlineToken===@last_operative_token or #hack - @last_operative_token===/^(;|begin|do|#{INNERBOUNDINGWORDS})$/ or #hack - !after_nonid_op?{false} - then #hack-o-rama: probly cases left out above - a= abort_noparens! - ExpectDoOrNlContext===@parsestack.last and @parsestack.pop - assert !@parsestack.empty? - @parsestack.last.see self,:semi - a << super(ch) - @moretokens.replace a+@moretokens - @moretokens.shift - else - offset= input_position - nl=readnl - @moretokens << FileAndLineToken.new(@filename,@linenum,input_position) - EscNlToken.new(@filename,@linenum-1,nl,offset) - #WsToken.new ' ' #why? #should be "\\\n" ? - end + pre=FileAndLineToken.new(@filename,@linenum+1,input_position) + pre.allow_ooo_offset=true - optional_here_bodies + if NewlineToken===@last_operative_token or #hack + (KeywordToken===@last_operative_token and + @last_operative_token.ident=="rescue" and + !@last_operative_token.infix?) or + #/^(;|begin|do|#{INNERBOUNDINGWORDS})$/ or #hack + !after_nonid_op?{false} + then #hack-o-rama: probly cases left out above + @offset_adjust=@min_offset_adjust + a= abort_noparens! + ExpectDoOrNlContext===@parsestack.last and @parsestack.pop + assert !@parsestack.empty? + @parsestack.last.see self,:semi + a << super(ch) + @moretokens.replace a+@moretokens + else + @offset_adjust=@min_offset_adjust + offset= input_position + nl=readnl + @moretokens.push EscNlToken.new(@filename,@linenum-1,nl,offset), + FileAndLineToken.new(@filename,@linenum,input_position) + end + + #optimization: when thru with regurgitated text from a here document, + #revert back to original unadorned Sequence instead of staying in the list. + if @base_file and indices=@file.instance_eval{@start_pos} and + (indices[-2]..indices[-1])===@file.pos and Sequence::SubSeq===@file.list.last + @base_file.pos=@file.pos + @file=@base_file + @base_file=nil + end + + fal=@moretokens.last + assert FileAndLineToken===fal + + @offset_adjust=@min_offset_adjust + + @moretokens.unshift(*optional_here_bodies) + result=@moretokens.shift + + #adjust line count in fal to account for newlines in here bodys + i=@moretokens.size-1 + while(i>=0) + #assert FileAndLineToken===@moretokens[i] + i-=1 if FileAndLineToken===@moretokens[i] + break unless HereBodyToken===@moretokens[i] + pre_fal=true + fal.line-=@moretokens[i].linecount + + i-=1 + end + + if pre_fal + @moretokens.unshift result + pre.offset=result.offset + result=pre + end start_of_line_directives return result end @@ -1422,19 +1909,20 @@ startpos= input_position more= read(EQBEGINLENGTH-1) #get =begin begin eof? and raise "eof before =end" - more<<til_charset(/[\r\n]/) - more<<readnl + more<< til_charset(/[\r\n]/) + eof? and raise "eof before =end" + more<< readnl end until readahead(EQENDLENGTH)==EQEND #read rest of line after =end more << til_charset(/[\r\n]/) - assert((?\r===nextchar or ?\n===nextchar)) + assert((eof? or ?\r===nextchar or ?\n===nextchar)) assert !(/[\r\n]/===more[-1,1]) - more<< readnl + more<< readnl unless eof? # newls= more.scan(/\r\n?|\n\r?/) # @linenum+= newls.size #inject the fresh comment into future token results @@ -1443,11 +1931,11 @@ end #handle __END__ if ENDMARKER===readahead(ENDMARKERLENGTH) assert !(ImplicitContext===@parsestack.last) - @moretokens.unshift endoffile_detected(read(7)) + @moretokens.unshift endoffile_detected(read(ENDMARKERLENGTH)) # input_position_set @file.size end end @@ -1458,28 +1946,25 @@ #returns whether current token is to be the start of a literal IDBEGINCHAR=/^[a-zA-Z_$@]/ def unary_op_expected?(ch) #yukko hack '*&='[readahead(2)[1..1]] and return false + return true if KeywordToken===@last_operative_token and @last_operative_token==='for' + after_nonid_op? { #possible func-call as operator not is_var_name? and - WHSPLF[prevchar] + WHSPLF[prevchar] and !WHSPLF[readahead(2)[1..1]] } end #----------------------------------- #used to resolve the ambiguity of # <<, %, ? in ruby #returns whether current token is to be the start of a literal def quote_expected?(ch) #yukko hack - if AssignmentContext===@parsestack.last - @parsestack.pop - return false - end - case ch[0] when ?? then readahead(2)[/^\?[#{WHSPLF}]$/o] #not needed? when ?% then readahead(3)[/^%([a-pt-vyzA-PR-VX-Z]|[QqrswWx][a-zA-Z0-9])/] when ?< then !readahead(4)[/^<<-?['"`a-z0-9_]/i] else raise 'unexpected ch (#{ch}) in quote_expected?' @@ -1498,21 +1983,27 @@ #returns false if last token was an value, true if it was an operator. #returns what block yields if last token was a method name. #used to resolve the ambiguity of # <<, %, /, ?, :, and newline (among others) in ruby def after_nonid_op? + + #this is how it should be, I think, and then no handlers for methnametoken and FUNCLIKE_KEYWORDS are needed +# if ImplicitParamListStartToken===@last_token_including_implicit +# huh return true +# end case @last_operative_token - when MethNameToken, FUNCLIKE_KEYWORDS.token_pat ,VarNameToken + when VarNameToken , MethNameToken, FUNCLIKE_KEYWORDS.token_pat #VarNameToken should really be left out of this case... #should be in next branch instread #callers all check for last token being not a variable if they pass anything - #but {false} in the block + #but {false} in the block + #(hmmm... some now have true or other non-varname checks in them... could these be bugs?) return yield when StringToken, SymbolToken, NumberToken, HerePlaceholderToken, %r{^( - class|module|end|self|true|false|nil| - __FILE__|__LINE__|[\})\]]|alias|(un)?def|for + end|self|true|false|nil| + __FILE__|__LINE__|[\})\]] )$}x.token_pat #dunno about def/undef #maybe class/module shouldn't he here either? #for is also in NewlineToken branch, below. #what about rescue? @@ -1520,21 +2011,20 @@ when /^(#{RUBYOPERATORREX}|#{INNERBOUNDINGWORDS}|do)$/o.token_pat #regexs above must match whole string #assert(@last_operative_token==$&) #disabled 'cause $& is now always nil :( return true when NewlineToken, nil, #nil means we're still at beginning of file - /^([({\[]|or|not|and|if|unless|then|elsif|else| - while|until|begin|for|in|case|when|ensure)$ + /^([({\[]|or|not|and|if|unless|then|elsif|else|class|module|def| + while|until|begin|for|in|case|when|ensure|defined\?)$ /x.token_pat return true - #when KeywordToken - # return true + when KeywordToken + return true if /^(alias|undef)$/===@last_operative_token.ident #is this ever actually true??? when IgnoreToken raise "last_operative_token shouldn't be ignoreable" - else - raise "after_nonid_op? after #{@last_operative_token}:#{@last_operative_token.class} -- now what" end + raise "after_nonid_op? after #{@last_operative_token}:#{@last_operative_token.class} -- now what" end @@ -1575,14 +2065,14 @@ end #----------------------------------- def biop(ch) #match /%=?/ (% or %=) - assert(ch[/^[%^~]$/]) + assert(ch[/^[%^]$/]) result=getchar if eat_next_if(?=) - result <<?= + result << ?= end return operator_or_methname_token( result) end #----------------------------------- def tilde(ch) #match ~ @@ -1608,31 +2098,31 @@ #match /[+\-]=?/ (+ or +=) #could be beginning of number, too #fixme: handle +@ and -@ here as well... (currently, this is done in symbol()?) def plusminus(ch) assert(/^[+\-]$/===ch) - if unary_op_expected?(ch) + if unary_op_expected?(ch) or + KeywordToken===@last_operative_token && + /^(return|break|next)$/===@last_operative_token.ident if (?0..?9)===readahead(2)[1] return number(ch) else #unary operator result=getchar WHSPLF[nextchar.chr] or @moretokens << NoWsToken.new(input_position) result=(operator_or_methname_token result) result.unary=true - #todo: result should distinguish unary+binary +- end else #binary operator assert(! want_op_name) result=getchar if eat_next_if(?=) result << ?= end result=(operator_or_methname_token result) - #todo: result should distinguish unary+binary +- end - result + return result end #----------------------------------- def equals(ch) #match /=(>|~|==?)?/ (= or == or =~ or === or =>) offset= input_position @@ -1640,23 +2130,35 @@ assert str=='=' c=(eat_next_if(/[~=>]/)or'') str << c result= operator_or_methname_token( str,offset) case c - when '=': str<< (eat_next_if(?=)or'') + when '=': #===,== + str<< (eat_next_if(?=)or'') - when '>': + when '>': #=> unless ParamListContextNoParen===@parsestack.last @moretokens.unshift result @moretokens.unshift( *abort_noparens!("=>")) result=@moretokens.shift end @parsestack.last.see self,:arrow - when '': #record local variable definitions - + when '': #plain assignment: record local variable definitions + last_context_not_implicit.lhs=false + @moretokens.push *ignored_tokens(true).map{|x| + NewlineToken===x ? EscNlToken.new(@filename,@linenum,x.ident,x.offset) : x + } @parsestack.push AssignmentRhsContext.new(@linenum) - @moretokens.unshift AssignmentRhsListStartToken.new( offset+1) + if eat_next_if ?* + tok=OperatorToken.new('*', input_position-1) + tok.unary=true + @moretokens.push tok + WHSPLF[nextchar.chr] or + @moretokens << NoWsToken.new(input_position) + comma_in_lvalue_list? #is this needed? + end + @moretokens.push AssignmentRhsListStartToken.new( input_position) end return result end #----------------------------------- @@ -1664,10 +2166,11 @@ assert nextchar==?! result=getchar k=eat_next_if(/[~=]/) if k result+=k + elsif eof?: #do nothing else WHSPLF[nextchar.chr] or @moretokens << NoWsToken.new(input_position) end return KeywordToken.new(result, input_position-result.size) @@ -1691,34 +2194,35 @@ return result end #----------------------------------- def dot_rhs(prevtok) safe_recurse { |a| - @last_operative_token=prevtok + set_last_token prevtok aa= ignored_tokens + was=after_nonid_op?{true} tok,pos=callsite_symbol(prevtok) - tok and aa.push(*var_or_meth_name(tok,prevtok,pos)) + tok and aa.push(*var_or_meth_name(tok,prevtok,pos,was)) a.unshift(*aa) } end #----------------------------------- def back_quote(ch=nil) if @last_operative_token===/^(def|::|\.)$/ oldpos= input_position - MethNameToken.new(eat_next_if(?`), oldpos) + MethNameToken.new(eat_next_if(?`), oldpos) #` else double_quote(ch) end end if false #----------------------------------- def comment(str) result="" #loop{ - result<<super(nil).to_s + result<< super(nil).to_s if /^\#.*\#$/===result #if comment was ended by a crunch #that's not a legal comment end in ruby, so just keep reading assert(result.to_s[-1]==?#) @@ -1760,11 +2264,11 @@ if (VarNameToken===lasttok or ImplicitParamListEndToken===lasttok or MethNameToken===lasttok) and !WHSPCHARS[lastchar] @moretokens << (tokch) tokch= NoWsToken.new(input_position-1) end when '(' - lasttok=last_operative_token + lasttok=last_token_maybe_implicit #last_operative_token #could be: lasttok===/^[a-z_]/i if (VarNameToken===lasttok or MethNameToken===lasttok or lasttok===FUNCLIKE_KEYWORDS) unless WHSPCHARS[lastchar] @moretokens << tokch @@ -1779,19 +2283,21 @@ #check if we are in a hash literal or string inclusion (#{}), #in which case below would be bad. if after_nonid_op?{false} or @last_operative_token.has_no_block? @parsestack.push ListImmedContext.new(ch,@linenum) #that is, a hash else + #abort_noparens! tokch.set_infix! -=begin not needed now, i think + tokch.as="do" +#=begin not needed now, i think # 'need to find matching callsite context and end it if implicit' lasttok=last_operative_token - unless lasttok===')' and lasttok.callsite? + if !(lasttok===')' and lasttok.callsite?) #or ParamListContextNoParen===parsestack.last @moretokens.push *(abort_1_noparen!(1).push tokch) tokch=@moretokens.shift end -=end +#=end localvars.start_block @parsestack.push BlockContext.new(@linenum) block_param_list_lookahead end @@ -1809,38 +2315,48 @@ lexerror kw,"unmatched brace: #{ch}" return @moretokens.shift end ctx=@parsestack.pop origch,line=ctx.starter,ctx.linenum - ch==PAIRS[origch] or + if ch!=PAIRS[origch] + #kw.extend MismatchedBrace lexerror kw,"mismatched braces: #{origch}#{ch}\n" + "matching brace location", @filename, line - BlockContext===ctx and localvars.end_block + end + if BlockContext===ctx + localvars.end_block + @moretokens.last.as="end" + end if ParamListContext==ctx.class assert ch==')' - #kw.set_callsite! #not needed? + kw.set_callsite! #not needed? end return @moretokens.shift end #----------------------------------- def eof(ch=nil) #this must be the very last character... oldpos= input_position - assert(?\0==getc) + assert(/\A[\x0\x4\x1a]\Z/===nextchar.chr) - result= "\0#{ignored_tokens(true).delete_if{|t|FileAndLineToken===t}}" + result=@file.read! +# result= "\0#{ignored_tokens(true).delete_if{|t|FileAndLineToken===t}}" - eof? or - lexerror result,'nul character is not at the end of file' - input_position_set @file.size +# eof? or +# lexerror result,'nul character is not at the end of file' +# input_position_set @file.size return(endoffile_detected result) end #----------------------------------- def endoffile_detected(s='') @moretokens.push( *(abort_noparens!.push super(s))) + if @progress_thread + @progress_thread.kill + @progress_thread=nil + end result= @moretokens.shift balanced_braces? or (lexerror result,"unbalanced braces at eof. parsestack=#{@parsestack.inspect}") result end @@ -1849,11 +2365,30 @@ KeywordToken.new super(ch), input_position-1 end #----------------------------------- def comma(ch) - single_char_token(ch) + @moretokens.push token=single_char_token(ch) + if AssignmentRhsContext===@parsestack[-1] and + ParamListContext===@parsestack[-2] || + ParamListContextNoParen===@parsestack[-2] || + WhenParamListContext===@parsestack[-2] || + (RescueSMContext===@parsestack[-2] && @parsestack[-2].state==:rescue) || + (DefContext===@parsestack[-2] && !@parsestack[-2].in_body) + @parsestack.pop + @moretokens.unshift AssignmentRhsListEndToken.new(input_position) + end + token.comma_type= + case @parsestack[-1] + when AssignmentRhsContext: :rhs + when ParamListContext,ParamListContextNoParen: :call + when ListImmedContext: :array + else + :lhs if comma_in_lvalue_list? + end + @parsestack.last.see self,:comma + return @moretokens.shift end #----------------------------------- def semicolon(ch) assert @moretokens.empty? @@ -1870,22 +2405,20 @@ #----------------------------------- def operator_or_methname_token(s,offset=nil) assert RUBYOPERATORREX===s if RUBYNONSYMOPERATORREX===s KeywordToken - elsif @last_operative_token===/^(\.|::|def|undef|alias|defined\?)$/ + elsif want_op_name MethNameToken else OperatorToken end.new(s,offset) end #----------------------------------- #tokenify_results_of :identifier save_offsets_in(*CHARMAPPINGS.values.uniq-[ - :symbol_or_op,:open_brace,:whitespace,:exclam,:backquote - - + :symbol_or_op,:open_brace,:whitespace,:exclam,:backquote ]) #save_offsets_in :symbol end