=begin copyright rubylexer - a ruby lexer written in ruby Copyright (C) 2004,2005 Caleb Clausen This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA =end require "rulexer" require "symboltable" require "io.each_til_charset" require "context.rb" #----------------------------------- class RubyLexer < RuLexer include NestedContexts RUBYSYMOPERATORREX= %r{^([&|^/%~]|=(==?|~)|>[=>]?|<(<|=>?)?|[+\-]@?|\*\*?|\[\]=?)} # (nasty beastie, eh?) #these are the overridable operators #does not match flow-control operators like: || && ! or and if not #or op= ops like: += -= ||= #or .. ... 
?: #for that use: RUBYNONSYMOPERATORREX= %r{^([%^~/\-+]=|(\|\|?|&&?)=?|(<<|>>|\*\*?)=|\.{1,3}|[?:,;]|=>?|![=~]?)$} RUBYOPERATORREX=/#{RUBYSYMOPERATORREX}|#{RUBYNONSYMOPERATORREX}/o UNSYMOPS=/^[~!]$/ #always unary UBSYMOPS=/^([*&+-]|::)$/ #ops that could be unary or binary WHSPCHARS=WHSPLF+"\\#" OPORBEGINWORDS="(if|unless|while|until)" BEGINWORDS=/^(def|class|module|begin|for|case|do|#{OPORBEGINWORDS})$/o FUNCLIKE_KEYWORDS=/^(break|next|redo|return|raise|yield|defined\?|retry|super|BEGIN|END)$/ VARLIKE_KEYWORDS=/^(__FILE__|__LINE__|false|nil|self|true)$/ INNERBOUNDINGWORDS="(else|elsif|ensure|in|then|rescue|when)" BINOPWORDS="(and|or)" NEVERSTARTPARAMLISTWORDS=/^(#{OPORBEGINWORDS}|#{INNERBOUNDINGWORDS}|#{BINOPWORDS}|end)[^a-zA-Z0-9_!?=]?/o NEVERSTARTPARAMLISTFIRST=CharSet[%[aoeitrwu]] #char set that begins NEVERSTARTPARAMLIST NEVERSTARTPARAMLISTMAXLEN=7 #max len of a NEVERSTARTPARAMLIST RUBYKEYWORDS=%r{ ^(alias|#{BINOPWORDS}|not|undef|__END__|end| #{VARLIKE_KEYWORDS}|#{FUNCLIKE_KEYWORDS}| #{INNERBOUNDINGWORDS}|#{BEGINWORDS} )$ }xo CHARMAPPINGS = { ?$ => :dollar_identifier, ?@ => :at_identifier, ?a..?z => :identifier, ?A..?Z => :identifier, ?_ => :identifier, ?0..?9 => :number, ?" => :double_quote, ?' => :single_quote, ?` => :back_quote, WHSP => :whitespace, #includes \r ?, => :comma, ?; => :semicolon, ?^ => :biop, ?~ => :tilde, ?= => :equals, ?! => :exclam, ?. => :dot, #these ones could signal either an op or a term ?/ => :regex_or_div, "|>" => :quadriop, "*&" => :star_or_amp, #could be unary "+-" => :plusminus, #could be unary ?< => :lessthan, ?% => :percent, ?? 
=> :char_literal_or_op, #single-char int literal ?: => :symbol_or_op, ?\n => :newline, #implicitly escaped after op #?\r => :newline, #implicitly escaped after op ?\\ => :escnewline, ?\0 => :eof, "[({" => :open_brace, "])}" => :close_brace, ?# => :comment } attr :incomplete_here_tokens #----------------------------------- def initialize(filename,file,linenum=1) super(filename,file, linenum) @start_linenum=linenum @bracestack=[TopLevelContext.new] @incomplete_here_tokens=[] @localvars=SymbolTable.new @defining_lvar=nil @toptable=CharHandler.new(self, :illegal_char, CHARMAPPINGS) start_of_line_directives end #----------------------------------- def get1token result=super #most of the action's here #now cleanup and housekeeping #check for bizarre token types case result when IgnoreToken#,nil return result when Token#,String else raise "#{@filename}:#{linenum}:token is a #{result.class}, last is #{@last_operative_token}" end @last_operative_token=result return result end #----------------------------------- def balanced_braces? #@bracestack.empty? @bracestack.size==1 and TopLevelContext===@bracestack.first end #----------------------------------- def dollar_identifier(ch=nil) s=eat_next_if(?$) or return nil if t=((identifier_as_string(?$) or special_global)) s< :comment, "\n" => :newline, "\\" => :escnewline, "\s\t\v\r\f" => :whitespace ) #tok=nil while tok=@whsphandler.go((nextchar or return result)) block_given? and NewlineToken===tok and yield tok result << tok end end return result end #----------------------------------- def safe_recurse old_moretokens=@moretokens #old_bracestack=@bracestack.dup @moretokens=[] result= yield @moretokens #assert @incomplete_here_tokens.empty? #assert @bracestack==old_bracestack @moretokens= old_moretokens.concat @moretokens return result #need to do something with @last_operative_token? 
end #----------------------------------- def special_global #handle $-a and friends assert prevchar=='$' result = (( #order matters here, but it shouldn't #(but til_charset must be last) eat_next_if(/^[!@&+`'=~\/\\,.;<>*"$?:]$/) or (eat_next_if('-') and ("-"+getchar)) or (?0..?9)===nextchar ? til_charset(/[^\d]/) : nil )) end #----------------------------------- def identifier(context=nil) oldpos=@file.pos str=identifier_as_string(context) #skip keyword processing if 'escaped' as it were, by def, . or :: #or if in a non-bare context #just asserts because those contexts are never encountered. #control goes through symbol(<...>,nil) assert( /^[a-z_]$/i===context) assert !(@last_operative_token===/^(\.|::|(un)?def|alias)$/) @moretokens.unshift(*parse_keywords(str,oldpos) do #if not a keyword, case str when FUNCLIKE_KEYWORDS: #do nothing when VARLIKE_KEYWORDS,RUBYKEYWORDS: raise "shouldnt see keywords here, now" end safe_recurse { |a| var_or_meth_name(str,@last_operative_token,oldpos) } end) return @moretokens.shift end #----------------------------------- def identifier_as_string(context) #must begin w/ letter or underscore str=eat_next_if(/^[_a-z]$/i) or return nil #equals, question mark, and exclamation mark #might be allowed at the end in some contexts. #(in def headers and symbols) #otherwise, =,?, and ! are to be considered #separate tokens. confusing, eh? #i hope i've captured all right conditions.... #context should always be ?: right after def, ., and :: now maybe_eq,maybe_qm,maybe_ex = case context when ?@,?$ then [nil,nil,nil] when ?: then [?=, ??, ?!] else [nil,??, ?!] 
end str< back1char else str << b end else back1char end return str end #----------------------------------- #contexts in which comma may appear in ruby: #multiple lhs (terminated by assign op) #multiple rhs (in implicit context) (tbd) #method actual param list (in ( or implicit context) #method formal param list (in ( or implicit context) #block formal param list (in | context) (tbd) #hash immediate (in imm{ context) #array immediate (in imm[ context) #element reference/assignment (in [] or []= method actual parameter context) #list after for #list after rescue #list after when #list after undef #note: comma in parens not around a param list is illegal #----------------------------------- #a comma has been seen. are we in an #lvalue list or some other construct that uses commas? def comma_in_lvalue_list? not ListContext===@bracestack.last end #----------------------------------- def in_lvar_define_state #@defining_lvar is a hack @defining_lvar or case ctx=@bracestack.last when ForSMContext: ctx.state==:for when RescueSMContext: ctx.state==:arrow when BlockParamListContext: true end end #----------------------------------- #determine if an alphabetic identifier refers to a variable #or method name. generates implicit parenthes(es) if it is a #call site and no explicit parens are present. starts an implicit param list #if appropriate. adds tok to the #local var table if its a local var being defined for the first time. #note: what we here call variables (rather, constants) following :: #might actually be methods at runtime, but that's immaterial to tokenization. #note: this routine should determine the correct token type for name and #create the appropriate token. currently this is not done because callers #typically have done it (perhaps incorrectly) already. 
def var_or_meth_name(name,lasttok,pos)
  #Classify the identifier +name+ as a variable or a method call and build
  #its token, plus any implicit parameter list tokens that go with it.
  #  name    - the identifier text (asserted to be a String)
  #  lasttok - the token that preceded the identifier; it is only tested
  #            with === against '.' and '::' in this method
  #  pos     - offset passed to the VarNameToken/MethNameToken constructor
  #Returns the array produced by ignored_tokens with the name token (and any
  #ImplicitParamListStartToken/ImplicitParamListEndToken) unshifted onto its
  #front.
  #Side effects: sets @last_operative_token, may record name in @localvars,
  #and may push a ParamListContextNoParen onto @bracestack.
  #
  #note: the ruby 1.8-only "when x:" and "if x :" separators formerly used
  #here were replaced by "then"/newline, which every ruby version accepts.
  #NOTE(review): the ?x literals below are Integers on ruby 1.8 but
  #one-character Strings on 1.9+; whether they still match depends on what
  #nextchar returns -- confirm before running this on a newer ruby.

  #look for call site if not a keyword or keyword is function-like
  #look for and ignore local variable names

  assert String===name

  #fixme: keywords shouldn't be treated specially after :: and .

  #maybe_local really means 'maybe local or constant'
  maybe_local=case name
    when /[^a-z_0-9]$/i
      nil  #do nothing (name ends in something other than a letter, digit or _)
    when /^[a-z_]/
      (@localvars===name or VARLIKE_KEYWORDS===name or in_lvar_define_state) and not lasttok===/^(\.|::)$/
    when /^[A-Z]/
      is_const=true
      not lasttok==='.'  #this is the right algorithm for constants...
  end

  assert(@moretokens.empty?)
  tok=@last_operative_token=VarNameToken.new(name,pos)

  #consume whatever ignored_tokens returns after the name, noting whether
  #the block was yielded to (i.e. a newline was reported)
  oldpos=@file.pos
  sawnl=false
  result=ws_toks=ignored_tokens(true) {|nl| sawnl=true }

  #newline or end of file right after the name: nothing more to look at
  if sawnl || @file.eof?
    toks=if maybe_local
      [tok]
    else
      [MethNameToken.new(name,pos), #insert implicit parens right after tok
       ImplicitParamListStartToken.new( oldpos),
       ImplicitParamListEndToken.new( oldpos) ]
    end
    return result.unshift(*toks)
  end

  #if next op is assignment (or comma in lvalue list)
  #then omit implicit parens
  assignment_coming=case nc=nextchar
    when ?= then not /^=[=~]$/===readahead(2)
    when ?, then comma_in_lvalue_list?
    when ?>,?< then /^([<>])\1=$/===readahead(3)
    when ?*,?|,?& then /^([*|&])\1?=/===readahead(3)
    when ?%,?/,?-,?+,?^ then readahead(2)[1..1]=='='
  end
  if (assignment_coming or in_lvar_define_state)
    tok=VarNameToken.new(name,pos)
    if /[^a-z_0-9]$/i===name
      lexerror tok,"not a valid variable name: #{name}"
    elsif /^[a-z_]/===name and !(lasttok===/^(\.|::)$/)
      #a lowercase name not following . or :: becomes a known local
      @localvars[name]=true
    end
    return result.unshift(tok)
  end

  #meaning of implicit_parens_to_emit, as used by the code below:
  #  0   emit no implicit param list tokens
  #  2   emit start+end tokens (empty list), unless name may be a local
  #  1,3 emit only the start token and push a ParamListContextNoParen;
  #      3 yields to a possible local var (or becomes 1 for a constant),
  #      1 always makes the name a method
  implicit_parens_to_emit=case nc
    when ?! then readahead(2)=='!=' ? 2 : 1
    when NEVERSTARTPARAMLISTFIRST
      (NEVERSTARTPARAMLISTWORDS===readahead(NEVERSTARTPARAMLISTMAXLEN)) ? 2 : 1
    when ?",?',?`,?a..?z,?A..?Z,?0..?9,?_,?@,?$,?~ then 1
    when ?{
      maybe_local=false
      2
    when ?(
      maybe_local=false
      0
    when ?},?],?),?;,?^, ?|, ?>, ?,, ?., ?= then 2
    when ?+, ?-, ?*, ?&, ?%, ?/, ?:, ??
      (ws_toks.empty? || readahead(2)[/^.[#{WHSPLF}]/]) ? 2 : 3
    when ?<
      (ws_toks.empty? || readahead(3)[/^<<[^"'`a-zA-Z_0-9-]/]) ? 2 : 3
    when ?[
      ws_toks.empty? ? 2 : 3
    when ?\\, ?\s, ?\t, ?\n, ?\r, ?\v, ?#
      raise 'failure'
    else
      raise "unknown char after ident: #{nextchar.chr}"
  end

  implicit_parens_to_emit==3 and is_const and implicit_parens_to_emit=1

  tok=if maybe_local and implicit_parens_to_emit>=2
    implicit_parens_to_emit=0
    VarNameToken
  else
    MethNameToken
  end.new(name,pos)

  case implicit_parens_to_emit
  when 2
    result.unshift ImplicitParamListStartToken.new(oldpos),
                   ImplicitParamListEndToken.new(oldpos)
  when 1,3
    result.unshift ImplicitParamListStartToken.new(oldpos)
    @bracestack.push ParamListContextNoParen.new(@linenum)
  when 0 #do nothing
  else raise 'invalid value of implicit_parens_to_emit'
  end
  return result.unshift(tok)
  # 'ok:'
  # 'if unless while until {'
  # '\n (unescaped) and or'
  # 'then else elsif rescue ensure (illegal in value context)'

  # 'need to pop noparen from bracestack on these tokens: (in operator context)'
  # 'not ok:'
  # 'not (but should it be?)'
end

#-----------------------------------
#maps each context class that abort_noparens! closes on its own
#to the class of the end token emitted when it is closed
CONTEXT2ENDTOK={AssignmentRhsContext=>AssignmentRhsListEndToken,
                ParamListContextNoParen=>ImplicitParamListEndToken,
                KwParamListContext=>KwParamListEndToken
               }

#pop contexts off @bracestack for as long as the class of the topmost one
#is a key of CONTEXT2ENDTOK, creating the matching end token for each.
#str is the text of the token that triggered this; the end tokens are
#positioned at @file.pos-str.length.
#returns the array of end tokens created (empty if nothing was popped).
def abort_noparens!(str='')
  #assert @moretokens.empty?
  result=[]
  while klass=CONTEXT2ENDTOK[@bracestack.last.class]
    result << klass.new(@file.pos-str.length)
    @bracestack.pop
  end
  return result
end

if false #no longer used
#-----------------------------------
#dead code (never defined, because of the enclosing "if false"):
#pops any AssignmentRhsContexts and then exactly one ParamListContextNoParen
def abort_1_noparen!(offs=0)
  assert @moretokens.empty?
  result=[]
  while AssignmentRhsContext===@bracestack.last
    @bracestack.pop
    result << AssignmentRhsListEndToken.new(@file.pos-offs)
  end
  ParamListContextNoParen===@bracestack.last or lexerror huh,'{} with no matching callsite'
  @bracestack.pop
  result << ImplicitParamListEndToken.new(@file.pos-offs)
  return result
end
end

#-----------------------------------
#parse keywords now, to prevent confusion over bare symbols
#and match end with corresponding preceding def or class or whatever.
#if arg is not a keyword, the block is called def parse_keywords(str,offset) assert @moretokens.empty? result=[KeywordToken.new(str,offset)] case str when "end" result.unshift(*abort_noparens!(str)) @bracestack.last.see @bracestack,:semi #sorta hacky... should make an :end event instead? =begin not needed? if ExpectDoOrNlContext===@bracestack.last @bracestack.pop assert @bracestack.last.starter[/^(while|until|for)$/] end =end WantsEndContext===@bracestack.last or lexerror result.last, 'unbalanced end' ctx=@bracestack.pop start,line=ctx.starter,ctx.linenum BEGINWORDS===start or lexerror result.last, "end does not match #{start or "nil"}" /^(class|module|def|do)$/===start and @localvars.end_block when "class","module" result.first.has_end! @bracestack.push WantsEndContext.new(str,@linenum) @localvars.start_block when "if","unless" #could be infix form without end if after_nonid_op?{false} #prefix form result.first.has_end! @bracestack.push WantsEndContext.new(str,@linenum) else #infix form result.unshift(*abort_noparens!(str)) end when "begin","case" result.first.has_end! @bracestack.push WantsEndContext.new(str,@linenum) when "while","until" #could be infix form without end if after_nonid_op?{false} #prefix form result.first.has_end! @bracestack.push WantsEndContext.new(str,@linenum) expect_do_or_end_or_nl! str else #infix form result.unshift(*abort_noparens!(str)) end when "for" result.first.has_end! @bracestack.push WantsEndContext.new(str,@linenum) #expect_do_or_end_or_nl! str #handled by ForSMContext now @bracestack.push ForSMContext.new(@linenum) when "do" result.unshift(*abort_noparens!(str)) if ExpectDoOrNlContext===@bracestack.last @bracestack.pop assert WantsEndContext===@bracestack.last else result.last.has_end! @bracestack.push WantsEndContext.new(str,@linenum) @localvars.start_block block_param_list_lookahead end when "def" result.first.has_end! 
@bracestack.push WantsEndContext.new("def",@linenum) @localvars.start_block safe_recurse { |aa| @last_operative_token=KeywordToken.new "def" #hack result.concat ignored_tokens #read an expr like a.b.c or a::b::c #or (expr).b.c if nextchar==?( #look for optional parenthesised head old_size=@bracestack.size parencount=0 begin tok=get1token case tok when/^\($/.token_pat then parencount+=1 when/^\)$/.token_pat then parencount-=1 end EoiToken===tok and lexerror tok, "eof in def header" result<=@bracestack.size and break #next token is a local var name #(or the one after that if unary ops present) #result.concat ignored_tokens expect_name and case tok when IgnoreToken#, /^[A-Z]/ #do nothing when VarNameToken assert@defining_lvar @defining_lvar=false assert((not @last_operative_token===',')) when /^[&*]$/.token_pat #unary form... #a NoWsToken is also expected... read it now result.concat maybe_no_ws_token #not needed? @last_operative_token=KeywordToken.new ',' else lexerror tok,"unfamiliar var name '#{tok}'" end end @defining_lvar=false assert(@bracestack.size <= old_bracestack_size) assert(endingblock[tok]) #hack: force next token to look like start of a #new stmt, if the last ignored_tokens #call above did not find a newline #(just in case the next token parsed #happens to call quote_expected? or after_nonid_op) result.concat ignored_tokens if nextchar.chr[/[iuw\/<|>+\-*&%?:]/] and !(NewlineToken===@last_operative_token) and !(/^(end|;)$/===@last_operative_token) @last_operative_token=KeywordToken.new ';' result<< get1token end } return result end #----------------------------------- #handle % in ruby code. is it part of fancy quote or a modulo operator? def percent(ch) if quote_expected? ch fancy_quote ch else biop ch end end #----------------------------------- #handle * in ruby code. is unary or binary operator? def star_or_amp(ch) assert('*&'[ch]) if unary_op_expected? ch #readahead(2)[1..1][/[\s\v#\\]/] or #not needed? 
result=operator_or_methname_token getchar WHSPLF[nextchar.chr] or @moretokens << NoWsToken.new(@file.pos) return result else return(quadriop ch) end #result should distinguish unary+binary *& end #----------------------------------- #handle ? in ruby code. is it part of ?..: or a character literal? def char_literal_or_op(ch) if colon_quote_expected? ch getchar NumberToken.new getchar_maybe_escape else @bracestack.push TernaryContext.new(@linenum) KeywordToken.new getchar #operator end end #----------------------------------- def regex_or_div(ch) #space after slash always means / operator, rather than regex start if after_nonid_op?{ !is_var_name? and WHSPLF[prevchar] and !readahead(2)[%r{^/\s}] } return regex(ch) else #/ is operator result=getchar if eat_next_if(?=) result << '=' end return(operator_or_methname_token result) end end #----------------------------------- #return true if tok corresponds to a variable or constant, false if its for a method, nil for something else #we assume tok is a valid token with a correctly formed name. #...should really be called was_var_name def is_var_name? (tok=@last_operative_token) s=tok.to_s case s when /[^a-z_0-9]$/i: false when /^[a-z_]/: @localvars===s or VARLIKE_KEYWORDS===s when /^[A-Z]/: VarNameToken===tok when /^[@$<]/: true else raise "not var or method name: #{s}" end end #----------------------------------- def colon_quote_expected?(ch) #yukko hack assert ':?'[ch] readahead(2)[/^(\?[^#{WHSPLF}]|:[$@a-zA-Z_'"`\[*~+\-\/%<=>&|^])$/o] or return false after_nonid_op? { #possible func-call as operator !is_var_name? } end #----------------------------------- def symbol_or_op(ch) startpos=@file.pos qe= colon_quote_expected?(ch) lastchar=prevchar eat_next_if(ch) or raise "needed: "+ch #handle quoted symbols like :"foobar", :"[]" qe and return symbol(':') #look for another colon; return single : if not found unless eat_next_if(?:) #cancel implicit contexts... 
@moretokens.push(*abort_noparens!(':')) #end ternary context, if any @bracestack.last.see @bracestack,:colon TernaryContext===@bracestack.last and @bracestack.pop #should be in the context's see handler if ExpectDoOrNlContext===@bracestack.last #should be in the context's see handler @bracestack.pop assert @bracestack.last.starter[/^(while|until|for)$/] end @moretokens.push KeywordToken.new(':',startpos) return @moretokens.shift end #we definately found a :: colon2=KeywordToken.new( '::',startpos) lasttok=@last_operative_token assert !(String===lasttok) if (VarNameToken===lasttok or MethNameToken===lasttok) and lasttok===/^[$@a-zA-Z_]/ and !WHSPCHARS[lastchar] then @moretokens << colon2 result= NoWsToken.new(startpos) else result=colon2 end dot_rhs(colon2) return result end #----------------------------------- def symbol(notbare,couldbecallsite=!notbare) assert !couldbecallsite start=@file.pos notbare and start-=1 klass=(notbare ? SymbolToken : MethNameToken) #look for operators opmatches=readahead(3)[RUBYSYMOPERATORREX] result= opmatches ? @file.read(opmatches.size) : case nc=nextchar when ?" then assert notbare;double_quote('"') when ?' then assert notbare;double_quote("'") when ?` then @file.read(1) when ?@ then at_identifier.to_s when ?$ then dollar_identifier.to_s when ?_,?a..?z then identifier_as_string(?:) when ?A..?Z then result=identifier_as_string(?:) if @last_operative_token==='::' assert klass==MethNameToken /[A-Z_0-9]$/i===result and klass=VarNameToken end result else error= "unexpected char starting symbol: #{nc.chr}" end return lexerror(klass.new(result,start),error) end #----------------------------------- def callsite_symbol(tok_to_errify) start=@file.pos #look for operators opmatches=readahead(3)[RUBYSYMOPERATORREX] return [opmatches ? 
@file.read(opmatches.size) : case nc=nextchar when ?` then @file.read(1) when ?_,?a..?z,?A..?Z then identifier_as_string(?:) else @last_operative_token=KeywordToken.new(';') lexerror(tok_to_errify,"unexpected char starting symbol: #{nc.chr}") nil end, start ] end #----------------------------------- def here_header @file.read(2)=='<<' or raise "parser insanity" dash=eat_next_if(?-) quote=eat_next_if( /^['"`]$/) if quote ender=til_charset(/[#{quote}]/) (quote==getchar) or return lexerror(HerePlaceholderToken.new( dash, quote, ender ), "mismatched quotes in here doc") else quote='"' ender=til_charset(/[^a-zA-Z0-9_]/) ender.length >= 1 or return lexerror(HerePlaceholderToken.new( dash, quote, ender ), "invalid here header") end res= HerePlaceholderToken.new( dash, quote, ender ) @incomplete_here_tokens.push res #hack: normally this should just be in get1token #this fixup is necessary because the call the get1token below #makes a recursion. @last_operative_token=res safe_recurse { |a| assert(a.object_id==@moretokens.object_id) toks=[] begin #yech. #handle case of here header in a string inclusion, but #here body outside it. cnt=0 1.upto @bracestack.size do |i| case @bracestack[-i] when AssignmentRhsContext,ParamListContextNoParen,TopLevelContext else cnt+=1 end end if nextchar==?} and cnt==1 res.bodyclass=OutlinedHereBodyToken break end tok=get1token assert(a.object_id==@moretokens.object_id) toks<" then operator_or_methname_token @file.read(3) else quadriop(ch) end end #----------------------------------- def escnewline(ch) assert ch == '\\' pos=@file.pos result=getchar if nl=readnl result+=nl else error='illegal escape sequence' end lexerror EscNlToken.new(@filename,@linenum,result,pos), error end #----------------------------------- def newline(ch) assert("\r\n"[nextchar.chr]) #handle here bodies queued up by previous line #(we should be more compatible with dos/mac style newlines...) 
if tofill=@incomplete_here_tokens.shift tofill.string.offset=@file.pos loop { assert("\r\n"[nextchar.chr]) #retr evrything til next nl line=all_quote(/^[\r\n]$/, tofill.quote, /^[\r\n]$/, :regex_esc_seq) #(you didn't know all_quote could take a regex, did you?) #get rid of fals that otherwise appear to be in the middle of #a string (and are emitted out of order) fal=@moretokens.pop assert FileAndLineToken===fal || fal.nil? back1char assert("\r\n"[nextchar.chr]) #matches terminating reg expr? break if line.elems.size==1 and line.elems[0][tofill.termex] tofill.append_token line tofill.append readnl back1char } assert("\r\n"[nextchar.chr]) tofill.unsafe_to_use=false return tofill.bodyclass.new(tofill) end #ordinary newline handling (possibly implicitly escaped) assert("\r\n"[nextchar.chr]) assert @moretokens.empty? result=if NewlineToken===@last_operative_token or #hack @last_operative_token===/^(;|begin|do|#{INNERBOUNDINGWORDS})$/ or #hack !after_nonid_op?{false} then #hack-o-rama: probly cases left out above a= abort_noparens! ExpectDoOrNlContext===@bracestack.last and @bracestack.pop @bracestack.last.see @bracestack,:semi a << super(ch) @moretokens.replace a+@moretokens @moretokens.shift else offset=@file.pos #@moretokens << EscNlToken.new(@filename,@linenum,readnl,offset) #WsToken.new ' ' #why? #should be "\\\n" ? 
end start_of_line_directives return result end #----------------------------------- EQBEGIN=%r/^=begin[^a-zA-Z_0-9]$/ EQBEGINLENGTH=7 EQEND='=end' ENDMARKER=/^__END__[\r\n]$/ ENDMARKERLENGTH=8 def start_of_line_directives #handle =begin...=end (at start of a line) while EQBEGIN===readahead(EQBEGINLENGTH) startpos=@file.pos more=@file.read(EQBEGINLENGTH-1) #get =begin #keep reading til /\n=end.*\n/ @file.each(EQEND) {|cblock| more << cblock #must be at start of line break if /^[\r\n]#{EQEND}/o===readback(EQEND.length+1) } #read rest of line after =end more << @file.til_charset(/[\r\n]/) assert((?\r===nextchar or ?\n===nextchar)) assert !(/[\r\n]/===more[-1,1]) newls= more.scan(/\r\n?|\n\r?/) @linenum+= newls.size #inject the fresh comment into future token results @moretokens.push IgnoreToken.new(more,startpos) end #handle __END__ if ENDMARKER===readahead(ENDMARKERLENGTH) assert !(ImplicitContext===@bracestack.last) @moretokens.unshift endoffile_detected(@file.read(6)) @file.pos=@file.stat.size end end #----------------------------------- #used to resolve the ambiguity of # unary ops (+, -, *, &, ~ !) in ruby #returns whether current token is to be the start of a literal IDBEGINCHAR=/^[a-zA-Z_$@]/ def unary_op_expected?(ch) #yukko hack '*&='[readahead(2)[1..1]] and return false after_nonid_op? { #possible func-call as operator not is_var_name? and WHSPLF[prevchar] } end #----------------------------------- #used to resolve the ambiguity of # <<, %, ? in ruby #returns whether current token is to be the start of a literal #/ is not handled right here if whitespace immediately follows the / def quote_expected?(ch) #yukko hack case ch[0] when ?? then readahead(2)[/^\?[#{WHSPLF}]$/o] #not needed? when ?% then readahead(3)[/^%([a-ps-vyzA-PR-VX-Z]|[QqrwWx][a-zA-Z0-9])/] when ?< then !readahead(4)[/^<<-?['"`a-z0-9_]/i] else raise 'unexpected ch (#{ch}) in quote_expected?' # when ?+,?-,?&,?*,?~,?! then '*&='[readahead(2)[1..1]] end and return false after_nonid_op? 
{ #possible func-call as operator not is_var_name? and WHSPLF[prevchar] and not WHSPLF[readahead(2)[1..1]] } end #----------------------------------- #used to resolve the ambiguity of # <<, %, /, ?, :, and newline in ruby def after_nonid_op? case @last_operative_token when MethNameToken,VarNameToken, FUNCLIKE_KEYWORDS.token_pat return yield when StringToken, SymbolToken, NumberToken, HerePlaceholderToken, %r{^(class|module|do|end|self|true|false|nil| __FILE__|__LINE__|[\})\]]|alias|(un)?def|for )$}x.token_pat #do shouldn't be in above list... dunno about def/undef #maybe class/module shouldn't either? #for is also in NewlineToken branch, below. #what about rescue? return false when /^(#{RUBYOPERATORREX}|#{INNERBOUNDINGWORDS})$/o.token_pat #regexs above must match whole string #assert(@last_operative_token==$&) #disabled 'cause $& is now always nil :( return true when NewlineToken, nil, #nil means we're still at beginning of file /^([({\[]|or|not|and|if|unless|then|elsif|else| while|until|begin|for|in|case|when|ensure)$ /x.token_pat return true #when KeywordToken # return true when IgnoreToken raise "last_operative_token shouldn't be ignoreable" else raise "after_nonid_op? after #{@last_operative_token}:#{@last_operative_token.class} -- now what" end end #----------------------------------- def quadriop(ch) #match /&&?=?/ (&, &&, &=, or &&=) assert(%w[& * | < >].include?(ch)) # '&*'[ch] and qe=quote_expected?(ch) #not needed? result=getchar + (eat_next_if(ch)or'') if eat_next_if(?=) result << ?= # elsif qe and result[/^[&*]$/] #not needed? 
# @moretokens<|~|==?)?/ (= or == or =~ or === or =>) offset=@file.pos str=getchar assert str=='=' c=(eat_next_if(/^[~=>]$/)or'') str << c case c when '=': str<< (eat_next_if(?=)or'') when '>': @bracestack.last.see @bracestack,:arrow when '': #record local variable definitions @bracestack.push AssignmentRhsContext.new(@linenum) @moretokens.unshift AssignmentRhsListStartToken.new( offset+1) end return operator_or_methname_token( str,offset) end #----------------------------------- def exclam(ch) #match /![~=]?/ (! or != or !~) assert nextchar==?! result=getchar k=eat_next_if(/^[~=]$/) if k result+=k else WHSPLF[nextchar.chr] or @moretokens << NoWsToken.new(@file.pos) end return KeywordToken.new(result) #result should distinguish unary ! end #----------------------------------- def dot(ch) str='' eat_next_if(?.) or raise "lexer confusion" #three lumps of sugar or two? eat_next_if(?.) and return KeywordToken.new(eat_next_if(?.)? "..." : "..") #else saw just single . #match a valid ruby id after the dot result= KeywordToken.new( ".") dot_rhs(result) return result end #----------------------------------- def dot_rhs(prevtok) safe_recurse { |a| @last_operative_token=prevtok aa= ignored_tokens tok,pos=callsite_symbol(prevtok) tok and aa.push(*var_or_meth_name(tok,prevtok,pos)) a.unshift(*aa) } end #----------------------------------- def single_quote(ch=nil) double_quote(ch) end #----------------------------------- def back_quote(ch=nil) oldpos=@file.pos @last_operative_token===/^(def|::|\.)$/ and return MethNameToken.new( (eat_next_if(?`) or raise "insanity"), oldpos ) double_quote(ch) end #----------------------------------- def comment(str) result="" #loop{ result<