# coding: utf-8 require 'minjs/ctype' module Minjs class Lex include Ctype attr_reader :pos attr_reader :codes def initialize(str = "", options = {}) str = str.gsub(/\r\n/, "\n") @codes = str.codepoints if !str.match(/\n\z/) @codes.push(10) end @pos = 0 @lit_cache = [] @lit_nextpos = [] @logger = options[:logger] @eval_nest = 0 end def clear_cache @lit_cache = [] @lit_nextpos = [] end # # Fetch next literal # # hint: # :regexp # :div # nil # # ECMA262 says: # # There are no syntactic grammar contexts where both a leading division # or division-assignment, and a leading RegularExpressionLiteral are permitted. # This is not affected by semicolon insertion (see 7.9); in examples such as the following: # To determine `/' is regular expression or not # def next_input_element(hint) if ret = @lit_cache[@pos] @pos = @lit_nextpos[@pos] @head_pos = @pos return ret end pos0 = @pos # # skip white space here, because ECMA262(5.1.2) says: # # Simple white space and single-line comments are discarded and # do not appear in the stream of input elements for the # syntactic grammar. # while white_space or single_line_comment end ret = line_terminator || multi_line_comment || token if ret @lit_cache[pos0] = ret @lit_nextpos[pos0] = @pos @head_pos = @pos return ret end if @codes[@pos].nil? return nil end if hint.nil? ECMA262::LIT_DIV_OR_REGEXP_LITERAL elsif hint == :div ret = div_punctuator if ret @lit_cache[pos0] = ret @lit_nextpos[pos0] = @pos end @head_pos = @pos return ret elsif hint == :regexp ret = regexp_literal if ret @lit_cache[pos0] = ret @lit_nextpos[pos0] = @pos end @head_pos = @pos return ret else ECMA262::LIT_DIV_OR_REGEXP_LITERAL end end # 7.2 def white_space if white_space?(@codes[@pos]) begin @pos += 1 end until !white_space?(@codes[@pos]) return ECMA262::WhiteSpace.get else nil end end #7.3 def line_terminator if line_terminator?(@codes[@pos]) begin @pos += 1 end until !line_terminator?(@codes[@pos]) return ECMA262::LineFeed.get else nil end end #7.4 def comment multi_line_comment || single_line_comment end def multi_line_comment # /* if @codes[@pos] == 0x2f and @codes[@pos + 1] == 0x2a @pos += 2 pos0 = @pos # */ while (code = @codes[@pos] != 0x2a) or @codes[@pos + 1] != 0x2f raise ParseError.new("no `*/' at end of comment", self) if code.nil? @pos += 1 end @pos +=2 return ECMA262::MultiLineComment.new(@codes[pos0...(@pos-2)].pack("U*")) else nil end end def single_line_comment # // if @codes[@pos] == 0x2f and @codes[@pos + 1] == 0x2f @pos += 2 pos0 = @pos while (code = @codes[@pos]) and !line_terminator?(code) @pos += 1 end return ECMA262::SingleLineComment.new(@codes[pos0...@pos].pack("U*")) else nil end end # # 7.5 tokens # def token identifier_name || numeric_literal || punctuator || string_literal end # def unicode_escape? # @codes[@pos] == 0x5c if @codes[@pos+1] == 0x75 #u if hex_digit?(@codes[@pos+2]) and hex_digit?(@codes[@pos+3]) and hex_digit?(@codes[@pos+4]) and hex_digit?(@codes[@pos+5]) @codes[(@pos+2)..(@pos+5)].pack("U*").to_i(16) else raise ParseError.new("bad unicode escpae sequence", self) end else nil end end def identifier_name return nil if (code = @codes[@pos]).nil? pos0 = @pos chars = [] if code == 0x5c and ucode = unicode_escape? and identifier_start?(ucode) chars.push(ucode) @pos += 6 elsif identifier_start?(code) chars.push(code) @pos += 1 else return nil end while true code = @codes[@pos] if code == 0x5c and ucode = unicode_escape? and identifier_part?(ucode) chars.push(ucode) @pos += 6 elsif identifier_part?(code) chars.push(code) @pos += 1 else name = chars.pack("U*").to_sym return ECMA262::IdentifierName.get(nil, name) end end end def punctuator code0 = @codes[@pos] code1 = @codes[@pos+1] code2 = @codes[@pos+2] code3 = @codes[@pos+3] if code0 == 0x21 # ! if code1 == 0x3d and code2 == 0x3d # !== @pos += 3 return ECMA262::PUNC_SNEQ end if code1 == 0x3d # != @pos += 2 return ECMA262::PUNC_NEQ end @pos += 1 # ! return ECMA262::PUNC_LNOT elsif code0 == 0x25 # % if code1 == 0x3d # %= @pos += 2 return ECMA262::PUNC_MODLET end @pos += 1 # % return ECMA262::PUNC_MOD elsif code0 == 0x26 # & if code1 == 0x3d # &= @pos += 2 return ECMA262::PUNC_ANDLET end if code1 == 0x26 # && @pos += 2 return ECMA262::PUNC_LAND end @pos += 1 # & return ECMA262::PUNC_AND elsif code0 == 0x28 # ( @pos += 1 # ( return ECMA262::PUNC_LPARENTHESIS elsif code0 == 0x29 # ) @pos += 1 # ) return ECMA262::PUNC_RPARENTHESIS elsif code0 == 0x2a # * if code1 == 0x3d # *= @pos += 2 return ECMA262::PUNC_MULLET end @pos += 1 # * return ECMA262::PUNC_MUL elsif code0 == 0x2b # + if code1 == 0x3d # += @pos += 2 return ECMA262::PUNC_ADDLET end if code1 == 0x2b # ++ @pos += 2 return ECMA262::PUNC_INC end @pos += 1 # + return ECMA262::PUNC_ADD elsif code0 == 0x2c # , @pos += 1 # , return ECMA262::PUNC_COMMA elsif code0 == 0x2d # - if code1 == 0x3d # -= @pos += 2 return ECMA262::PUNC_SUBLET end if code1 == 0x2d # -- @pos += 2 return ECMA262::PUNC_DEC end @pos += 1 # - return ECMA262::PUNC_SUB elsif code0 == 0x2e # . @pos += 1 # . return ECMA262::PUNC_PERIOD elsif code0 == 0x3a # : @pos += 1 # : return ECMA262::PUNC_COLON elsif code0 == 0x3b # ; @pos += 1 # ; return ECMA262::PUNC_SEMICOLON elsif code0 == 0x3c # < if code1 == 0x3d # <= @pos += 2 return ECMA262::PUNC_LTEQ end if code1 == 0x3c and code2 == 0x3d # <<= @pos += 3 return ECMA262::PUNC_LSHIFTLET end if code1 == 0x3c # << @pos += 2 return ECMA262::PUNC_LSHIFT end @pos += 1 # < return ECMA262::PUNC_LT elsif code0 == 0x3d # = if code1 == 0x3d and code2 == 0x3d # === @pos += 3 return ECMA262::PUNC_SEQ end if code1 == 0x3d # == @pos += 2 return ECMA262::PUNC_EQ end @pos += 1 # = return ECMA262::PUNC_LET elsif code0 == 0x3e # > if code1 == 0x3e and code2 == 0x3e and code3 == 0x3d # >>>= @pos += 4 return ECMA262::PUNC_URSHIFTLET end if code1 == 0x3e and code2 == 0x3e # >>> @pos += 3 return ECMA262::PUNC_URSHIFT end if code1 == 0x3e and code2 == 0x3d # >>= @pos += 3 return ECMA262::PUNC_RSHIFTLET end if code1 == 0x3e # >> @pos += 2 return ECMA262::PUNC_RSHIFT end if code1 == 0x3d # >= @pos += 2 return ECMA262::PUNC_GTEQ end @pos += 1 # > return ECMA262::PUNC_GT elsif code0 == 0x3f # ? @pos += 1 # ? return ECMA262::PUNC_CONDIF elsif code0 == 0x5b # [ @pos += 1 # [ return ECMA262::PUNC_LSQBRAC elsif code0 == 0x5d # ] @pos += 1 # ] return ECMA262::PUNC_RSQBRAC elsif code0 == 0x5e # ^ if code1 == 0x3d # ^= @pos += 2 return ECMA262::PUNC_XORLET end @pos += 1 # ^ return ECMA262::PUNC_XOR elsif code0 == 0x7b # { @pos += 1 # { return ECMA262::PUNC_LCURLYBRAC elsif code0 == 0x7c # | if code1 == 0x7c # || @pos += 2 return ECMA262::PUNC_LOR end if code1 == 0x3d # |= @pos += 2 return ECMA262::PUNC_ORLET end @pos += 1 # | return ECMA262::PUNC_OR elsif code0 == 0x7d # } @pos += 1 # } return ECMA262::PUNC_RCURLYBRAC elsif code0 == 0x7e # ~ @pos += 1 # ~ return ECMA262::PUNC_NOT end nil end def div_punctuator if @codes[@pos] == 0x2f if @codes[@pos+1] == 0x3d @pos += 2 return ECMA262::PUNC_DIVLET else @pos += 1 return ECMA262::PUNC_DIV end end nil end # # 7.8.5 # # RegularExpressionLiteral:: # / RegularExpressionBody / RegularExpressionFlags # def regexp_literal pos0 = @pos return nil unless @codes[@pos] == 0x2f body = regexp_body flags = regexp_flags return ECMA262::ECMA262RegExp.new(body, flags) end def regexp_body if @codes[@pos] == 0x2a raise ParseError.new("first character of regular expression is `*'", self) end pos0 = @pos @pos += 1 while !(@codes[@pos] == 0x2f) if @codes[@pos].nil? raise ParseError.new("no `/' end of regular expression", self) end if line_terminator?(@codes[@pos]) raise ParseError.new("regular expression has line terminator in body", self) end if @codes[@pos] == 0x5c # \ @pos += 1 if line_terminator?(@codes[@pos]) raise ParseError.new("regular expression has line terminator in body", self) end @pos += 1 elsif @codes[@pos] == 0x5b # [ regexp_class else @pos += 1 end end @pos += 1 return @codes[(pos0+1)...(@pos-1)].pack("U*") end def regexp_class if @codes[@pos] != 0x5b raise ParseError.new('bad regular expression', self) end @pos += 1 while !(@codes[@pos] == 0x5d) if @codes[@pos].nil? raise ParseError.new("no `]' end of regular expression class", self) end if line_terminator?(@codes[@pos]) raise ParseError.new("regular expression has line terminator in body", self) end if @codes[@pos] == 0x5c # \ @pos += 1 if line_terminator?(@codes[@pos]) raise ParseError.new("regular expression has line terminator in body", self) end @pos += 1 else @pos += 1 end end @pos += 1 end def regexp_flags pos0 = @pos while(identifier_part?(@codes[@pos])) @pos += 1 end return @codes[pos0...@pos].pack("U*") end #7.8.3 #B.1.1 def numeric_literal hex_integer_literal || octal_integer_literal || decimal_literal end #7.8.3 # # HexIntegerLiteral :: # 0x HexDigit # 0X HexDigit # HexIntegerLiteral HexDigit # def hex_integer_literal code = @codes[@pos] if code.nil? return nil #0x / 0X elsif code == 0x30 and (@codes[@pos+1] == 0x78 || @codes[@pos+1] == 0x58) @pos += 2 pos0 = @pos while code = @codes[@pos] and hex_digit?(code) @pos += 1; end if identifier_start?(code) raise ParseError.new("The source character immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit", self) else return ECMA262::ECMA262Numeric.new(@codes[pos0...@pos].pack("U*").to_i(16)) end else nil end end #B.1.1 # OctalIntegerLiteral :: # 0 OctalDigit # OctalIntegerLiteral OctalDigit # def octal_integer_literal code = @codes[@pos] if code.nil? return nil elsif code == 0x30 and (code1 = @codes[@pos + 1]) >= 0x30 and code1 <= 0x37 @pos += 1 pos0 = @pos while code = @codes[@pos] and code >= 0x30 and code <= 0x37 @pos += 1 end if identifier_start?(code) raise ParseError.new("The source character immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit", self) else return ECMA262::ECMA262Numeric.new(@codes[pos0...@pos].pack("U*").to_i(8)) end else nil end end # 7.8.3 # # DecimalLiteral :: # DecimalIntegerLiteral . DecimalDigitsopt ExponentPartopt # . DecimalDigits ExponentPartopt # DecimalIntegerLiteral ExponentPartopt # def decimal_literal pos0 = @pos code = @codes[@pos] if code.nil? return nil elsif code == 0x2e #. @pos += 1 f = decimal_digits if f.nil? #=> this period is punctuator @pos = pos0 + 1 return ECMA262::PUNC_PERIOD end if (code = @codes[@pos]) == 0x65 || code == 0x45 @pos += 1 e = exponent_part end if identifier_start?(@codes[@pos]) raise ParseError.new("The source character immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit", self) end return ECMA262::ECMA262Numeric.new('0', f, e) elsif code == 0x30 # zero i = "0" @pos += 1 if @codes[@pos] == 0x2e #. @pos += 1 f = decimal_digits if (code = @codes[@pos]) == 0x65 || code == 0x45 #e or E @pos += 1 e = exponent_part end elsif (code = @codes[@pos]) == 0x65 || code == 0x45 #e or E @pos += 1 e = exponent_part end if identifier_start?(@codes[@pos]) raise ParseError.new("The source character immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit", self) end return ECMA262::ECMA262Numeric.new(i, f, e) elsif code >= 0x31 and code <= 0x39 i = decimal_digits if @codes[@pos] == 0x2e #. @pos += 1 f = decimal_digits if (code = @codes[@pos]) == 0x65 || code == 0x45 #e or E @pos += 1 e = exponent_part end elsif (code = @codes[@pos]) == 0x65 || code == 0x45 #e or E @pos += 1 e = exponent_part end if identifier_start?(@codes[@pos]) raise ParseError.new("The source character immediately following a NumericLiteral must not be an IdentifierStart or DecimalDigit", self) end return ECMA262::ECMA262Numeric.new(i, f, e) end nil end # 7.8.3 # # ExponentPart :: # ExponentIndicator SignedInteger # def exponent_part if (code = @codes[@pos]) == 0x2b @pos += 1 elsif code == 0x2d @pos += 1 neg = true end d = decimal_digits raise ParseError.new("unexpecting token", self) if d.nil? if neg e = "-#{d}" else e = d end e end #7.8.3 # # DecimalDigit :: one of # 0 1 2 3 4 5 6 7 8 9 # def decimal_digits pos0 = @pos if (code = @codes[@pos]) >= 0x30 and code <= 0x39 @pos += 1 while code = @codes[@pos] and code >= 0x30 and code <= 0x39 @pos += 1 end return @codes[pos0...@pos].pack("U*") else nil end end #7.8.4 # # StringLiteral :: # " DoubleStringCharactersopt " # ' SingleStringCharactersopt ' # # DoubleStringCharacters :: # DoubleStringCharacter DoubleStringCharactersopt # # SingleStringCharacters :: # SingleStringCharacter SingleStringCharactersopt # # DoubleStringCharacter :: # SourceCharacter but not one of " or \ or LineTerminator # \ EscapeSequence # LineContinuation # # SingleStringCharacter :: # SourceCharacter but not one of ' or \ or LineTerminator # \ EscapeSequence # LineContinuation # def string_literal if (code = @codes[@pos]) == 0x27 #' term = 0x27 elsif code == 0x22 #" term = 0x22 else return nil end @pos += 1 pos0 = @pos str = [] while (code = @codes[@pos]) if code.nil? raise ParseError.new("no `#{term}' at end of string", self) elsif line_terminator?(code) raise ParseError.new("string has line terminator in body", self) elsif code == 0x5c #\ @pos += 1 str.push(escape_sequence) elsif code == term @pos += 1 return ECMA262::ECMA262String.new(str.compact.pack("U*")) else @pos += 1 str.push(code) end end nil end # 7.8.4 # B.1.2 # # EscapeSequence :: # CharacterEscapeSequence # 0 [lookahead ∉ DecimalDigit] # HexEscapeSequence # UnicodeEscapeSequence # OctalEscapeSequence def escape_sequence case (code = @codes[@pos]) # when 0x30 # @pos += 1 # 0 when 0x27 #' @pos += 1 0x27 when 0x22 #" @pos += 1 0x22 when 0x5c #\ @pos += 1 0x5c when 0x62 #b @pos += 1 0x08 when 0x74 #t @pos += 1 0x09 when 0x6e #n @pos += 1 0x0a when 0x76 #v @pos += 1 0x0b when 0x66 #f @pos += 1 0x0c when 0x72 #r @pos += 1 0x0d when 0x78 #x #check t = @codes[(@pos+1)..(@pos+2)].pack("U*").to_i(16) @pos += 3 t when 0x75 #u #check t = @codes[(@pos+1)..(@pos+4)].pack("U*").to_i(16) @pos += 5 t else # line continuation if line_terminator?(code) @pos += 1 nil # Annex B.1.2 # # OctalEscapeSequence :: # OctalDigit [lookahead ∉ DecimalDigit] # ZeroToThree OctalDigit [lookahead ∉ DecimalDigit] # FourToSeven OctalDigit # ZeroToThree OctalDigit OctalDigit # # Note: # # A string such as the following is invalid # as a octal escape sequence. # # \19 or \319 # # However, it is not to an error in most implementations. # Therefore, minjs also intepret it such way. # elsif octal_digit?(code) code1 = @codes[@pos+1] code2 = @codes[@pos+2] if code >= 0x30 and code <= 0x33 if octal_digit?(code1) if octal_digit?(code2) @pos += 3 (code - 0x30) * 64 + (code1 - 0x30) * 8 + (code2 - 0x30) else @pos += 2 (code - 0x30) * 8 + (code1 - 0x30) end else @pos += 1 code - 0x30 end else #if code >= 0x34 and code <= 0x37 if octal_digit?(code1) @pos += 2 (code - 0x30) * 8 + (code1 - 0x30) else @pos += 1 code - 0x30 end end else @pos += 1 code end end end def eof? peek_lit(nil).nil? end # # check next literal is strictly equal to 'l' or not. # white spaces and line terminators are skipped and ignored. # # if next literal is not 'l', position is not forwarded # if next literal is 'l', position is forwarded # def eql_lit?(l, hint = nil) lit = peek_lit(hint) if lit.eql? l fwd_after_peek lit else nil end end # # check next literal is strictly equal to 'l' or not. # white spaces are skipped and ignored. # line terminators are not ignored. # # if next literal is not 'l', position is not forwarded # if next literal is 'l', position is forwarded # def eql_lit_nolt?(l, hint = nil) lit = peek_lit_nolt(hint) if lit.eql? l fwd_after_peek lit else nil end end # # check next literal is equal to 'l' or not. # white spaces and line terminators are skipped and ignored. # # if next literal is not 'l', position is not forwarded # if next literal is 'l', position is forwarded # def match_lit?(l, hint = nil) lit = peek_lit(hint) if lit == l fwd_after_peek lit else nil end end # # check next literal is equal to 'l' or not. # white spaces are skipped and ignored. # line terminators are not ignored. # # if next literal is not 'l', position is not forwarded # if next literal is 'l', position is forwarded # def match_lit_nolt?(l, hint = nil) lit = peek_lit_nolt(hint) if lit == l fwd_after_peek lit else nil end end # # fetch next literal. # position is not forwarded. # white spaces and line terminators are skipped and ignored. # def peek_lit(hint) pos0 = @pos while lit = next_input_element(hint) and (lit.ws? or lit.lt?) end @pos = pos0 lit end # # fetch next literal. # position is not forwarded. # white spaces are skipped and ignored. # line terminators are not ignored. # def peek_lit_nolt(hint) pos0 = @pos while lit = next_input_element(hint) and lit.ws? end @pos = pos0 lit end def fwd_after_peek @pos = @head_pos end # # fetch next literal. # position is forwarded. # white spaces and line terminators are skipped and ignored. # def fwd_lit(hint) while lit = next_input_element(hint) and (lit.ws? or lit.lt?) end lit end # # fetch next literal. # position is forwarded. # white spaces are skipped and ignored. # line terminators are not ignored. # def fwd_lit_nolt(hint) while lit = next_input_element(hint) and lit.ws? end lit end # # break => position is rewind, then break with # return => position is rewind, then return # next => position is not rewind, then break with # def eval_lit(&block) begin saved_pos = @pos @eval_nest += 1 ret = yield ensure @eval_nest -= 1 if ret.nil? @pos = saved_pos nil else if @eval_nest == 0 #STDERR.puts "clear_cache [#{saved_pos}..#{@pos}]" clear_cache end end end end # # position to [row, col] # def row_col(pos) _pos = 0 row = 0 col = 1 @codes.each do |code| break if _pos >= pos if line_terminator?(code) row += 1 col = 0 else col += 1 end _pos += 1 end return [row+1, col+1] end # # position to line # def line(pos) pos0 = pos1 = pos while true pos0 -= 1 break if line_terminator?(@codes[pos0]) end pos0 += 1 while true break if line_terminator?(@codes[pos1]) pos1 += 1 end @codes[pos0..pos1].pack("U*") end def debug_str(pos = nil, row = nil, col = nil) if pos.nil? pos = @head_pos or @pos end t = '' if col >= 80 t << @codes[(pos-80)..(pos+80)].pack("U*") col = 81 else t << line(pos) end if col and col >= 1 col = col - 1; end t << "\n" t << (' ' * col) + "^" t end end end