ext/include/iv/lexer.h in iv-phonic-0.0.3 vs ext/include/iv/lexer.h in iv-phonic-0.0.5

- old
+ new

@@ -7,56 +7,52 @@ #include <vector> #include <string> #include "uchar.h" #include "chars.h" #include "token.h" -#include "source.h" #include "location.h" #include "noncopyable.h" +#include "keyword.h" namespace iv { namespace core { -class Lexer: private Noncopyable<Lexer>::type { +template<typename Source> +class Lexer: private Noncopyable<Lexer<Source> >::type { public: - enum LexType { - kClear = 0, - kIdentifyReservedWords = 1, - kIgnoreReservedWords = 2, - kIgnoreReservedWordsAndIdentifyGetterOrSetter = 4, - kStrict = 8 - }; + enum State { NONE, ESCAPE, DECIMAL, HEX, OCTAL }; - explicit Lexer(BasicSource* src) + explicit Lexer(const Source* src) : source_(src), - buffer8_(kInitialReadBufferCapacity), + buffer8_(), buffer16_(kInitialReadBufferCapacity), pos_(0), end_(source_->size()), has_line_terminator_before_next_(false), has_shebang_(false), line_number_(1), location_() { Initialize(); } - Token::Type Next(int type) { - Token::Type token; + template<typename LexType> + typename Token::Type Next(bool strict) { + typename Token::Type token; has_line_terminator_before_next_ = false; do { - location_.begin_position_ = pos(); while (Chars::IsWhiteSpace(c_)) { // white space Advance(); } + location_.set_begin_position(pos() - 1); switch (c_) { case '"': case '\'': // string literal token = ScanString(); @@ -250,11 +246,16 @@ break; case '^': // ^ Advance(); - token = Token::BIT_XOR; + if (c_ == '=') { + Advance(); + token = Token::ASSIGN_BIT_XOR; + } else { + token = Token::BIT_XOR; + } break; case '.': // . Number Advance(); @@ -336,11 +337,11 @@ PushBack(); } default: if (Chars::IsIdentifierStart(c_)) { - token = ScanIdentifier(type); + token = ScanIdentifier<LexType>(strict); } else if (Chars::IsDecimalDigit(c_)) { token = ScanNumber(false); } else if (Chars::IsLineTerminator(c_)) { SkipLineTerminator(); has_line_terminator_before_next_ = true; @@ -352,19 +353,19 @@ token = Token::ILLEGAL; } break; } } while (token == Token::NOT_FOUND); - location_.end_position_ = pos(); + location_.set_end_position(pos() - 1); return token; } inline const std::vector<uc16>& Buffer() const { return buffer16_; } - inline const std::vector<char>& Buffer8() const { + inline const std::string& Buffer8() const { return buffer8_; } inline const double& Numeric() const { return numeric_; @@ -398,19 +399,29 @@ std::size_t pos() const { return pos_; } - inline BasicSource* source() const { + inline const Source* source() const { return source_; } inline const Location& location() const { return location_; } + inline std::size_t begin_position() const { + return location_.begin_position(); + } + + inline std::size_t end_position() const { + return location_.end_position(); + } + bool ScanRegExpLiteral(bool contains_eq) { + // location begin_position is the same with DIV + // so, no need to set bool character = false; buffer16_.clear(); if (contains_eq) { Record16('='); } @@ -433,11 +444,11 @@ character = false; } Record16Advance(); } } - Advance(); + Advance(); // waste '/' return true; } bool ScanRegExpFlags() { buffer16_.clear(); @@ -447,19 +458,21 @@ Advance(); if (c_ != 'u') { return false; } Advance(); - uc = ScanHexEscape('u', 4); - if (uc == '\\') { + bool ng = false; + uc = ScanHexEscape('u', 4, &ng); + if (ng || uc == '\\') { return false; } Record16(uc); } else { Record16Advance(); } } + location_.set_end_position(pos() - 1); return true; } private: static const std::size_t kInitialReadBufferCapacity = 32; @@ -499,37 +512,10 @@ c_ = source_->Get(pos_-2); --pos_; } } - inline Token::Type IsMatch(char const * keyword, - std::size_t len, - Token::Type guess, bool strict) const { - if (!strict) { - return Token::IDENTIFIER; - } - std::vector<uc16>::const_iterator it = buffer16_.begin(); - do { - if (*it++ != *keyword++) { - return Token::IDENTIFIER; - } - } while (--len); - return guess; - } - - inline Token::Type IsMatch(char const * keyword, - std::size_t len, - Token::Type guess) const { - std::vector<uc16>::const_iterator it = buffer16_.begin(); - do { - if (*it++ != *keyword++) { - return Token::IDENTIFIER; - } - } while (--len); - return guess; - } - Token::Type SkipSingleLineComment() { Advance(); // see ECMA-262 section 7.4 while (c_ >= 0 && !Chars::IsLineTerminator(c_)) { Advance(); @@ -580,24 +566,25 @@ Advance(); } return Token::NOT_FOUND; } - Token::Type ScanIdentifier(int type) { - Token::Type token = Token::IDENTIFIER; + template<typename LexType> + Token::Type ScanIdentifier(bool strict) { uc16 uc; buffer16_.clear(); if (c_ == '\\') { Advance(); if (c_ != 'u') { return Token::ILLEGAL; } Advance(); - uc = ScanHexEscape('u', 4); - if (uc == '\\' || !Chars::IsIdentifierStart(uc)) { + bool ng = false; + uc = ScanHexEscape('u', 4, &ng); + if (ng || uc == '\\' || !Chars::IsIdentifierStart(uc)) { return Token::ILLEGAL; } Record16(uc); } else { Record16Advance(); @@ -608,418 +595,24 @@ Advance(); if (c_ != 'u') { return Token::ILLEGAL; } Advance(); - uc = ScanHexEscape('u', 4); - if (uc == '\\' || !Chars::IsIdentifierPart(uc)) { + bool ng = false; + uc = ScanHexEscape('u', 4, &ng); + if (ng || uc == '\\' || !Chars::IsIdentifierPart(uc)) { return Token::ILLEGAL; } Record16(uc); } else { Record16Advance(); } } - if (type & kIdentifyReservedWords) { - token = DetectKeyword(type & kStrict); - } else if (type & kIgnoreReservedWordsAndIdentifyGetterOrSetter) { - token = DetectGetOrSet(); - } - - return token; + return detail::Keyword<LexType>::Detect(buffer16_, strict); } - // detect which Identifier is Keyword, FutureReservedWord or not - // Keyword and FutureReservedWord are defined in ECMA-262 5th. - // - // Some words such as : - // int, short, boolean, byte, long, char, float, double, abstract, volatile, - // transient, final, throws, goto, native, synchronized - // were defined as FutureReservedWord in ECMA-262 3rd, but not in 5th. - // So, DetectKeyword interprets them as Identifier. - Token::Type DetectKeyword(bool strict) const { - const std::size_t len = buffer16_.size(); - Token::Type token = Token::IDENTIFIER; - switch (len) { - case 2: - // if in do - if (buffer16_[0] == 'i') { - if (buffer16_[1] == 'f') { - token = Token::IF; - } else if (buffer16_[1] == 'n') { - token = Token::IN; - } - } else if (buffer16_[0] == 'd' && buffer16_[1] == 'o') { - // do - token = Token::DO; - } - break; - case 3: - // for var int new try let - switch (buffer16_[2]) { - case 't': - if (buffer16_[0] == 'l' && buffer16_[1] == 'e' && strict) { - // let - token = Token::LET; - } else if (buffer16_[0] == 'i' && buffer16_[1] == 'n') { - // int (removed) - // token = Token::INT; - } - break; - case 'r': - // for var - if (buffer16_[0] == 'f' && buffer16_[1] == 'o') { - // for - token = Token::FOR; - } else if (buffer16_[0] == 'v' && buffer16_[1] == 'a') { - // var - token = Token::VAR; - } - break; - case 'y': - // try - if (buffer16_[0] == 't' && buffer16_[1] == 'r') { - token = Token::TRY; - } - break; - case 'w': - // new - if (buffer16_[0] == 'n' && buffer16_[1] == 'e') { - token = Token::NEW; - } - break; - } - break; - case 4: - // else case true byte null this - // void with long enum char goto - // number 3 character is most duplicated - switch (buffer16_[3]) { - case 'e': - // else case true byte - if (buffer16_[2] == 's') { - if (buffer16_[0] == 'e' && buffer16_[1] == 'l') { - // else - token = Token::ELSE; - } else if (buffer16_[0] == 'c' && buffer16_[1] == 'a') { - // case - token = Token::CASE; - } - } else if (buffer16_[0] == 't' && - buffer16_[1] == 'r' && buffer16_[2] == 'u') { - // true - token = Token::TRUE_LITERAL; - } else if (buffer16_[0] == 'b' && - buffer16_[1] == 'y' && buffer16_[2] == 't') { - // byte (removed) - // token = Token::BYTE; - } - break; - case 'l': - // null - if (buffer16_[0] == 'n' && - buffer16_[1] == 'u' && buffer16_[2] == 'l') { - token = Token::NULL_LITERAL; - } - break; - case 's': - // this - if (buffer16_[0] == 't' && - buffer16_[1] == 'h' && buffer16_[2] == 'i') { - token = Token::THIS; - } - break; - case 'd': - // void - if (buffer16_[0] == 'v' && - buffer16_[1] == 'o' && buffer16_[2] == 'i') { - token = Token::VOID; - } - break; - case 'h': - // with - if (buffer16_[0] == 'w' && - buffer16_[1] == 'i' && buffer16_[2] == 't') { - token = Token::WITH; - } - break; - case 'g': - // long (removed) - if (buffer16_[0] == 'l' && - buffer16_[1] == 'o' && buffer16_[2] == 'n') { - // token = Token::LONG; - } - break; - case 'm': - // enum - if (buffer16_[0] == 'e' && - buffer16_[1] == 'n' && buffer16_[2] == 'u') { - token = Token::ENUM; - } - break; - case 'r': - // char (removed) - if (buffer16_[0] == 'c' && - buffer16_[1] == 'h' && buffer16_[2] == 'a') { - // token = Token::CHAR; - } - break; - case 'o': - // goto (removed) - if (buffer16_[0] == 'g' && - buffer16_[1] == 'o' && buffer16_[2] == 't') { - // token = Token::GOTO; - } - break; - } - break; - case 5: - // break final float catch super while - // throw short class const false yield - // number 3 character is most duplicated - switch (buffer16_[3]) { - case 'a': - // break final float - if (buffer16_[0] == 'b' && buffer16_[1] == 'r' && - buffer16_[2] == 'e' && buffer16_[4] == 'k') { - // break - token = Token::BREAK; - } else if (buffer16_[0] == 'f') { - if (buffer16_[1] == 'i' && - buffer16_[2] == 'n' && buffer16_[4] == 'l') { - // final (removed) - // token = Token::FINAL; - } else if (buffer16_[1] == 'l' && - buffer16_[2] == 'o' && buffer16_[4] == 't') { - // float (removed) - // token = Token::FLOAT; - } - } - break; - case 'c': - if (buffer16_[0] == 'c' && buffer16_[1] == 'a' && - buffer16_[2] == 't' && buffer16_[4] == 'h') { - // catch - token = Token::CATCH; - } - break; - case 'e': - if (buffer16_[0] == 's' && buffer16_[1] == 'u' && - buffer16_[2] == 'p' && buffer16_[4] == 'r') { - // super - token = Token::SUPER; - } - break; - case 'l': - if (buffer16_[0] == 'w' && buffer16_[1] == 'h' && - buffer16_[2] == 'i' && buffer16_[4] == 'e') { - // while - token = Token::WHILE; - } else if (strict && - buffer16_[0] == 'y' && buffer16_[1] == 'i' && - buffer16_[2] == 'e' && buffer16_[4] == 'd') { - // yield - token = Token::YIELD; - } - break; - case 'o': - if (buffer16_[0] == 't' && buffer16_[1] == 'h' && - buffer16_[2] == 'r' && buffer16_[4] == 'w') { - // throw - token = Token::THROW; - } - break; - case 'r': - if (buffer16_[0] == 's' && buffer16_[1] == 'h' && - buffer16_[2] == 'o' && buffer16_[4] == 't') { - // short (removed) - // token = Token::SHORT; - } - break; - case 's': - // class const false - if (buffer16_[0] == 'c') { - if (buffer16_[1] == 'l' && - buffer16_[2] == 'a' && buffer16_[4] == 's') { - // class - token = Token::CLASS; - } else if (buffer16_[1] == 'o' && - buffer16_[2] == 'n' && buffer16_[4] == 't') { - // const - token = Token::CONST; - } - } else if (buffer16_[0] == 'f' && buffer16_[1] == 'a' && - buffer16_[2] == 'l' && buffer16_[4] == 'e') { - // false - token = Token::FALSE_LITERAL; - } - break; - } - break; - case 6: - // double delete export import native - // public return static switch typeof throws - // number 0 character is most duplicated - switch (buffer16_[0]) { - case 'd': - // double delete - if (buffer16_[5] == 'e' && - buffer16_[4] == 'l' && buffer16_[3] == 'b' && - buffer16_[2] == 'u' && buffer16_[1] == 'o') { - // double - // token = Token::DOUBLE; - } else if (buffer16_[5] == 'e' && - buffer16_[4] == 't' && buffer16_[3] == 'e' && - buffer16_[2] == 'l' && buffer16_[1] == 'e') { - // delete - token = Token::DELETE; - } - break; - case 'e': - // export - token = IsMatch("export", len, Token::EXPORT); - break; - case 'i': - // import - token = IsMatch("import", len, Token::IMPORT); - break; - case 'n': - // native (removed) - // token = IsMatch("native", len, Token::NATIVE); - break; - case 'p': - // public - token = IsMatch("public", len, Token::PUBLIC, strict); - break; - case 'r': - // return - token = IsMatch("return", len, Token::RETURN); - break; - case 's': - // switch static - if (buffer16_[1] == 'w' && - buffer16_[2] == 'i' && buffer16_[3] == 't' && - buffer16_[4] == 'c' && buffer16_[5] == 'h') { - // switch - token = Token::SWITCH; - } else if (strict && - buffer16_[1] == 't' && - buffer16_[2] == 'a' && buffer16_[3] == 't' && - buffer16_[4] == 'i' && buffer16_[5] == 'c') { - // static - token = Token::STATIC; - } - break; - case 't': - // typeof throws - if (buffer16_[5] == 'f' && - buffer16_[4] == 'o' && buffer16_[3] == 'e' && - buffer16_[2] == 'p' && buffer16_[1] == 'y') { - // typeof - token = Token::TYPEOF; - } else if (buffer16_[5] == 's' && - buffer16_[4] == 'w' && buffer16_[3] == 'o' && - buffer16_[2] == 'r' && buffer16_[1] == 'h') { - // throws (removed) - // token = Token::THROWS; - } - break; - } - break; - case 7: - // boolean default extends finally package private - // number 0 character is most duplicated - switch (buffer16_[0]) { - case 'b': - // boolean (removed) - // token = IsMatch("boolean", len, Token::BOOLEAN); - break; - case 'd': - token = IsMatch("default", len, Token::DEFAULT); - break; - case 'e': - token = IsMatch("extends", len, Token::EXTENDS); - break; - case 'f': - token = IsMatch("finally", len, Token::FINALLY); - break; - case 'p': - if (buffer16_[1] == 'a') { - token = IsMatch("package", len, Token::PACKAGE, strict); - } else if (buffer16_[1] == 'r') { - token = IsMatch("private", len, Token::PRIVATE, strict); - } - break; - } - break; - case 8: - // debugger continue abstract volatile function - // number 4 character is most duplicated - switch (buffer16_[4]) { - case 'g': - token = IsMatch("debugger", len, Token::DEBUGGER); - break; - case 'i': - token = IsMatch("continue", len, Token::CONTINUE); - break; - case 'r': - // abstract (removed) - // token = IsMatch("abstract", len, Token::ABSTRACT); - break; - case 't': - if (buffer16_[1] == 'o') { - // token = IsMatch("volatile", len, Token::VOLATILE); - } else if (buffer16_[1] == 'u') { - token = IsMatch("function", len, Token::FUNCTION); - } - break; - } - break; - case 9: - // interface protected transient - if (buffer16_[1] == 'n') { - token = IsMatch("interface", len, Token::INTERFACE, strict); - } else if (buffer16_[1] == 'r') { - if (buffer16_[0] == 'p') { - token = IsMatch("protected", len, Token::PROTECTED, strict); - } else if (buffer16_[0] == 't') { - // transient (removed) - // token = IsMatch("transient", len, Token::TRANSIENT); - } - } - break; - case 10: - // instanceof implements - if (buffer16_[1] == 'n') { - token = IsMatch("instanceof", len, Token::INSTANCEOF); - } else if (buffer16_[1] == 'm') { - token = IsMatch("implements", len, Token::IMPLEMENTS, strict); - } - break; - case 12: - // synchronized (removed) - // token = IsMatch("synchronized", len, Token::SYNCHRONIZED); - token = Token::IDENTIFIER; - break; - } - return token; - } - - Token::Type DetectGetOrSet() const { - if (buffer16_.size() == 3) { - if (buffer16_[1] == 'e' && buffer16_[2] == 't') { - if (buffer16_[0] == 'g') { - return Token::GET; - } else if (buffer16_[0] == 's') { - return Token::SET; - } - } - } - return Token::IDENTIFIER; - } - Token::Type ScanString() { type_ = NONE; const uc16 quote = c_; buffer16_.clear(); Advance(); @@ -1029,11 +622,13 @@ // escape sequence if (c_ < 0) return Token::ILLEGAL; if (type_ == NONE) { type_ = ESCAPE; } - ScanEscape(); + if (!ScanEscape()) { + return Token::ILLEGAL; + } } else { Record16Advance(); } } if (c_ != quote) { @@ -1043,14 +638,14 @@ Advance(); return Token::STRING; } - void ScanEscape() { + bool ScanEscape() { if (Chars::IsLineTerminator(c_)) { SkipLineTerminator(); - return; + return true; } switch (c_) { case '\'': case '"' : case '\\': @@ -1074,22 +669,34 @@ break; case 't' : Record16('\t'); Advance(); break; - case 'u' : + case 'u' : { Advance(); - Record16(ScanHexEscape('u', 4)); + bool ng = false; + const uc16 uc = ScanHexEscape('u', 4, &ng); + if (ng) { + return false; + } + Record16(uc); break; + } case 'v' : Record16('\v'); Advance(); break; - case 'x' : + case 'x' : { Advance(); - Record16(ScanHexEscape('x', 2)); + bool ng = false; + const uc16 uc = ScanHexEscape('x', 2, &ng); + if (ng) { + return false; + } + Record16(uc); break; + } case '0' : case '1' : case '2' : case '3' : case '4' : @@ -1100,14 +707,20 @@ type_ = OCTAL; } Record16(ScanOctalEscape()); break; + case '8' : + case '9' : + // section 7.8.4 and B1.2 + return false; + default: Record16Advance(); break; } + return true; } Token::Type ScanNumber(const bool period) { buffer8_.clear(); State type = DECIMAL; @@ -1179,18 +792,17 @@ return Token::ILLEGAL; } if (type == OCTAL) { double val = 0; - for (std::vector<char>::const_iterator it = buffer8_.begin(), + for (std::string::const_iterator it = buffer8_.begin(), last = buffer8_.end(); it != last; ++it) { val = val * 8 + (*it - '0'); } numeric_ = val; } else { - Record8('\0'); // Null Terminated String - numeric_ = std::strtod(buffer8_.data(), NULL); + numeric_ = std::strtod(buffer8_.c_str(), NULL); } type_ = type; return Token::NUMBER; } @@ -1209,18 +821,19 @@ Advance(); } return res; } - uc16 ScanHexEscape(uc16 c, int len) { + uc16 ScanHexEscape(uc16 c, int len, bool* ng) { uc16 res = 0; for (int i = 0; i < len; ++i) { const int d = HexValue(c_); if (d < 0) { for (int j = i - 1; j >= 0; --j) { PushBack(); } + *ng = true; return c; } res = res * 16 + d; Advance(); } @@ -1260,11 +873,11 @@ Advance(); } ++line_number_; } - BasicSource* source_; - std::vector<char> buffer8_; + const Source* source_; + std::string buffer8_; std::vector<uc16> buffer16_; double numeric_; State type_; std::size_t pos_; const std::size_t end_;