ext/include/iv/lexer.h in iv-phonic-0.0.3 vs ext/include/iv/lexer.h in iv-phonic-0.0.5
- old
+ new
@@ -7,56 +7,52 @@
#include <vector>
#include <string>
#include "uchar.h"
#include "chars.h"
#include "token.h"
-#include "source.h"
#include "location.h"
#include "noncopyable.h"
+#include "keyword.h"
namespace iv {
namespace core {
-class Lexer: private Noncopyable<Lexer>::type {
+template<typename Source>
+class Lexer: private Noncopyable<Lexer<Source> >::type {
public:
- enum LexType {
- kClear = 0,
- kIdentifyReservedWords = 1,
- kIgnoreReservedWords = 2,
- kIgnoreReservedWordsAndIdentifyGetterOrSetter = 4,
- kStrict = 8
- };
+
enum State {
NONE,
ESCAPE,
DECIMAL,
HEX,
OCTAL
};
- explicit Lexer(BasicSource* src)
+ explicit Lexer(const Source* src)
: source_(src),
- buffer8_(kInitialReadBufferCapacity),
+ buffer8_(),
buffer16_(kInitialReadBufferCapacity),
pos_(0),
end_(source_->size()),
has_line_terminator_before_next_(false),
has_shebang_(false),
line_number_(1),
location_() {
Initialize();
}
- Token::Type Next(int type) {
- Token::Type token;
+ template<typename LexType>
+ typename Token::Type Next(bool strict) {
+ typename Token::Type token;
has_line_terminator_before_next_ = false;
do {
- location_.begin_position_ = pos();
while (Chars::IsWhiteSpace(c_)) {
// white space
Advance();
}
+ location_.set_begin_position(pos() - 1);
switch (c_) {
case '"':
case '\'':
// string literal
token = ScanString();
@@ -250,11 +246,16 @@
break;
case '^':
// ^
Advance();
- token = Token::BIT_XOR;
+ if (c_ == '=') {
+ Advance();
+ token = Token::ASSIGN_BIT_XOR;
+ } else {
+ token = Token::BIT_XOR;
+ }
break;
case '.':
// . Number
Advance();
@@ -336,11 +337,11 @@
PushBack();
}
default:
if (Chars::IsIdentifierStart(c_)) {
- token = ScanIdentifier(type);
+ token = ScanIdentifier<LexType>(strict);
} else if (Chars::IsDecimalDigit(c_)) {
token = ScanNumber(false);
} else if (Chars::IsLineTerminator(c_)) {
SkipLineTerminator();
has_line_terminator_before_next_ = true;
@@ -352,19 +353,19 @@
token = Token::ILLEGAL;
}
break;
}
} while (token == Token::NOT_FOUND);
- location_.end_position_ = pos();
+ location_.set_end_position(pos() - 1);
return token;
}
inline const std::vector<uc16>& Buffer() const {
return buffer16_;
}
- inline const std::vector<char>& Buffer8() const {
+ inline const std::string& Buffer8() const {
return buffer8_;
}
inline const double& Numeric() const {
return numeric_;
@@ -398,19 +399,29 @@
std::size_t pos() const {
return pos_;
}
- inline BasicSource* source() const {
+ inline const Source* source() const {
return source_;
}
inline const Location& location() const {
return location_;
}
+ inline std::size_t begin_position() const {
+ return location_.begin_position();
+ }
+
+ inline std::size_t end_position() const {
+ return location_.end_position();
+ }
+
bool ScanRegExpLiteral(bool contains_eq) {
+ // location's begin_position is the same as for the DIV token,
+ // so there is no need to set it again here
bool character = false;
buffer16_.clear();
if (contains_eq) {
Record16('=');
}
@@ -433,11 +444,11 @@
character = false;
}
Record16Advance();
}
}
- Advance();
+ Advance(); // skip over '/'
return true;
}
bool ScanRegExpFlags() {
buffer16_.clear();
@@ -447,19 +458,21 @@
Advance();
if (c_ != 'u') {
return false;
}
Advance();
- uc = ScanHexEscape('u', 4);
- if (uc == '\\') {
+ bool ng = false;
+ uc = ScanHexEscape('u', 4, &ng);
+ if (ng || uc == '\\') {
return false;
}
Record16(uc);
} else {
Record16Advance();
}
}
+ location_.set_end_position(pos() - 1);
return true;
}
private:
static const std::size_t kInitialReadBufferCapacity = 32;
@@ -499,37 +512,10 @@
c_ = source_->Get(pos_-2);
--pos_;
}
}
- inline Token::Type IsMatch(char const * keyword,
- std::size_t len,
- Token::Type guess, bool strict) const {
- if (!strict) {
- return Token::IDENTIFIER;
- }
- std::vector<uc16>::const_iterator it = buffer16_.begin();
- do {
- if (*it++ != *keyword++) {
- return Token::IDENTIFIER;
- }
- } while (--len);
- return guess;
- }
-
- inline Token::Type IsMatch(char const * keyword,
- std::size_t len,
- Token::Type guess) const {
- std::vector<uc16>::const_iterator it = buffer16_.begin();
- do {
- if (*it++ != *keyword++) {
- return Token::IDENTIFIER;
- }
- } while (--len);
- return guess;
- }
-
Token::Type SkipSingleLineComment() {
Advance();
// see ECMA-262 section 7.4
while (c_ >= 0 && !Chars::IsLineTerminator(c_)) {
Advance();
@@ -580,24 +566,25 @@
Advance();
}
return Token::NOT_FOUND;
}
- Token::Type ScanIdentifier(int type) {
- Token::Type token = Token::IDENTIFIER;
+ template<typename LexType>
+ Token::Type ScanIdentifier(bool strict) {
uc16 uc;
buffer16_.clear();
if (c_ == '\\') {
Advance();
if (c_ != 'u') {
return Token::ILLEGAL;
}
Advance();
- uc = ScanHexEscape('u', 4);
- if (uc == '\\' || !Chars::IsIdentifierStart(uc)) {
+ bool ng = false;
+ uc = ScanHexEscape('u', 4, &ng);
+ if (ng || uc == '\\' || !Chars::IsIdentifierStart(uc)) {
return Token::ILLEGAL;
}
Record16(uc);
} else {
Record16Advance();
@@ -608,418 +595,24 @@
Advance();
if (c_ != 'u') {
return Token::ILLEGAL;
}
Advance();
- uc = ScanHexEscape('u', 4);
- if (uc == '\\' || !Chars::IsIdentifierPart(uc)) {
+ bool ng = false;
+ uc = ScanHexEscape('u', 4, &ng);
+ if (ng || uc == '\\' || !Chars::IsIdentifierPart(uc)) {
return Token::ILLEGAL;
}
Record16(uc);
} else {
Record16Advance();
}
}
- if (type & kIdentifyReservedWords) {
- token = DetectKeyword(type & kStrict);
- } else if (type & kIgnoreReservedWordsAndIdentifyGetterOrSetter) {
- token = DetectGetOrSet();
- }
-
- return token;
+ return detail::Keyword<LexType>::Detect(buffer16_, strict);
}
- // detect which Identifier is Keyword, FutureReservedWord or not
- // Keyword and FutureReservedWord are defined in ECMA-262 5th.
- //
- // Some words such as :
- // int, short, boolean, byte, long, char, float, double, abstract, volatile,
- // transient, final, throws, goto, native, synchronized
- // were defined as FutureReservedWord in ECMA-262 3rd, but not in 5th.
- // So, DetectKeyword interprets them as Identifier.
- Token::Type DetectKeyword(bool strict) const {
- const std::size_t len = buffer16_.size();
- Token::Type token = Token::IDENTIFIER;
- switch (len) {
- case 2:
- // if in do
- if (buffer16_[0] == 'i') {
- if (buffer16_[1] == 'f') {
- token = Token::IF;
- } else if (buffer16_[1] == 'n') {
- token = Token::IN;
- }
- } else if (buffer16_[0] == 'd' && buffer16_[1] == 'o') {
- // do
- token = Token::DO;
- }
- break;
- case 3:
- // for var int new try let
- switch (buffer16_[2]) {
- case 't':
- if (buffer16_[0] == 'l' && buffer16_[1] == 'e' && strict) {
- // let
- token = Token::LET;
- } else if (buffer16_[0] == 'i' && buffer16_[1] == 'n') {
- // int (removed)
- // token = Token::INT;
- }
- break;
- case 'r':
- // for var
- if (buffer16_[0] == 'f' && buffer16_[1] == 'o') {
- // for
- token = Token::FOR;
- } else if (buffer16_[0] == 'v' && buffer16_[1] == 'a') {
- // var
- token = Token::VAR;
- }
- break;
- case 'y':
- // try
- if (buffer16_[0] == 't' && buffer16_[1] == 'r') {
- token = Token::TRY;
- }
- break;
- case 'w':
- // new
- if (buffer16_[0] == 'n' && buffer16_[1] == 'e') {
- token = Token::NEW;
- }
- break;
- }
- break;
- case 4:
- // else case true byte null this
- // void with long enum char goto
- // number 3 character is most duplicated
- switch (buffer16_[3]) {
- case 'e':
- // else case true byte
- if (buffer16_[2] == 's') {
- if (buffer16_[0] == 'e' && buffer16_[1] == 'l') {
- // else
- token = Token::ELSE;
- } else if (buffer16_[0] == 'c' && buffer16_[1] == 'a') {
- // case
- token = Token::CASE;
- }
- } else if (buffer16_[0] == 't' &&
- buffer16_[1] == 'r' && buffer16_[2] == 'u') {
- // true
- token = Token::TRUE_LITERAL;
- } else if (buffer16_[0] == 'b' &&
- buffer16_[1] == 'y' && buffer16_[2] == 't') {
- // byte (removed)
- // token = Token::BYTE;
- }
- break;
- case 'l':
- // null
- if (buffer16_[0] == 'n' &&
- buffer16_[1] == 'u' && buffer16_[2] == 'l') {
- token = Token::NULL_LITERAL;
- }
- break;
- case 's':
- // this
- if (buffer16_[0] == 't' &&
- buffer16_[1] == 'h' && buffer16_[2] == 'i') {
- token = Token::THIS;
- }
- break;
- case 'd':
- // void
- if (buffer16_[0] == 'v' &&
- buffer16_[1] == 'o' && buffer16_[2] == 'i') {
- token = Token::VOID;
- }
- break;
- case 'h':
- // with
- if (buffer16_[0] == 'w' &&
- buffer16_[1] == 'i' && buffer16_[2] == 't') {
- token = Token::WITH;
- }
- break;
- case 'g':
- // long (removed)
- if (buffer16_[0] == 'l' &&
- buffer16_[1] == 'o' && buffer16_[2] == 'n') {
- // token = Token::LONG;
- }
- break;
- case 'm':
- // enum
- if (buffer16_[0] == 'e' &&
- buffer16_[1] == 'n' && buffer16_[2] == 'u') {
- token = Token::ENUM;
- }
- break;
- case 'r':
- // char (removed)
- if (buffer16_[0] == 'c' &&
- buffer16_[1] == 'h' && buffer16_[2] == 'a') {
- // token = Token::CHAR;
- }
- break;
- case 'o':
- // goto (removed)
- if (buffer16_[0] == 'g' &&
- buffer16_[1] == 'o' && buffer16_[2] == 't') {
- // token = Token::GOTO;
- }
- break;
- }
- break;
- case 5:
- // break final float catch super while
- // throw short class const false yield
- // number 3 character is most duplicated
- switch (buffer16_[3]) {
- case 'a':
- // break final float
- if (buffer16_[0] == 'b' && buffer16_[1] == 'r' &&
- buffer16_[2] == 'e' && buffer16_[4] == 'k') {
- // break
- token = Token::BREAK;
- } else if (buffer16_[0] == 'f') {
- if (buffer16_[1] == 'i' &&
- buffer16_[2] == 'n' && buffer16_[4] == 'l') {
- // final (removed)
- // token = Token::FINAL;
- } else if (buffer16_[1] == 'l' &&
- buffer16_[2] == 'o' && buffer16_[4] == 't') {
- // float (removed)
- // token = Token::FLOAT;
- }
- }
- break;
- case 'c':
- if (buffer16_[0] == 'c' && buffer16_[1] == 'a' &&
- buffer16_[2] == 't' && buffer16_[4] == 'h') {
- // catch
- token = Token::CATCH;
- }
- break;
- case 'e':
- if (buffer16_[0] == 's' && buffer16_[1] == 'u' &&
- buffer16_[2] == 'p' && buffer16_[4] == 'r') {
- // super
- token = Token::SUPER;
- }
- break;
- case 'l':
- if (buffer16_[0] == 'w' && buffer16_[1] == 'h' &&
- buffer16_[2] == 'i' && buffer16_[4] == 'e') {
- // while
- token = Token::WHILE;
- } else if (strict &&
- buffer16_[0] == 'y' && buffer16_[1] == 'i' &&
- buffer16_[2] == 'e' && buffer16_[4] == 'd') {
- // yield
- token = Token::YIELD;
- }
- break;
- case 'o':
- if (buffer16_[0] == 't' && buffer16_[1] == 'h' &&
- buffer16_[2] == 'r' && buffer16_[4] == 'w') {
- // throw
- token = Token::THROW;
- }
- break;
- case 'r':
- if (buffer16_[0] == 's' && buffer16_[1] == 'h' &&
- buffer16_[2] == 'o' && buffer16_[4] == 't') {
- // short (removed)
- // token = Token::SHORT;
- }
- break;
- case 's':
- // class const false
- if (buffer16_[0] == 'c') {
- if (buffer16_[1] == 'l' &&
- buffer16_[2] == 'a' && buffer16_[4] == 's') {
- // class
- token = Token::CLASS;
- } else if (buffer16_[1] == 'o' &&
- buffer16_[2] == 'n' && buffer16_[4] == 't') {
- // const
- token = Token::CONST;
- }
- } else if (buffer16_[0] == 'f' && buffer16_[1] == 'a' &&
- buffer16_[2] == 'l' && buffer16_[4] == 'e') {
- // false
- token = Token::FALSE_LITERAL;
- }
- break;
- }
- break;
- case 6:
- // double delete export import native
- // public return static switch typeof throws
- // number 0 character is most duplicated
- switch (buffer16_[0]) {
- case 'd':
- // double delete
- if (buffer16_[5] == 'e' &&
- buffer16_[4] == 'l' && buffer16_[3] == 'b' &&
- buffer16_[2] == 'u' && buffer16_[1] == 'o') {
- // double
- // token = Token::DOUBLE;
- } else if (buffer16_[5] == 'e' &&
- buffer16_[4] == 't' && buffer16_[3] == 'e' &&
- buffer16_[2] == 'l' && buffer16_[1] == 'e') {
- // delete
- token = Token::DELETE;
- }
- break;
- case 'e':
- // export
- token = IsMatch("export", len, Token::EXPORT);
- break;
- case 'i':
- // import
- token = IsMatch("import", len, Token::IMPORT);
- break;
- case 'n':
- // native (removed)
- // token = IsMatch("native", len, Token::NATIVE);
- break;
- case 'p':
- // public
- token = IsMatch("public", len, Token::PUBLIC, strict);
- break;
- case 'r':
- // return
- token = IsMatch("return", len, Token::RETURN);
- break;
- case 's':
- // switch static
- if (buffer16_[1] == 'w' &&
- buffer16_[2] == 'i' && buffer16_[3] == 't' &&
- buffer16_[4] == 'c' && buffer16_[5] == 'h') {
- // switch
- token = Token::SWITCH;
- } else if (strict &&
- buffer16_[1] == 't' &&
- buffer16_[2] == 'a' && buffer16_[3] == 't' &&
- buffer16_[4] == 'i' && buffer16_[5] == 'c') {
- // static
- token = Token::STATIC;
- }
- break;
- case 't':
- // typeof throws
- if (buffer16_[5] == 'f' &&
- buffer16_[4] == 'o' && buffer16_[3] == 'e' &&
- buffer16_[2] == 'p' && buffer16_[1] == 'y') {
- // typeof
- token = Token::TYPEOF;
- } else if (buffer16_[5] == 's' &&
- buffer16_[4] == 'w' && buffer16_[3] == 'o' &&
- buffer16_[2] == 'r' && buffer16_[1] == 'h') {
- // throws (removed)
- // token = Token::THROWS;
- }
- break;
- }
- break;
- case 7:
- // boolean default extends finally package private
- // number 0 character is most duplicated
- switch (buffer16_[0]) {
- case 'b':
- // boolean (removed)
- // token = IsMatch("boolean", len, Token::BOOLEAN);
- break;
- case 'd':
- token = IsMatch("default", len, Token::DEFAULT);
- break;
- case 'e':
- token = IsMatch("extends", len, Token::EXTENDS);
- break;
- case 'f':
- token = IsMatch("finally", len, Token::FINALLY);
- break;
- case 'p':
- if (buffer16_[1] == 'a') {
- token = IsMatch("package", len, Token::PACKAGE, strict);
- } else if (buffer16_[1] == 'r') {
- token = IsMatch("private", len, Token::PRIVATE, strict);
- }
- break;
- }
- break;
- case 8:
- // debugger continue abstract volatile function
- // number 4 character is most duplicated
- switch (buffer16_[4]) {
- case 'g':
- token = IsMatch("debugger", len, Token::DEBUGGER);
- break;
- case 'i':
- token = IsMatch("continue", len, Token::CONTINUE);
- break;
- case 'r':
- // abstract (removed)
- // token = IsMatch("abstract", len, Token::ABSTRACT);
- break;
- case 't':
- if (buffer16_[1] == 'o') {
- // token = IsMatch("volatile", len, Token::VOLATILE);
- } else if (buffer16_[1] == 'u') {
- token = IsMatch("function", len, Token::FUNCTION);
- }
- break;
- }
- break;
- case 9:
- // interface protected transient
- if (buffer16_[1] == 'n') {
- token = IsMatch("interface", len, Token::INTERFACE, strict);
- } else if (buffer16_[1] == 'r') {
- if (buffer16_[0] == 'p') {
- token = IsMatch("protected", len, Token::PROTECTED, strict);
- } else if (buffer16_[0] == 't') {
- // transient (removed)
- // token = IsMatch("transient", len, Token::TRANSIENT);
- }
- }
- break;
- case 10:
- // instanceof implements
- if (buffer16_[1] == 'n') {
- token = IsMatch("instanceof", len, Token::INSTANCEOF);
- } else if (buffer16_[1] == 'm') {
- token = IsMatch("implements", len, Token::IMPLEMENTS, strict);
- }
- break;
- case 12:
- // synchronized (removed)
- // token = IsMatch("synchronized", len, Token::SYNCHRONIZED);
- token = Token::IDENTIFIER;
- break;
- }
- return token;
- }
-
- Token::Type DetectGetOrSet() const {
- if (buffer16_.size() == 3) {
- if (buffer16_[1] == 'e' && buffer16_[2] == 't') {
- if (buffer16_[0] == 'g') {
- return Token::GET;
- } else if (buffer16_[0] == 's') {
- return Token::SET;
- }
- }
- }
- return Token::IDENTIFIER;
- }
-
Token::Type ScanString() {
type_ = NONE;
const uc16 quote = c_;
buffer16_.clear();
Advance();
@@ -1029,11 +622,13 @@
// escape sequence
if (c_ < 0) return Token::ILLEGAL;
if (type_ == NONE) {
type_ = ESCAPE;
}
- ScanEscape();
+ if (!ScanEscape()) {
+ return Token::ILLEGAL;
+ }
} else {
Record16Advance();
}
}
if (c_ != quote) {
@@ -1043,14 +638,14 @@
Advance();
return Token::STRING;
}
- void ScanEscape() {
+ bool ScanEscape() {
if (Chars::IsLineTerminator(c_)) {
SkipLineTerminator();
- return;
+ return true;
}
switch (c_) {
case '\'':
case '"' :
case '\\':
@@ -1074,22 +669,34 @@
break;
case 't' :
Record16('\t');
Advance();
break;
- case 'u' :
+ case 'u' : {
Advance();
- Record16(ScanHexEscape('u', 4));
+ bool ng = false;
+ const uc16 uc = ScanHexEscape('u', 4, &ng);
+ if (ng) {
+ return false;
+ }
+ Record16(uc);
break;
+ }
case 'v' :
Record16('\v');
Advance();
break;
- case 'x' :
+ case 'x' : {
Advance();
- Record16(ScanHexEscape('x', 2));
+ bool ng = false;
+ const uc16 uc = ScanHexEscape('x', 2, &ng);
+ if (ng) {
+ return false;
+ }
+ Record16(uc);
break;
+ }
case '0' :
case '1' :
case '2' :
case '3' :
case '4' :
@@ -1100,14 +707,20 @@
type_ = OCTAL;
}
Record16(ScanOctalEscape());
break;
+ case '8' :
+ case '9' :
+ // see ECMA-262 section 7.8.4 and Annex B.1.2
+ return false;
+
default:
Record16Advance();
break;
}
+ return true;
}
Token::Type ScanNumber(const bool period) {
buffer8_.clear();
State type = DECIMAL;
@@ -1179,18 +792,17 @@
return Token::ILLEGAL;
}
if (type == OCTAL) {
double val = 0;
- for (std::vector<char>::const_iterator it = buffer8_.begin(),
+ for (std::string::const_iterator it = buffer8_.begin(),
last = buffer8_.end(); it != last; ++it) {
val = val * 8 + (*it - '0');
}
numeric_ = val;
} else {
- Record8('\0'); // Null Terminated String
- numeric_ = std::strtod(buffer8_.data(), NULL);
+ numeric_ = std::strtod(buffer8_.c_str(), NULL);
}
type_ = type;
return Token::NUMBER;
}
@@ -1209,18 +821,19 @@
Advance();
}
return res;
}
- uc16 ScanHexEscape(uc16 c, int len) {
+ uc16 ScanHexEscape(uc16 c, int len, bool* ng) {
uc16 res = 0;
for (int i = 0; i < len; ++i) {
const int d = HexValue(c_);
if (d < 0) {
for (int j = i - 1; j >= 0; --j) {
PushBack();
}
+ *ng = true;
return c;
}
res = res * 16 + d;
Advance();
}
@@ -1260,11 +873,11 @@
Advance();
}
++line_number_;
}
- BasicSource* source_;
- std::vector<char> buffer8_;
+ const Source* source_;
+ std::string buffer8_;
std::vector<uc16> buffer16_;
double numeric_;
State type_;
std::size_t pos_;
const std::size_t end_;