/*********************************************************************** A JavaScript tokenizer / parser / beautifier / compressor. This version is suitable for Node.js. With minimal changes (the exports stuff) it should work on any JS platform. This file contains the tokenizer/parser. It is a port to JavaScript of parse-js [1], a JavaScript parser library written in Common Lisp by Marijn Haverbeke. Thank you Marijn! [1] http://marijn.haverbeke.nl/parse-js/ Exported functions: - tokenizer(code) -- returns a function. Call the returned function to fetch the next token. - parse(code) -- returns an AST of the given JavaScript code. -------------------------------- (C) --------------------------------- Author: Mihai Bazon http://mihai.bazon.net/blog Distributed under the same terms as the original code (ZLIB license): Copyright 2010 (c) Mihai Bazon Based on parse-js (http://marijn.haverbeke.nl/parse-js/). This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. ***********************************************************************/ /* -----[ Tokenizer (constants) ]----- */ var KEYWORDS = array_to_hash([ "break", "case", "catch", "const", "continue", "default", "delete", "do", "else", "finally", "for", "function", "if", "in", "instanceof", "new", "return", "switch", "throw", "try", "typeof", "var", "void", "while", "with" ]); var RESERVED_WORDS = array_to_hash([ "abstract", "boolean", "byte", "char", "class", "debugger", "double", "enum", "export", "extends", "final", "float", "goto", "implements", "import", "int", "interface", "long", "native", "package", "private", "protected", "public", "short", "static", "super", "synchronized", "throws", "transient", "volatile" ]); var KEYWORDS_BEFORE_EXPRESSION = array_to_hash([ "return", "new", "delete", "throw", "else" ]); var KEYWORDS_ATOM = array_to_hash([ "false", "null", "true", "undefined" ]); var OPERATOR_CHARS = array_to_hash(characters("+-*&%=<>!?|~^")); var RE_HEX_NUMBER = /^0x[0-9a-f]+$/i; var RE_OCT_NUMBER = /^0[0-7]+$/; var RE_DEC_NUMBER = /^\d*\.?\d*(?:e[+-]?\d*(?:\d\.?|\.?\d)\d*)?$/i; var OPERATORS = array_to_hash([ "in", "instanceof", "typeof", "new", "void", "delete", "++", "--", "+", "-", "!", "~", "&", "|", "^", "*", "/", "%", ">>", "<<", ">>>", "<", ">", "<=", ">=", "==", "===", "!=", "!==", "?", "=", "+=", "-=", "/=", "*=", "%=", ">>=", "<<=", ">>>=", "~=", "%=", "|=", "^=", "&=", "&&", "||" ]); var WHITESPACE_CHARS = array_to_hash(characters(" \n\r\t")); var PUNC_BEFORE_EXPRESSION = array_to_hash(characters("[{}(,.;:")); var PUNC_CHARS = array_to_hash(characters("[]{}(),;:")); var REGEXP_MODIFIERS = array_to_hash(characters("gmsiy")); /* -----[ Tokenizer ]----- */ function is_alphanumeric_char(ch) { ch = ch.charCodeAt(0); return (ch >= 48 && ch <= 57) || (ch >= 65 && ch <= 90) || (ch >= 97 && ch <= 122); }; function is_identifier_char(ch) { return is_alphanumeric_char(ch) || ch == "$" || ch == "_"; }; function is_digit(ch) { ch = ch.charCodeAt(0); return ch >= 48 && ch <= 57; }; function parse_js_number(num) { if (RE_HEX_NUMBER.test(num)) { return parseInt(num.substr(2), 16); } else if (RE_OCT_NUMBER.test(num)) { return parseInt(num.substr(1), 8); } else if (RE_DEC_NUMBER.test(num)) { return parseFloat(num); } }; function JS_Parse_Error(message, line, col, pos) { this.message = message; this.line = line; this.col = col; this.pos = pos; try { ({})(); } catch(ex) { this.stack = ex.stack; }; }; JS_Parse_Error.prototype.toString = function() { return this.message + " (line: " + this.line + ", col: " + this.col + ", pos: " + this.pos + ")" + "\n\n" + this.stack; }; function js_error(message, line, col, pos) { throw new JS_Parse_Error(message, line, col, pos); }; function is_token(token, type, val) { return token.type == type && (val == null || token.value == val); }; var EX_EOF = {}; function tokenizer($TEXT, skip_comments) { var S = { text : $TEXT.replace(/\r\n?|[\n\u2028\u2029]/g, "\n").replace(/^\uFEFF/, ''), pos : 0, tokpos : 0, line : 0, tokline : 0, col : 0, tokcol : 0, newline_before : false, regex_allowed : false }; function peek() { return S.text.charAt(S.pos); }; function next(signal_eof) { var ch = S.text.charAt(S.pos++); if (signal_eof && !ch) throw EX_EOF; if (ch == "\n") { S.newline_before = true; ++S.line; S.col = 0; } else { ++S.col; } return ch; }; function eof() { return !S.peek(); }; function find(what, signal_eof) { var pos = S.text.indexOf(what, S.pos); if (signal_eof && pos == -1) throw EX_EOF; return pos; }; function start_token() { S.tokline = S.line; S.tokcol = S.col; S.tokpos = S.pos; }; function token(type, value) { S.regex_allowed = ((type == "operator" && !HOP(UNARY_POSTFIX, value)) || (type == "keyword" && HOP(KEYWORDS_BEFORE_EXPRESSION, value)) || (type == "punc" && HOP(PUNC_BEFORE_EXPRESSION, value))); var ret = { type : type, value : value, line : S.tokline, col : S.tokcol, pos : S.tokpos, nlb : S.newline_before }; S.newline_before = false; return ret; }; function skip_whitespace() { while (HOP(WHITESPACE_CHARS, peek())) next(); }; function read_while(pred) { var ret = "", ch = peek(), i = 0; while (ch && pred(ch, i++)) { ret += next(); ch = peek(); } return ret; }; function parse_error(err) { js_error(err, S.tokline, S.tokcol, S.tokpos); }; function read_num(prefix) { var has_e = false, after_e = false, has_x = false; var num = read_while(function(ch, i){ if (ch == "x" || ch == "X") { if (has_x) return false; return has_x = true; } if (!has_x && (ch == "E" || ch == "e")) { if (has_e) return false; return has_e = after_e = true; } if (ch == "-") { if (after_e || (i == 0 && !prefix)) return true; return false; } if (ch == "+") return after_e; after_e = false; return is_alphanumeric_char(ch) || ch == "."; }); if (prefix) num = prefix + num; var valid = parse_js_number(num); if (!isNaN(valid)) { return token("num", valid); } else { parse_error("Invalid syntax: " + num); } }; function read_escaped_char() { var ch = next(true); switch (ch) { case "n" : return "\n"; case "r" : return "\r"; case "t" : return "\t"; case "b" : return "\b"; case "v" : return "\v"; case "f" : return "\f"; case "0" : return "\0"; case "x" : return String.fromCharCode(hex_bytes(2)); case "u" : return String.fromCharCode(hex_bytes(4)); default : return ch; } }; function hex_bytes(n) { var num = 0; for (; n > 0; --n) { var digit = parseInt(next(true), 16); if (isNaN(digit)) parse_error("Invalid hex-character pattern in string"); num = (num << 4) | digit; } return num; }; function read_string() { return with_eof_error("Unterminated string constant", function(){ var quote = next(), ret = ""; for (;;) { var ch = next(true); if (ch == "\\") ch = read_escaped_char(); else if (ch == quote) break; ret += ch; } return token("string", ret); }); }; function read_line_comment() { next(); var i = find("\n"), ret; if (i == -1) { ret = S.text.substr(S.pos); S.pos = S.text.length; } else { ret = S.text.substring(S.pos, i); S.pos = i; } return token("comment1", ret); }; function read_multiline_comment() { next(); return with_eof_error("Unterminated multiline comment", function(){ var i = find("*/", true), text = S.text.substring(S.pos, i), tok = token("comment2", text); S.pos = i + 2; S.newline_before = text.indexOf("\n") >= 0; return tok; }); }; function read_regexp() { return with_eof_error("Unterminated regular expression", function(){ var prev_backslash = false, regexp = "", ch, in_class = false; while ((ch = next(true))) if (prev_backslash) { regexp += "\\" + ch; prev_backslash = false; } else if (ch == "[") { in_class = true; regexp += ch; } else if (ch == "]" && in_class) { in_class = false; regexp += ch; } else if (ch == "/" && !in_class) { break; } else if (ch == "\\") { prev_backslash = true; } else { regexp += ch; } var mods = read_while(function(ch){ return HOP(REGEXP_MODIFIERS, ch); }); return token("regexp", [ regexp, mods ]); }); }; function read_operator(prefix) { function grow(op) { var bigger = op + peek(); if (HOP(OPERATORS, bigger)) { next(); return grow(bigger); } else { return op; } }; return token("operator", grow(prefix || next())); }; var handle_slash = skip_comments ? function() { next(); var regex_allowed = S.regex_allowed; switch (peek()) { case "/": read_line_comment(); S.regex_allowed = regex_allowed; return next_token(); case "*": read_multiline_comment(); S.regex_allowed = regex_allowed; return next_token(); } return S.regex_allowed ? read_regexp() : read_operator("/"); } : function() { next(); switch (peek()) { case "/": return read_line_comment(); case "*": return read_multiline_comment(); } return S.regex_allowed ? read_regexp() : read_operator("/"); }; function handle_dot() { next(); return is_digit(peek()) ? read_num(".") : token("punc", "."); }; function read_word() { var word = read_while(is_identifier_char); return !HOP(KEYWORDS, word) ? token("name", word) : HOP(OPERATORS, word) ? token("operator", word) : HOP(KEYWORDS_ATOM, word) ? token("atom", word) : token("keyword", word); }; function with_eof_error(eof_error, cont) { try { return cont(); } catch(ex) { if (ex === EX_EOF) parse_error(eof_error); else throw ex; } }; function next_token(force_regexp) { if (force_regexp) return read_regexp(); skip_whitespace(); start_token(); var ch = peek(); if (!ch) return token("eof"); if (is_digit(ch)) return read_num(); if (ch == '"' || ch == "'") return read_string(); if (HOP(PUNC_CHARS, ch)) return token("punc", next()); if (ch == ".") return handle_dot(); if (ch == "/") return handle_slash(); if (HOP(OPERATOR_CHARS, ch)) return read_operator(); if (is_identifier_char(ch)) return read_word(); parse_error("Unexpected character '" + ch + "'"); }; next_token.context = function(nc) { if (nc) S = nc; return S; }; return next_token; }; /* -----[ Parser (constants) ]----- */ var UNARY_PREFIX = array_to_hash([ "typeof", "void", "delete", "--", "++", "!", "~", "-", "+" ]); var UNARY_POSTFIX = array_to_hash([ "--", "++" ]); var ASSIGNMENT = (function(a, ret, i){ while (i < a.length) { ret[a[i]] = a[i].substr(0, a[i].length - 1); i++; } return ret; })( ["+=", "-=", "/=", "*=", "%=", ">>=", "<<=", ">>>=", "~=", "%=", "|=", "^=", "&="], { "=": true }, 0 ); var PRECEDENCE = (function(a, ret){ for (var i = 0, n = 1; i < a.length; ++i, ++n) { var b = a[i]; for (var j = 0; j < b.length; ++j) { ret[b[j]] = n; } } return ret; })( [ ["||"], ["&&"], ["|"], ["^"], ["&"], ["==", "===", "!=", "!=="], ["<", ">", "<=", ">=", "in", "instanceof"], [">>", "<<", ">>>"], ["+", "-"], ["*", "/", "%"] ], {} ); var STATEMENTS_WITH_LABELS = array_to_hash([ "for", "do", "while", "switch" ]); var ATOMIC_START_TOKEN = array_to_hash([ "atom", "num", "string", "regexp", "name" ]); /* -----[ Parser ]----- */ function NodeWithToken(str, start, end) { this.name = str; this.start = start; this.end = end; }; NodeWithToken.prototype.toString = function() { return this.name; }; function parse($TEXT, strict_mode, embed_tokens) { var S = { input: tokenizer($TEXT, true), token: null, prev: null, peeked: null, in_function: 0, in_loop: 0, labels: [] }; S.token = next(); function is(type, value) { return is_token(S.token, type, value); }; function peek() { return S.peeked || (S.peeked = S.input()); }; function next() { S.prev = S.token; if (S.peeked) { S.token = S.peeked; S.peeked = null; } else { S.token = S.input(); } return S.token; }; function prev() { return S.prev; }; function croak(msg, line, col, pos) { var ctx = S.input.context(); js_error(msg, line != null ? line : ctx.tokline, col != null ? col : ctx.tokcol, pos != null ? pos : ctx.tokpos); }; function token_error(token, msg) { croak(msg, token.line, token.col); }; function unexpected(token) { if (token == null) token = S.token; token_error(token, "Unexpected token: " + token.type + " (" + token.value + ")"); }; function expect_token(type, val) { if (is(type, val)) { return next(); } token_error(S.token, "Unexpected token " + S.token.type + ", expected " + type); }; function expect(punc) { return expect_token("punc", punc); }; function can_insert_semicolon() { return !strict_mode && ( S.token.nlb || is("eof") || is("punc", "}") ); }; function semicolon() { if (is("punc", ";")) next(); else if (!can_insert_semicolon()) unexpected(); }; function as() { return slice(arguments); }; function parenthesised() { expect("("); var ex = expression(); expect(")"); return ex; }; function add_tokens(str, start, end) { return new NodeWithToken(str, start, end); }; var statement = embed_tokens ? function() { var start = S.token; var stmt = $statement(); stmt[0] = add_tokens(stmt[0], start, prev()); return stmt; } : $statement; function $statement() { if (is("operator", "/")) { S.peeked = null; S.token = S.input(true); // force regexp } switch (S.token.type) { case "num": case "string": case "regexp": case "operator": case "atom": return simple_statement(); case "name": return is_token(peek(), "punc", ":") ? labeled_statement(prog1(S.token.value, next, next)) : simple_statement(); case "punc": switch (S.token.value) { case "{": return as("block", block_()); case "[": case "(": return simple_statement(); case ";": next(); return as("block"); default: unexpected(); } case "keyword": switch (prog1(S.token.value, next)) { case "break": return break_cont("break"); case "continue": return break_cont("continue"); case "debugger": semicolon(); return as("debugger"); case "do": return (function(body){ expect_token("keyword", "while"); return as("do", prog1(parenthesised, semicolon), body); })(in_loop(statement)); case "for": return for_(); case "function": return function_(true); case "if": return if_(); case "return": if (S.in_function == 0) croak("'return' outside of function"); return as("return", is("punc", ";") ? (next(), null) : can_insert_semicolon() ? null : prog1(expression, semicolon)); case "switch": return as("switch", parenthesised(), switch_block_()); case "throw": return as("throw", prog1(expression, semicolon)); case "try": return try_(); case "var": return prog1(var_, semicolon); case "const": return prog1(const_, semicolon); case "while": return as("while", parenthesised(), in_loop(statement)); case "with": return as("with", parenthesised(), statement()); default: unexpected(); } } }; function labeled_statement(label) { S.labels.push(label); var start = S.token, stat = statement(); if (strict_mode && !HOP(STATEMENTS_WITH_LABELS, stat[0])) unexpected(start); S.labels.pop(); return as("label", label, stat); }; function simple_statement() { return as("stat", prog1(expression, semicolon)); }; function break_cont(type) { var name = is("name") ? S.token.value : null; if (name != null) { next(); if (!member(name, S.labels)) croak("Label " + name + " without matching loop or statement"); } else if (S.in_loop == 0) croak(type + " not inside a loop or switch"); semicolon(); return as(type, name); }; function for_() { expect("("); var has_var = is("keyword", "var"); if (has_var) next(); if (is("name") && is_token(peek(), "operator", "in")) { // for (i in foo) var name = S.token.value; next(); next(); var obj = expression(); expect(")"); return as("for-in", has_var, name, obj, in_loop(statement)); } else { // classic for var init = is("punc", ";") ? null : has_var ? var_() : expression(); expect(";"); var test = is("punc", ";") ? null : expression(); expect(";"); var step = is("punc", ")") ? null : expression(); expect(")"); return as("for", init, test, step, in_loop(statement)); } }; function function_(in_statement) { var name = is("name") ? prog1(S.token.value, next) : null; if (in_statement && !name) unexpected(); expect("("); return as(in_statement ? "defun" : "function", name, // arguments (function(first, a){ while (!is("punc", ")")) { if (first) first = false; else expect(","); if (!is("name")) unexpected(); a.push(S.token.value); next(); } next(); return a; })(true, []), // body (function(){ ++S.in_function; var loop = S.in_loop; S.in_loop = 0; var a = block_(); --S.in_function; S.in_loop = loop; return a; })()); }; function if_() { var cond = parenthesised(), body = statement(), belse; if (is("keyword", "else")) { next(); belse = statement(); } return as("if", cond, body, belse); }; function block_() { expect("{"); var a = []; while (!is("punc", "}")) { if (is("eof")) unexpected(); a.push(statement()); } next(); return a; }; var switch_block_ = curry(in_loop, function(){ expect("{"); var a = [], cur = null; while (!is("punc", "}")) { if (is("eof")) unexpected(); if (is("keyword", "case")) { next(); cur = []; a.push([ expression(), cur ]); expect(":"); } else if (is("keyword", "default")) { next(); expect(":"); cur = []; a.push([ null, cur ]); } else { if (!cur) unexpected(); cur.push(statement()); } } next(); return a; }); function try_() { var body = block_(), bcatch, bfinally; if (is("keyword", "catch")) { next(); expect("("); if (!is("name")) croak("Name expected"); var name = S.token.value; next(); expect(")"); bcatch = [ name, block_() ]; } if (is("keyword", "finally")) { next(); bfinally = block_(); } if (!bcatch && !bfinally) croak("Missing catch/finally blocks"); return as("try", body, bcatch, bfinally); }; function vardefs() { var a = []; for (;;) { if (!is("name")) unexpected(); var name = S.token.value; next(); if (is("operator", "=")) { next(); a.push([ name, expression(false) ]); } else { a.push([ name ]); } if (!is("punc", ",")) break; next(); } return a; }; function var_() { return as("var", vardefs()); }; function const_() { return as("const", vardefs()); }; function new_() { var newexp = expr_atom(false), args; if (is("punc", "(")) { next(); args = expr_list(")"); } else { args = []; } return subscripts(as("new", newexp, args), true); }; function expr_atom(allow_calls) { if (is("operator", "new")) { next(); return new_(); } if (is("operator") && HOP(UNARY_PREFIX, S.token.value)) { return make_unary("unary-prefix", prog1(S.token.value, next), expr_atom(allow_calls)); } if (is("punc")) { switch (S.token.value) { case "(": next(); return subscripts(prog1(expression, curry(expect, ")")), allow_calls); case "[": next(); return subscripts(array_(), allow_calls); case "{": next(); return subscripts(object_(), allow_calls); } unexpected(); } if (is("keyword", "function")) { next(); return subscripts(function_(false), allow_calls); } if (HOP(ATOMIC_START_TOKEN, S.token.type)) { var atom = S.token.type == "regexp" ? as("regexp", S.token.value[0], S.token.value[1]) : as(S.token.type, S.token.value); return subscripts(prog1(atom, next), allow_calls); } unexpected(); }; function expr_list(closing, allow_trailing_comma) { var first = true, a = []; while (!is("punc", closing)) { if (first) first = false; else expect(","); if (allow_trailing_comma && is("punc", closing)) break; a.push(expression(false)); } next(); return a; }; function array_() { return as("array", expr_list("]", !strict_mode)); }; function object_() { var first = true, a = []; while (!is("punc", "}")) { if (first) first = false; else expect(","); if (!strict_mode && is("punc", "}")) // allow trailing comma break; var name = as_property_name(); expect(":"); var value = expression(false); a.push([ name, value ]); } next(); return as("object", a); }; function as_property_name() { switch (S.token.type) { case "num": case "string": return prog1(S.token.value, next); } return as_name(); }; function as_name() { switch (S.token.type) { case "name": case "operator": case "keyword": case "atom": return prog1(S.token.value, next); default: unexpected(); } }; function subscripts(expr, allow_calls) { if (is("punc", ".")) { next(); return subscripts(as("dot", expr, as_name()), allow_calls); } if (is("punc", "[")) { next(); return subscripts(as("sub", expr, prog1(expression, curry(expect, "]"))), allow_calls); } if (allow_calls && is("punc", "(")) { next(); return subscripts(as("call", expr, expr_list(")")), true); } if (allow_calls && is("operator") && HOP(UNARY_POSTFIX, S.token.value)) { return prog1(curry(make_unary, "unary-postfix", S.token.value, expr), next); } return expr; }; function make_unary(tag, op, expr) { if ((op == "++" || op == "--") && !is_assignable(expr)) croak("Invalid use of " + op + " operator"); return as(tag, op, expr); }; function expr_op(left, min_prec) { var op = is("operator") ? S.token.value : null; var prec = op != null ? PRECEDENCE[op] : null; if (prec != null && prec > min_prec) { next(); var right = expr_op(expr_atom(true), prec); return expr_op(as("binary", op, left, right), min_prec); } return left; }; function expr_ops() { return expr_op(expr_atom(true), 0); }; function maybe_conditional() { var expr = expr_ops(); if (is("operator", "?")) { next(); var yes = expression(false); expect(":"); return as("conditional", expr, yes, expression(false)); } return expr; }; function is_assignable(expr) { switch (expr[0]) { case "dot": case "sub": return true; case "name": return expr[1] != "this"; } }; function maybe_assign() { var left = maybe_conditional(), val = S.token.value; if (is("operator") && HOP(ASSIGNMENT, val)) { if (is_assignable(left)) { next(); return as("assign", ASSIGNMENT[val], left, maybe_assign()); } croak("Invalid assignment"); } return left; }; function expression(commas) { if (arguments.length == 0) commas = true; var expr = maybe_assign(); if (commas && is("punc", ",")) { next(); return as("seq", expr, expression()); } return expr; }; function in_loop(cont) { try { ++S.in_loop; return cont(); } finally { --S.in_loop; } }; return as("toplevel", (function(a){ while (!is("eof")) a.push(statement()); return a; })([])); }; /* -----[ Utilities ]----- */ function curry(f) { var args = slice(arguments, 1); return function() { return f.apply(this, args.concat(slice(arguments))); }; }; function prog1(ret) { if (ret instanceof Function) ret = ret(); for (var i = 1, n = arguments.length; --n > 0; ++i) arguments[i](); return ret; }; function array_to_hash(a) { var ret = {}; for (var i = 0; i < a.length; ++i) ret[a[i]] = true; return ret; }; function slice(a, start) { return Array.prototype.slice.call(a, start == null ? 0 : start); }; function characters(str) { return str.split(""); }; function member(name, array) { for (var i = array.length; --i >= 0;) if (array[i] === name) return true; return false; }; function HOP(obj, prop) { return Object.prototype.hasOwnProperty.call(obj, prop); }; /* -----[ Exports ]----- */ exports.tokenizer = tokenizer; exports.parse = parse; exports.slice = slice; exports.curry = curry; exports.member = member; exports.array_to_hash = array_to_hash; exports.PRECEDENCE = PRECEDENCE; exports.KEYWORDS_ATOM = KEYWORDS_ATOM; exports.RESERVED_WORDS = RESERVED_WORDS; exports.KEYWORDS = KEYWORDS; exports.ATOMIC_START_TOKEN = ATOMIC_START_TOKEN; exports.OPERATORS = OPERATORS; exports.is_alphanumeric_char = is_alphanumeric_char;