var functions = require("./functions"); var Lexer = require("./Lexer"); var symbols = require("./symbols"); var utils = require("./utils"); var ParseError = require("./ParseError"); /** * This file contains the parser used to parse out a TeX expression from the * input. Since TeX isn't context-free, standard parsers don't work particularly * well. * * The strategy of this parser is as such: * * The main functions (the `.parse...` ones) take a position in the current * parse string to parse tokens from. The lexer (found in Lexer.js, stored at * this.lexer) also supports pulling out tokens at arbitrary places. When * individual tokens are needed at a position, the lexer is called to pull out a * token, which is then used. * * The main functions also take a mode that the parser is currently in * (currently "math" or "text"), which denotes whether the current environment * is a math-y one or a text-y one (e.g. inside \text). Currently, this serves * to limit the functions which can be used in text mode. * * The main functions then return an object which contains the useful data that * was parsed at its given point, and a new position at the end of the parsed * data. The main functions can call each other and continue the parsing by * using the returned position as a new starting point. * * There are also extra `.handle...` functions, which pull out some reused * functionality into self-contained functions. * * The earlier functions return `ParseResult`s, which contain a ParseNode and a * new position. * * The later functions (which are called deeper in the parse) sometimes return * ParseFuncOrArgument, which contain a ParseResult as well as some data about * whether the parsed object is a function which is missing some arguments, or a * standalone object which can be used as an argument to another function. */ /** * Main Parser class */ function Parser(input) { // Make a new lexer this.lexer = new Lexer(input); } /** * The resulting parse tree nodes of the parse tree. */ function ParseNode(type, value, mode) { this.type = type; this.value = value; this.mode = mode; } /** * A result and final position returned by the `.parse...` functions. */ function ParseResult(result, newPosition) { this.result = result; this.position = newPosition; } /** * An initial function (without its arguments), or an argument to a function. * The `result` argument should be a ParseResult. */ function ParseFuncOrArgument(result, isFunction, allowedInText, numArgs, numOptionalArgs, argTypes) { this.result = result; // Is this a function (i.e. is it something defined in functions.js)? this.isFunction = isFunction; // Is it allowed in text mode? this.allowedInText = allowedInText; // How many arguments? this.numArgs = numArgs; // How many optional arguments? this.numOptionalArgs = numOptionalArgs; // What types of arguments? this.argTypes = argTypes; } /** * Checks a result to make sure it has the right type, and throws an * appropriate error otherwise. */ Parser.prototype.expect = function(result, text) { if (result.text !== text) { throw new ParseError( "Expected '" + text + "', got '" + result.text + "'", this.lexer, result.position ); } }; /** * Main parsing function, which parses an entire input. * * @return {?Array.} */ Parser.prototype.parse = function(input) { // Try to parse the input var parse = this.parseInput(0, "math"); return parse.result; }; /** * Parses an entire input tree. */ Parser.prototype.parseInput = function(pos, mode) { // Parse an expression var expression = this.parseExpression(pos, mode, false, null); // If we succeeded, make sure there's an EOF at the end var EOF = this.lexer.lex(expression.position, mode); this.expect(EOF, "EOF"); return expression; }; /** * Parses an "expression", which is a list of atoms. * * @param {boolean} breakOnInfix Should the parsing stop when we hit infix * nodes? This happens when functions have higher precendence * than infix nodes in implicit parses. * * @param {?string} breakOnToken The token that the expression should end with, * or `null` if something else should end the expression. * * @return {ParseResult} */ Parser.prototype.parseExpression = function(pos, mode, breakOnInfix, breakOnToken) { var body = []; // Keep adding atoms to the body until we can't parse any more atoms (either // we reached the end, a }, or a \right) while (true) { var lex = this.lexer.lex(pos, mode); if (breakOnToken != null && lex.text === breakOnToken) { break; } var atom = this.parseAtom(pos, mode); if (!atom) { break; } if (breakOnInfix && atom.result.type === "infix") { break; } body.push(atom.result); pos = atom.position; } return new ParseResult(this.handleInfixNodes(body, mode), pos); }; /** * Rewrites infix operators such as \over with corresponding commands such * as \frac. * * There can only be one infix operator per group. If there's more than one * then the expression is ambiguous. This can be resolved by adding {}. * * @returns {Array} */ Parser.prototype.handleInfixNodes = function (body, mode) { var overIndex = -1; var func; var funcName; for (var i = 0; i < body.length; i++) { var node = body[i]; if (node.type === "infix") { if (overIndex !== -1) { throw new ParseError("only one infix operator per group", this.lexer, -1); } overIndex = i; funcName = node.value.replaceWith; func = functions.funcs[funcName]; } } if (overIndex !== -1) { var numerNode, denomNode; var numerBody = body.slice(0, overIndex); var denomBody = body.slice(overIndex + 1); if (numerBody.length === 1 && numerBody[0].type === "ordgroup") { numerNode = numerBody[0]; } else { numerNode = new ParseNode("ordgroup", numerBody, mode); } if (denomBody.length === 1 && denomBody[0].type === "ordgroup") { denomNode = denomBody[0]; } else { denomNode = new ParseNode("ordgroup", denomBody, mode); } var value = func.handler(funcName, numerNode, denomNode); return [new ParseNode(value.type, value, mode)]; } else { return body; } }; // The greediness of a superscript or subscript var SUPSUB_GREEDINESS = 1; /** * Handle a subscript or superscript with nice errors. */ Parser.prototype.handleSupSubscript = function(pos, mode, symbol, name) { var group = this.parseGroup(pos, mode); if (!group) { throw new ParseError( "Expected group after '" + symbol + "'", this.lexer, pos); } else if (group.numArgs > 0) { // ^ and _ have a greediness, so handle interactions with functions' // greediness var funcGreediness = functions.getGreediness(group.result.result); if (funcGreediness > SUPSUB_GREEDINESS) { return this.parseFunction(pos, mode); } else { throw new ParseError( "Got function '" + group.result.result + "' with no arguments " + "as " + name, this.lexer, pos); } } else { return group.result; } }; /** * Parses a group with optional super/subscripts. * * @return {?ParseResult} */ Parser.prototype.parseAtom = function(pos, mode) { // The body of an atom is an implicit group, so that things like // \left(x\right)^2 work correctly. var base = this.parseImplicitGroup(pos, mode); // In text mode, we don't have superscripts or subscripts if (mode === "text") { return base; } // Handle an empty base var currPos; if (!base) { currPos = pos; base = undefined; } else { currPos = base.position; } var superscript; var subscript; var result; while (true) { // Lex the first token var lex = this.lexer.lex(currPos, mode); if (lex.text === "^") { // We got a superscript start if (superscript) { throw new ParseError( "Double superscript", this.lexer, currPos); } result = this.handleSupSubscript( lex.position, mode, lex.text, "superscript"); currPos = result.position; superscript = result.result; } else if (lex.text === "_") { // We got a subscript start if (subscript) { throw new ParseError( "Double subscript", this.lexer, currPos); } result = this.handleSupSubscript( lex.position, mode, lex.text, "subscript"); currPos = result.position; subscript = result.result; } else if (lex.text === "'") { // We got a prime var prime = new ParseNode("textord", "\\prime", mode); // Many primes can be grouped together, so we handle this here var primes = [prime]; currPos = lex.position; // Keep lexing tokens until we get something that's not a prime while ((lex = this.lexer.lex(currPos, mode)).text === "'") { // For each one, add another prime to the list primes.push(prime); currPos = lex.position; } // Put them into an ordgroup as the superscript superscript = new ParseNode("ordgroup", primes, mode); } else { // If it wasn't ^, _, or ', stop parsing super/subscripts break; } } if (superscript || subscript) { // If we got either a superscript or subscript, create a supsub return new ParseResult( new ParseNode("supsub", { base: base && base.result, sup: superscript, sub: subscript }, mode), currPos); } else { // Otherwise return the original body return base; } }; // A list of the size-changing functions, for use in parseImplicitGroup var sizeFuncs = [ "\\tiny", "\\scriptsize", "\\footnotesize", "\\small", "\\normalsize", "\\large", "\\Large", "\\LARGE", "\\huge", "\\Huge" ]; // A list of the style-changing functions, for use in parseImplicitGroup var styleFuncs = [ "\\displaystyle", "\\textstyle", "\\scriptstyle", "\\scriptscriptstyle" ]; /** * Parses an implicit group, which is a group that starts at the end of a * specified, and ends right before a higher explicit group ends, or at EOL. It * is used for functions that appear to affect the current style, like \Large or * \textrm, where instead of keeping a style we just pretend that there is an * implicit grouping after it until the end of the group. E.g. * small text {\Large large text} small text again * It is also used for \left and \right to get the correct grouping. * * @return {?ParseResult} */ Parser.prototype.parseImplicitGroup = function(pos, mode) { var start = this.parseSymbol(pos, mode); if (!start || !start.result) { // If we didn't get anything we handle, fall back to parseFunction return this.parseFunction(pos, mode); } var func = start.result.result; var body; if (func === "\\left") { // If we see a left: // Parse the entire left function (including the delimiter) var left = this.parseFunction(pos, mode); // Parse out the implicit body body = this.parseExpression(left.position, mode, false, "}"); // Check the next token var rightLex = this.parseSymbol(body.position, mode); if (rightLex && rightLex.result.result === "\\right") { // If it's a \right, parse the entire right function (including the delimiter) var right = this.parseFunction(body.position, mode); return new ParseResult( new ParseNode("leftright", { body: body.result, left: left.result.value.value, right: right.result.value.value }, mode), right.position); } else { throw new ParseError("Missing \\right", this.lexer, body.position); } } else if (func === "\\right") { // If we see a right, explicitly fail the parsing here so the \left // handling ends the group return null; } else if (utils.contains(sizeFuncs, func)) { // If we see a sizing function, parse out the implict body body = this.parseExpression(start.result.position, mode, false, "}"); return new ParseResult( new ParseNode("sizing", { // Figure out what size to use based on the list of functions above size: "size" + (utils.indexOf(sizeFuncs, func) + 1), value: body.result }, mode), body.position); } else if (utils.contains(styleFuncs, func)) { // If we see a styling function, parse out the implict body body = this.parseExpression(start.result.position, mode, true, "}"); return new ParseResult( new ParseNode("styling", { // Figure out what style to use by pulling out the style from // the function name style: func.slice(1, func.length - 5), value: body.result }, mode), body.position); } else { // Defer to parseFunction if it's not a function we handle return this.parseFunction(pos, mode); } }; /** * Parses an entire function, including its base and all of its arguments * * @return {?ParseResult} */ Parser.prototype.parseFunction = function(pos, mode) { var baseGroup = this.parseGroup(pos, mode); if (baseGroup) { if (baseGroup.isFunction) { var func = baseGroup.result.result; if (mode === "text" && !baseGroup.allowedInText) { throw new ParseError( "Can't use function '" + func + "' in text mode", this.lexer, baseGroup.position); } var newPos = baseGroup.result.position; var result; var totalArgs = baseGroup.numArgs + baseGroup.numOptionalArgs; if (totalArgs > 0) { var baseGreediness = functions.getGreediness(func); var args = [func]; var positions = [newPos]; for (var i = 0; i < totalArgs; i++) { var argType = baseGroup.argTypes && baseGroup.argTypes[i]; var arg; if (i < baseGroup.numOptionalArgs) { if (argType) { arg = this.parseSpecialGroup(newPos, argType, mode, true); } else { arg = this.parseOptionalGroup(newPos, mode); } if (!arg) { args.push(null); positions.push(newPos); continue; } } else { if (argType) { arg = this.parseSpecialGroup(newPos, argType, mode); } else { arg = this.parseGroup(newPos, mode); } if (!arg) { throw new ParseError( "Expected group after '" + baseGroup.result.result + "'", this.lexer, newPos); } } var argNode; if (arg.numArgs > 0) { var argGreediness = functions.getGreediness(arg.result.result); if (argGreediness > baseGreediness) { argNode = this.parseFunction(newPos, mode); } else { throw new ParseError( "Got function '" + arg.result.result + "' as " + "argument to function '" + baseGroup.result.result + "'", this.lexer, arg.result.position - 1); } } else { argNode = arg.result; } args.push(argNode.result); positions.push(argNode.position); newPos = argNode.position; } args.push(positions); result = functions.funcs[func].handler.apply(this, args); } else { result = functions.funcs[func].handler.apply(this, [func]); } return new ParseResult( new ParseNode(result.type, result, mode), newPos); } else { return baseGroup.result; } } else { return null; } }; /** * Parses a group when the mode is changing. Takes a position, a new mode, and * an outer mode that is used to parse the outside. * * @return {?ParseFuncOrArgument} */ Parser.prototype.parseSpecialGroup = function(pos, mode, outerMode, optional) { if (mode === "color" || mode === "size") { // color and size modes are special because they should have braces and // should only lex a single symbol inside var openBrace = this.lexer.lex(pos, outerMode); if (optional && openBrace.text !== "[") { // optional arguments should return null if they don't exist return null; } this.expect(openBrace, optional ? "[" : "{"); var inner = this.lexer.lex(openBrace.position, mode); var data; if (mode === "color") { data = inner.text; } else { data = inner.data; } var closeBrace = this.lexer.lex(inner.position, outerMode); this.expect(closeBrace, optional ? "]" : "}"); return new ParseFuncOrArgument( new ParseResult( new ParseNode(mode, data, outerMode), closeBrace.position), false); } else if (mode === "text") { // text mode is special because it should ignore the whitespace before // it var whitespace = this.lexer.lex(pos, "whitespace"); pos = whitespace.position; } if (optional) { return this.parseOptionalGroup(pos, mode); } else { return this.parseGroup(pos, mode); } }; /** * Parses a group, which is either a single nucleus (like "x") or an expression * in braces (like "{x+y}") * * @return {?ParseFuncOrArgument} */ Parser.prototype.parseGroup = function(pos, mode) { var start = this.lexer.lex(pos, mode); // Try to parse an open brace if (start.text === "{") { // If we get a brace, parse an expression var expression = this.parseExpression(start.position, mode, false, "}"); // Make sure we get a close brace var closeBrace = this.lexer.lex(expression.position, mode); this.expect(closeBrace, "}"); return new ParseFuncOrArgument( new ParseResult( new ParseNode("ordgroup", expression.result, mode), closeBrace.position), false); } else { // Otherwise, just return a nucleus return this.parseSymbol(pos, mode); } }; /** * Parses a group, which is an expression in brackets (like "[x+y]") * * @return {?ParseFuncOrArgument} */ Parser.prototype.parseOptionalGroup = function(pos, mode) { var start = this.lexer.lex(pos, mode); // Try to parse an open bracket if (start.text === "[") { // If we get a brace, parse an expression var expression = this.parseExpression(start.position, mode, false, "]"); // Make sure we get a close bracket var closeBracket = this.lexer.lex(expression.position, mode); this.expect(closeBracket, "]"); return new ParseFuncOrArgument( new ParseResult( new ParseNode("ordgroup", expression.result, mode), closeBracket.position), false); } else { // Otherwise, return null, return null; } }; /** * Parse a single symbol out of the string. Here, we handle both the functions * we have defined, as well as the single character symbols * * @return {?ParseFuncOrArgument} */ Parser.prototype.parseSymbol = function(pos, mode) { var nucleus = this.lexer.lex(pos, mode); if (functions.funcs[nucleus.text]) { // If there is a function with this name, we use its data var func = functions.funcs[nucleus.text]; // Here, we replace "original" argTypes with the current mode var argTypes = func.argTypes; if (argTypes) { argTypes = argTypes.slice(); for (var i = 0; i < argTypes.length; i++) { if (argTypes[i] === "original") { argTypes[i] = mode; } } } return new ParseFuncOrArgument( new ParseResult(nucleus.text, nucleus.position), true, func.allowedInText, func.numArgs, func.numOptionalArgs, argTypes); } else if (symbols[mode][nucleus.text]) { // Otherwise if this is a no-argument function, find the type it // corresponds to in the symbols map return new ParseFuncOrArgument( new ParseResult( new ParseNode(symbols[mode][nucleus.text].group, nucleus.text, mode), nucleus.position), false); } else { return null; } }; module.exports = Parser;