/** * The Lexer class handles tokenizing the input in various ways. Since our * parser expects us to be able to backtrack, the lexer allows lexing from any * given starting point. * * Its main exposed function is the `lex` function, which takes a position to * lex from and a type of token to lex. It defers to the appropriate `_innerLex` * function. * * The various `_innerLex` functions perform the actual lexing of different * kinds. */ var ParseError = require("./ParseError"); // The main lexer class function Lexer(input) { this._input = input; } // The resulting token returned from `lex`. function Token(text, data, position) { this.text = text; this.data = data; this.position = position; } // "normal" types of tokens. These are tokens which can be matched by a simple // regex var mathNormals = [ /^[/|@.""`0-9a-zA-Z]/, // ords /^[*+-]/, // bins /^[=<>:]/, // rels /^[,;]/, // punctuation /^['\^_{}]/, // misc /^[(\[]/, // opens /^[)\]?!]/, // closes /^~/ // spacing ]; // These are "normal" tokens like above, but should instead be parsed in text // mode. var textNormals = [ /^[a-zA-Z0-9`!@*()-=+\[\]'";:?\/.,]/, // ords /^[{}]/, // grouping /^~/ // spacing ]; // Regexes for matching whitespace var whitespaceRegex = /^\s*/; var whitespaceConcatRegex = /^( +|\\ +)/; // This regex matches any other TeX function, which is a backslash followed by a // word or a single symbol var anyFunc = /^\\(?:[a-zA-Z]+|.)/; /** * This function lexes a single normal token. It takes a position, a list of * "normal" tokens to try, and whether it should completely ignore whitespace or * not. */ Lexer.prototype._innerLex = function(pos, normals, ignoreWhitespace) { var input = this._input.slice(pos); var whitespace; if (ignoreWhitespace) { // Get rid of whitespace. whitespace = input.match(whitespaceRegex)[0]; pos += whitespace.length; input = input.slice(whitespace.length); } else { // Do the funky concatenation of whitespace that happens in text mode. whitespace = input.match(whitespaceConcatRegex); if (whitespace !== null) { return new Token(" ", null, pos + whitespace[0].length); } } // If there's no more input to parse, return an EOF token if (input.length === 0) { return new Token("EOF", null, pos); } var match; if ((match = input.match(anyFunc))) { // If we match a function token, return it return new Token(match[0], null, pos + match[0].length); } else { // Otherwise, we look through the normal token regexes and see if it's // one of them. for (var i = 0; i < normals.length; i++) { var normal = normals[i]; if ((match = input.match(normal))) { // If it is, return it return new Token( match[0], null, pos + match[0].length); } } } throw new ParseError("Unexpected character: '" + input[0] + "'", this, pos); }; // A regex to match a CSS color (like #ffffff or BlueViolet) var cssColor = /^(#[a-z0-9]+|[a-z]+)/i; /** * This function lexes a CSS color. */ Lexer.prototype._innerLexColor = function(pos) { var input = this._input.slice(pos); // Ignore whitespace var whitespace = input.match(whitespaceRegex)[0]; pos += whitespace.length; input = input.slice(whitespace.length); var match; if ((match = input.match(cssColor))) { // If we look like a color, return a color return new Token(match[0], null, pos + match[0].length); } else { throw new ParseError("Invalid color", this, pos); } }; // A regex to match a dimension. Dimensions look like // "1.2em" or ".4pt" or "1 ex" var sizeRegex = /^(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/; /** * This function lexes a dimension. */ Lexer.prototype._innerLexSize = function(pos) { var input = this._input.slice(pos); // Ignore whitespace var whitespace = input.match(whitespaceRegex)[0]; pos += whitespace.length; input = input.slice(whitespace.length); var match; if ((match = input.match(sizeRegex))) { var unit = match[3]; // We only currently handle "em" and "ex" units if (unit !== "em" && unit !== "ex") { throw new ParseError("Invalid unit: '" + unit + "'", this, pos); } return new Token(match[0], { number: +(match[1] + match[2]), unit: unit }, pos + match[0].length); } throw new ParseError("Invalid size", this, pos); }; /** * This function lexes a string of whitespace. */ Lexer.prototype._innerLexWhitespace = function(pos) { var input = this._input.slice(pos); var whitespace = input.match(whitespaceRegex)[0]; pos += whitespace.length; return new Token(whitespace, null, pos); }; /** * This function lexes a single token starting at `pos` and of the given mode. * Based on the mode, we defer to one of the `_innerLex` functions. */ Lexer.prototype.lex = function(pos, mode) { if (mode === "math") { return this._innerLex(pos, mathNormals, true); } else if (mode === "text") { return this._innerLex(pos, textNormals, false); } else if (mode === "color") { return this._innerLexColor(pos); } else if (mode === "size") { return this._innerLexSize(pos); } else if (mode === "whitespace") { return this._innerLexWhitespace(pos); } }; module.exports = Lexer;