/* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved. * Use of this file is governed by the BSD 3-clause license that * can be found in the LICENSE.txt file in the project root. */ #pragma once #include "Recognizer.h" #include "TokenSource.h" #include "CharStream.h" #include "Token.h" namespace antlr4 { /// A lexer is recognizer that draws input symbols from a character stream. /// lexer grammars result in a subclass of this object. A Lexer object /// uses simplified match() and error recovery mechanisms in the interest /// of speed. class ANTLR4CPP_PUBLIC Lexer : public Recognizer, public TokenSource { public: static const size_t DEFAULT_MODE = 0; static const size_t MORE = static_cast(-2); static const size_t SKIP = static_cast(-3); static const size_t DEFAULT_TOKEN_CHANNEL = Token::DEFAULT_CHANNEL; static const size_t HIDDEN = Token::HIDDEN_CHANNEL; static const size_t MIN_CHAR_VALUE = 0; static const size_t MAX_CHAR_VALUE = 0x10FFFF; CharStream *_input; // Pure reference, usually from statically allocated instance. protected: /// How to create token objects. Ref> _factory; public: /// The goal of all lexer rules/methods is to create a token object. /// This is an instance variable as multiple rules may collaborate to /// create a single token. nextToken will return this object after /// matching lexer rule(s). If you subclass to allow multiple token /// emissions, then set this to the last token to be matched or /// something nonnull so that the auto token emit mechanism will not /// emit another token. // Life cycle of a token is this: // Created by emit() (via the token factory) or by action code, holding ownership of it. // Ownership is handed over to the token stream when calling nextToken(). std::unique_ptr token; /// /// What character index in the stream did the current token start at? /// Needed, for example, to get the text for current token. Set at /// the start of nextToken. /// size_t tokenStartCharIndex; /// /// The line on which the first character of the token resides size_t tokenStartLine; /// The character position of first character within the line. size_t tokenStartCharPositionInLine; /// Once we see EOF on char stream, next token will be EOF. /// If you have DONE : EOF ; then you see DONE EOF. bool hitEOF; /// The channel number for the current token. size_t channel; /// The token type for the current token. size_t type; // Use the vector as a stack. std::vector modeStack; size_t mode; Lexer(); Lexer(CharStream *input); virtual ~Lexer() {} virtual void reset(); /// Return a token from this source; i.e., match a token on the char stream. virtual std::unique_ptr nextToken() override; /// Instruct the lexer to skip creating a token for current lexer rule /// and look for another token. nextToken() knows to keep looking when /// a lexer rule finishes with token set to SKIP_TOKEN. Recall that /// if token == null at end of any token rule, it creates one for you /// and emits it. virtual void skip(); virtual void more(); virtual void setMode(size_t m); virtual void pushMode(size_t m); virtual size_t popMode(); template void setTokenFactory(TokenFactory *factory) { this->_factory = factory; } virtual Ref> getTokenFactory() override; /// Set the char stream and reset the lexer virtual void setInputStream(IntStream *input) override; virtual std::string getSourceName() override; virtual CharStream* getInputStream() override; /// By default does not support multiple emits per nextToken invocation /// for efficiency reasons. Subclasses can override this method, nextToken, /// and getToken (to push tokens into a list and pull from that list /// rather than a single variable as this implementation does). virtual void emit(std::unique_ptr newToken); /// The standard method called to automatically emit a token at the /// outermost lexical rule. The token object should point into the /// char buffer start..stop. If there is a text override in 'text', /// use that to set the token's text. Override this method to emit /// custom Token objects or provide a new factory. virtual Token* emit(); virtual Token* emitEOF(); virtual size_t getLine() const override; virtual size_t getCharPositionInLine() override; virtual void setLine(size_t line); virtual void setCharPositionInLine(size_t charPositionInLine); /// What is the index of the current character of lookahead? virtual size_t getCharIndex(); /// Return the text matched so far for the current token or any /// text override. virtual std::string getText(); /// Set the complete text of this token; it wipes any previous /// changes to the text. virtual void setText(const std::string &text); /// Override if emitting multiple tokens. virtual std::unique_ptr getToken(); virtual void setToken(std::unique_ptr newToken); virtual void setType(size_t ttype); virtual size_t getType(); virtual void setChannel(size_t newChannel); virtual size_t getChannel(); virtual const std::vector& getChannelNames() const = 0; virtual const std::vector& getModeNames() const = 0; /// Return a list of all Token objects in input char stream. /// Forces load of all tokens. Does not include EOF token. virtual std::vector> getAllTokens(); virtual void recover(const LexerNoViableAltException &e); virtual void notifyListeners(const LexerNoViableAltException &e); virtual std::string getErrorDisplay(const std::string &s); /// Lexers can normally match any char in it's vocabulary after matching /// a token, so do the easy thing and just kill a character and hope /// it all works out. You can instead use the rule invocation stack /// to do sophisticated error recovery if you are in a fragment rule. virtual void recover(RecognitionException *re); /// /// Gets the number of syntax errors reported during parsing. This value is /// incremented each time is called. /// /// virtual size_t getNumberOfSyntaxErrors(); protected: /// You can set the text for the current token to override what is in /// the input char buffer (via setText()). std::string _text; private: size_t _syntaxErrors; void InitializeInstanceFields(); }; } // namespace antlr4