/*
 Copyright 2010 Google Inc.
 Copyright 2017-2018 Craig Barnes
 Copyright 2018 Stephen Checkoway

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     https://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/

/*
 Coding conventions specific to this file:

 1. Functions that fill in a token should be named emit_*, and should be
    followed immediately by a return from the tokenizer.
 2. Functions that shuffle data from temporaries to final API structures
    should be named finish_*, and be called just before the tokenizer exits
    the state that accumulates the temporary.
 3. All internal data structures should be kept in an initialized state from
    tokenizer creation onwards, ready to accept input. When a buffer's flushed
    and reset, it should be deallocated and immediately reinitialized.
 4. Make sure there are appropriate break statements following each state.
 5. Assertions on the state of the temporary and tag buffers are usually a
    good idea, and should go at the entry point of each state when added.
 6. Statement order within states goes:
    1. Add parse errors, if appropriate.
    2. Call finish_* functions to build up tag state.
    3. Switch to new state. Set _reconsume flag if appropriate.
    4. Perform any other temporary buffer manipulation.
    5. Emit tokens.
    6. Return/break.
    This order ensures that we can verify that every emit is followed by a
    return, ensures that the correct state is recorded with any parse errors,
    and prevents parse error position from being messed up by possible
    mark/resets in temporary buffer manipulation.
*/ #include #include #include "tokenizer.h" #include "ascii.h" #include "attribute.h" #include "char_ref.h" #include "error.h" #include "nokogiri_gumbo.h" #include "parser.h" #include "string_buffer.h" #include "token_type.h" #include "tokenizer_states.h" #include "utf8.h" #include "util.h" #include "vector.h" // Compared against _temporary_buffer to determine if we're in // double-escaped script mode. static const GumboStringPiece kScriptTag = {.data = "script", .length = 6}; // An enum for the return value of each individual state. Each of the emit_* // functions should return EMIT_TOKEN and should be called as // return emit_foo(parser, ..., output); // Each of the handle_*_state functions that do not return emit_* should // instead return CONTINUE to indicate to gumbo_lex to continue lexing. typedef enum { EMIT_TOKEN, CONTINUE, } StateResult; // This is a struct containing state necessary to build up a tag token, // character by character. typedef struct GumboInternalTagState { // A buffer to accumulate characters for various GumboStringPiece fields. GumboStringBuffer _buffer; // A pointer to the start of the original text corresponding to the contents // of the buffer. const char* _original_text; // The current tag enum, computed once the tag name state has finished so that // the buffer can be re-used for building up attributes. GumboTag _tag; // The current tag name. It's set at the same time that _tag is set if _tag // is set to GUMBO_TAG_UNKNOWN. char *_name; // The starting location of the text in the buffer. GumboSourcePosition _start_pos; // The current list of attributes. This is copied (and ownership of its data // transferred) to the GumboStartTag token upon completion of the tag. New // attributes are added as soon as their attribute name state is complete, and // values are filled in by operating on _attributes.data[attributes.length-1]. 
GumboVector /* GumboAttribute */ _attributes; // If true, the next attribute value to be finished should be dropped. This // happens if a duplicate attribute name is encountered - we want to consume // the attribute value, but shouldn't overwrite the existing value. bool _drop_next_attr_value; // The last start tag to have been emitted by the tokenizer. This is // necessary to check for appropriate end tags. GumboTag _last_start_tag; // If true, then this is a start tag. If false, it's an end tag. This is // necessary to generate the appropriate token type at tag-closing time. bool _is_start_tag; // If true, then this tag is "self-closing" and doesn't have an end tag. bool _is_self_closing; } GumboTagState; // This is the main tokenizer state struct, containing all state used by in // tokenizing the input stream. typedef struct GumboInternalTokenizerState { // The current lexer state. Starts in GUMBO_LEX_DATA. GumboTokenizerEnum _state; // A flag indicating whether the current input character needs to reconsumed // in another state, or whether the next input character should be read for // the next iteration of the state loop. This is set when the spec reads // "Reconsume the current input character in..." bool _reconsume_current_input; // A flag indicating whether the adjusted current node is a foreign element. // This is set by gumbo_tokenizer_set_is_adjusted_current_node_foreign and // checked in the markup declaration state. bool _is_adjusted_current_node_foreign; // A flag indicating whether the tokenizer is in a CDATA section. If so, then // text tokens emitted will be GUMBO_TOKEN_CDATA. bool _is_in_cdata; // Certain states (notably character references) may emit two character tokens // at once, but the contract for lex() fills in only one token at a time. The // extra character is buffered here, and then this is checked on entry to // lex(). If a character is stored here, it's immediately emitted and control // returns from the lexer. 
kGumboNoChar is used to represent 'no character // stored.' // // Note that characters emitted through this mechanism will have their source // position marked as the character under the mark, i.e. multiple characters // may be emitted with the same position. This is desirable for character // references, but unsuitable for many other cases. Use the _temporary_buffer // mechanism if the buffered characters must have their original positions in // the document. int _buffered_emit_char; // A temporary buffer to accumulate characters, as described by the "temporary // buffer" phrase in the tokenizer spec. We use this in a somewhat unorthodox // way: In situations where the spec calls for inserting characters into the // temporary buffer that exactly match the input in order to emit them as // character tokens, we don't actually do it. // Instead, we mark the input and reset the input to it using set_mark() and // emit_from_mark(). We do use the temporary buffer for other uses such as // DOCTYPEs, comments, and detecting escaped