/* Copyright 2017-2018 Craig Barnes. Copyright 2010 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include #include #include #include #include #include "ascii.h" #include "attribute.h" #include "error.h" #include "gumbo.h" #include "insertion_mode.h" #include "macros.h" #include "parser.h" #include "replacement.h" #include "tokenizer.h" #include "tokenizer_states.h" #include "token_buffer.h" #include "utf8.h" #include "util.h" #include "vector.h" typedef uint8_t TagSet[GUMBO_TAG_LAST + 1]; #define TAG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_HTML) #define TAG_SVG(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_SVG) #define TAG_MATHML(tag) [GUMBO_TAG_##tag] = (1 << GUMBO_NAMESPACE_MATHML) #define GUMBO_EMPTY_SOURCE_POSITION_INIT { .line = 0, .column = 0, .offset = 0 } #define kGumboEmptySourcePosition (const GumboSourcePosition) \ GUMBO_EMPTY_SOURCE_POSITION_INIT const GumboOptions kGumboDefaultOptions = { .tab_stop = 8, .stop_on_first_error = false, .max_attributes = 400, .max_tree_depth = 400, .max_errors = -1, .fragment_context = NULL, .fragment_namespace = GUMBO_NAMESPACE_HTML, .fragment_encoding = NULL, .quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS, .fragment_context_has_form_ancestor = false, }; #define STRING(s) {.data = s, .length = sizeof(s) - 1} #define TERMINATOR {.data = NULL, .length = 0} // The doctype arrays have an explicit terminator because we want to pass them // to a helper function, and passing them as a pointer discards sizeof // information. The SVG arrays are used only by one-off functions, and so loops // over them use sizeof directly instead of a terminator. static const GumboStringPiece kQuirksModePublicIdPrefixes[] = { STRING("+//Silmaril//dtd html Pro v0r11 19970101//"), STRING("-//AdvaSoft Ltd//DTD HTML 3.0 asWedit + extensions//"), STRING("-//AS//DTD HTML 3.0 asWedit + extensions//"), STRING("-//IETF//DTD HTML 2.0 Level 1//"), STRING("-//IETF//DTD HTML 2.0 Level 2//"), STRING("-//IETF//DTD HTML 2.0 Strict Level 1//"), STRING("-//IETF//DTD HTML 2.0 Strict Level 2//"), STRING("-//IETF//DTD HTML 2.0 Strict//"), STRING("-//IETF//DTD HTML 2.0//"), STRING("-//IETF//DTD HTML 2.1E//"), STRING("-//IETF//DTD HTML 3.0//"), STRING("-//IETF//DTD HTML 3.2 Final//"), STRING("-//IETF//DTD HTML 3.2//"), STRING("-//IETF//DTD HTML 3//"), STRING("-//IETF//DTD HTML Level 0//"), STRING("-//IETF//DTD HTML Level 1//"), STRING("-//IETF//DTD HTML Level 2//"), STRING("-//IETF//DTD HTML Level 3//"), STRING("-//IETF//DTD HTML Strict Level 0//"), STRING("-//IETF//DTD HTML Strict Level 1//"), STRING("-//IETF//DTD HTML Strict Level 2//"), STRING("-//IETF//DTD HTML Strict Level 3//"), STRING("-//IETF//DTD HTML Strict//"), STRING("-//IETF//DTD HTML//"), STRING("-//Metrius//DTD Metrius Presentational//"), STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//"), STRING("-//Microsoft//DTD Internet Explorer 2.0 HTML//"), STRING("-//Microsoft//DTD Internet Explorer 2.0 Tables//"), STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//"), STRING("-//Microsoft//DTD Internet Explorer 3.0 HTML//"), STRING("-//Microsoft//DTD Internet Explorer 3.0 Tables//"), STRING("-//Netscape Comm. Corp.//DTD HTML//"), STRING("-//Netscape Comm. Corp.//DTD Strict HTML//"), STRING("-//O'Reilly and Associates//DTD HTML 2.0//"), STRING("-//O'Reilly and Associates//DTD HTML Extended 1.0//"), STRING("-//O'Reilly and Associates//DTD HTML Extended Relaxed 1.0//"), STRING( "-//SoftQuad Software//DTD HoTMetaL PRO 6.0::19990601::)" "extensions to HTML 4.0//"), STRING( "-//SoftQuad//DTD HoTMetaL PRO 4.0::19971010::" "extensions to HTML 4.0//"), STRING("-//Spyglass//DTD HTML 2.0 Extended//"), STRING("-//SQ//DTD HTML 2.0 HoTMetaL + extensions//"), STRING("-//Sun Microsystems Corp.//DTD HotJava HTML//"), STRING("-//Sun Microsystems Corp.//DTD HotJava Strict HTML//"), STRING("-//W3C//DTD HTML 3 1995-03-24//"), STRING("-//W3C//DTD HTML 3.2 Draft//"), STRING("-//W3C//DTD HTML 3.2 Final//"), STRING("-//W3C//DTD HTML 3.2//"), STRING("-//W3C//DTD HTML 3.2S Draft//"), STRING("-//W3C//DTD HTML 4.0 Frameset//"), STRING("-//W3C//DTD HTML 4.0 Transitional//"), STRING("-//W3C//DTD HTML Experimental 19960712//"), STRING("-//W3C//DTD HTML Experimental 970421//"), STRING("-//W3C//DTD W3 HTML//"), STRING("-//W3O//DTD W3 HTML 3.0//"), STRING("-//WebTechs//DTD Mozilla HTML 2.0//"), STRING("-//WebTechs//DTD Mozilla HTML//"), TERMINATOR }; static const GumboStringPiece kQuirksModePublicIdExactMatches[] = { STRING("-//W3O//DTD W3 HTML Strict 3.0//EN//"), STRING("-/W3C/DTD HTML 4.0 Transitional/EN"), STRING("HTML"), TERMINATOR }; static const GumboStringPiece kQuirksModeSystemIdExactMatches[] = { STRING("http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"), TERMINATOR }; static const GumboStringPiece kLimitedQuirksPublicIdPrefixes[] = { STRING("-//W3C//DTD XHTML 1.0 Frameset//"), STRING("-//W3C//DTD XHTML 1.0 Transitional//"), TERMINATOR }; static const GumboStringPiece kSystemIdDependentPublicIdPrefixes[] = { STRING("-//W3C//DTD HTML 4.01 Frameset//"), STRING("-//W3C//DTD HTML 4.01 Transitional//"), TERMINATOR }; // Indexed by GumboNamespaceEnum; keep in sync with that. static const char* kLegalXmlns[] = { "http://www.w3.org/1999/xhtml", "http://www.w3.org/2000/svg", "http://www.w3.org/1998/Math/MathML" }; // The "scope marker" for the list of active formatting elements. We use a // pointer to this as a generic marker element, since the particular element // scope doesn't matter. static const GumboNode kActiveFormattingScopeMarker; // The tag_is and tag_in function use true & false to denote start & end tags, // but for readability, we define constants for them here. static const bool kStartTag = true; static const bool kEndTag = false; // Because GumboStringPieces are immutable, we can't insert a character directly // into a text node. Instead, we accumulate all pending characters here and // flush them out to a text node whenever a new element is inserted. // // https://html.spec.whatwg.org/multipage/parsing.html#insert-a-character typedef struct _TextNodeBufferState { // The accumulated text to be inserted into the current text node. GumboStringBuffer _buffer; // A pointer to the original text represented by this text node. Note that // because of foster parenting and other strange DOM manipulations, this may // include other non-text HTML tags in it; it is defined as the span of // original text from the first character in this text node to the last // character in this text node. const char* _start_original_text; // The source position of the start of this text node. GumboSourcePosition _start_position; // The type of node that will be inserted (TEXT, CDATA, or WHITESPACE). GumboNodeType _type; } TextNodeBufferState; typedef struct GumboInternalParserState { // https://html.spec.whatwg.org/multipage/parsing.html#insertion-mode GumboInsertionMode _insertion_mode; // Used for run_generic_parsing_algorithm, which needs to switch back to the // original insertion mode at its conclusion. GumboInsertionMode _original_insertion_mode; // https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements GumboVector /*GumboNode*/ _open_elements; // https://html.spec.whatwg.org/multipage/parsing.html#the-list-of-active-formatting-elements GumboVector /*GumboNode*/ _active_formatting_elements; // The stack of template insertion modes. // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode GumboVector /*InsertionMode*/ _template_insertion_modes; // https://html.spec.whatwg.org/multipage/parsing.html#the-element-pointers GumboNode* _head_element; GumboNode* _form_element; // The element used as fragment context when parsing in fragment mode GumboNode* _fragment_ctx; // The flag for when the spec says "Reprocess the current token in..." bool _reprocess_current_token; // The flag for "acknowledge the token's self-closing flag". bool _self_closing_flag_acknowledged; // The "frameset-ok" flag from the spec. bool _frameset_ok; // The flag for "If the next token is a LINE FEED, ignore that token...". bool _ignore_next_linefeed; // The flag for "whenever a node would be inserted into the current node, it // must instead be foster parented". This is used for misnested table // content, which needs to be handled according to "in body" rules yet foster // parented outside of the table. // It would perhaps be more explicit to have this as a parameter to // handle_in_body and insert_element, but given how special-purpose this is // and the number of call-sites that would need to take the extra parameter, // it's easier just to have a state flag. bool _foster_parent_insertions; // The accumulated text node buffer state. TextNodeBufferState _text_node; // The accumulated character tokens in tables for error purposes. GumboCharacterTokenBuffer _table_character_tokens; // The current token. GumboToken* _current_token; // The way that the spec is written, the and tags are *always* // implicit, because encountering one of those tokens merely switches the // insertion mode out of "in body". So we have individual state flags for // those end tags that are then inspected by pop_current_node when the // and nodes are popped to set the GUMBO_INSERTION_IMPLICIT_END_TAG // flag appropriately. bool _closed_body_tag; bool _closed_html_tag; } GumboParserState; static bool token_has_attribute(const GumboToken* token, const char* name) { assert(token->type == GUMBO_TOKEN_START_TAG); return gumbo_get_attribute(&token->v.start_tag.attributes, name) != NULL; } // Checks if the value of the specified attribute is a case-insensitive match // for the specified string. static bool attribute_matches ( const GumboVector* attributes, const char* name, const char* value ) { const GumboAttribute* attr = gumbo_get_attribute(attributes, name); return attr ? gumbo_ascii_strcasecmp(value, attr->value) == 0 : false; } // Checks if the value of the specified attribute is a case-sensitive match // for the specified string. static bool attribute_matches_case_sensitive ( const GumboVector* attributes, const char* name, const char* value ) { const GumboAttribute* attr = gumbo_get_attribute(attributes, name); return attr ? strcmp(value, attr->value) == 0 : false; } // Checks if the specified attribute vectors are identical. static bool all_attributes_match ( const GumboVector* attr1, const GumboVector* attr2 ) { unsigned int num_unmatched_attr2_elements = attr2->length; for (unsigned int i = 0; i < attr1->length; ++i) { const GumboAttribute* attr = attr1->data[i]; if (attribute_matches_case_sensitive(attr2, attr->name, attr->value)) { --num_unmatched_attr2_elements; } else { return false; } } return num_unmatched_attr2_elements == 0; } static void set_frameset_not_ok(GumboParser* parser) { gumbo_debug("Setting frameset_ok to false.\n"); parser->_parser_state->_frameset_ok = false; } static GumboNode* create_node(GumboNodeType type) { GumboNode* node = gumbo_alloc(sizeof(GumboNode)); node->parent = NULL; node->index_within_parent = -1; node->type = type; node->parse_flags = GUMBO_INSERTION_NORMAL; return node; } static GumboNode* new_document_node() { GumboNode* document_node = create_node(GUMBO_NODE_DOCUMENT); document_node->parse_flags = GUMBO_INSERTION_BY_PARSER; gumbo_vector_init(1, &document_node->v.document.children); // Must be initialized explicitly, as there's no guarantee that we'll see a // doc type token. GumboDocument* document = &document_node->v.document; document->has_doctype = false; document->name = NULL; document->public_identifier = NULL; document->system_identifier = NULL; document->doc_type_quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS; return document_node; } static void output_init(GumboParser* parser) { GumboOutput* output = gumbo_alloc(sizeof(GumboOutput)); output->root = NULL; output->document = new_document_node(); output->document_error = false; output->status = GUMBO_STATUS_OK; parser->_output = output; gumbo_init_errors(parser); } static void parser_state_init(GumboParser* parser) { GumboParserState* parser_state = gumbo_alloc(sizeof(GumboParserState)); parser_state->_insertion_mode = GUMBO_INSERTION_MODE_INITIAL; parser_state->_reprocess_current_token = false; parser_state->_frameset_ok = true; parser_state->_ignore_next_linefeed = false; parser_state->_foster_parent_insertions = false; parser_state->_text_node._type = GUMBO_NODE_WHITESPACE; gumbo_string_buffer_init(&parser_state->_text_node._buffer); gumbo_character_token_buffer_init(&parser_state->_table_character_tokens); gumbo_vector_init(10, &parser_state->_open_elements); gumbo_vector_init(5, &parser_state->_active_formatting_elements); gumbo_vector_init(5, &parser_state->_template_insertion_modes); parser_state->_head_element = NULL; parser_state->_form_element = NULL; parser_state->_fragment_ctx = NULL; parser_state->_current_token = NULL; parser_state->_closed_body_tag = false; parser_state->_closed_html_tag = false; parser->_parser_state = parser_state; } typedef void (*TreeTraversalCallback)(GumboNode* node); static void tree_traverse(GumboNode* node, TreeTraversalCallback callback) { GumboNode* current_node = node; unsigned int offset = 0; tailcall: switch (current_node->type) { case GUMBO_NODE_DOCUMENT: case GUMBO_NODE_TEMPLATE: case GUMBO_NODE_ELEMENT: { GumboVector* children = (current_node->type == GUMBO_NODE_DOCUMENT) ? ¤t_node->v.document.children : ¤t_node->v.element.children ; if (offset >= children->length) { assert(offset == children->length); break; } else { current_node = children->data[offset]; offset = 0; goto tailcall; } } case GUMBO_NODE_TEXT: case GUMBO_NODE_CDATA: case GUMBO_NODE_COMMENT: case GUMBO_NODE_WHITESPACE: assert(offset == 0); break; } offset = current_node->index_within_parent + 1; GumboNode* next_node = current_node->parent; callback(current_node); if (current_node == node) { return; } current_node = next_node; goto tailcall; } static void destroy_node_callback(GumboNode* node) { switch (node->type) { case GUMBO_NODE_DOCUMENT: { GumboDocument* doc = &node->v.document; gumbo_free((void*) doc->children.data); gumbo_free((void*) doc->name); gumbo_free((void*) doc->public_identifier); gumbo_free((void*) doc->system_identifier); } break; case GUMBO_NODE_TEMPLATE: case GUMBO_NODE_ELEMENT: for (unsigned int i = 0; i < node->v.element.attributes.length; ++i) { gumbo_destroy_attribute(node->v.element.attributes.data[i]); } gumbo_free(node->v.element.attributes.data); gumbo_free(node->v.element.children.data); if (node->v.element.tag == GUMBO_TAG_UNKNOWN) gumbo_free((void *)node->v.element.name); break; case GUMBO_NODE_TEXT: case GUMBO_NODE_CDATA: case GUMBO_NODE_COMMENT: case GUMBO_NODE_WHITESPACE: gumbo_free((void*) node->v.text.text); break; } gumbo_free(node); } static void destroy_node(GumboNode* node) { tree_traverse(node, &destroy_node_callback); } static void destroy_fragment_ctx_element(GumboNode* ctx); static void parser_state_destroy(GumboParser* parser) { GumboParserState* state = parser->_parser_state; if (state->_fragment_ctx) { destroy_fragment_ctx_element(state->_fragment_ctx); } gumbo_vector_destroy(&state->_active_formatting_elements); gumbo_vector_destroy(&state->_open_elements); gumbo_vector_destroy(&state->_template_insertion_modes); gumbo_string_buffer_destroy(&state->_text_node._buffer); gumbo_character_token_buffer_destroy(&state->_table_character_tokens); gumbo_free(state); } static GumboNode* get_document_node(const GumboParser* parser) { return parser->_output->document; } static bool is_fragment_parser(const GumboParser* parser) { return !!parser->_parser_state->_fragment_ctx; } // Returns the node at the bottom of the stack of open elements, or NULL if no // elements have been added yet. static GumboNode* get_current_node(const GumboParser* parser) { const GumboVector* open_elements = &parser->_parser_state->_open_elements; if (open_elements->length == 0) { assert(!parser->_output->root); return NULL; } assert(open_elements->length > 0); assert(open_elements->data != NULL); return open_elements->data[open_elements->length - 1]; } static GumboNode* get_adjusted_current_node(const GumboParser* parser) { const GumboParserState* state = parser->_parser_state; if (state->_open_elements.length == 1 && state->_fragment_ctx) { return state->_fragment_ctx; } return get_current_node(parser); } // Returns true if the given needle is in the given array of literal // GumboStringPieces. If exact_match is true, this requires that they match // exactly; otherwise, this performs a prefix match to check if any of the // elements in haystack start with needle. This always performs a // case-insensitive match. static bool is_in_static_list ( const GumboStringPiece* needle, const GumboStringPiece* haystack, bool exact_match ) { if (needle->length == 0) return false; if (exact_match) { for (size_t i = 0; haystack[i].data; ++i) { if (gumbo_string_equals_ignore_case(needle, &haystack[i])) return true; } } else { for (size_t i = 0; haystack[i].data; ++i) { if (gumbo_string_prefix_ignore_case(&haystack[i], needle)) return true; } } return false; } static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) { parser->_parser_state->_insertion_mode = mode; } static void push_template_insertion_mode ( GumboParser* parser, GumboInsertionMode mode ) { gumbo_vector_add ( (void*) mode, &parser->_parser_state->_template_insertion_modes ); } static void pop_template_insertion_mode(GumboParser* parser) { gumbo_vector_pop(&parser->_parser_state->_template_insertion_modes); } // Returns the current template insertion mode. If the stack of template // insertion modes is empty, this returns GUMBO_INSERTION_MODE_INITIAL. static GumboInsertionMode get_current_template_insertion_mode ( const GumboParser* parser ) { GumboVector* modes = &parser->_parser_state->_template_insertion_modes; if (modes->length == 0) { return GUMBO_INSERTION_MODE_INITIAL; } return (GumboInsertionMode) modes->data[(modes->length - 1)]; } // Returns true if the specified token is either a start or end tag // (specified by is_start) with one of the tag types in the TagSet. static bool tag_in ( const GumboToken* token, bool is_start, const TagSet* tags ) { GumboTag token_tag; if (is_start && token->type == GUMBO_TOKEN_START_TAG) { token_tag = token->v.start_tag.tag; } else if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { token_tag = token->v.end_tag.tag; } else { return false; } return (*tags)[(unsigned) token_tag] != 0u; } // Like tag_in, but for the single-tag case. static bool tag_is(const GumboToken* token, bool is_start, GumboTag tag) { if (is_start && token->type == GUMBO_TOKEN_START_TAG) { return token->v.start_tag.tag == tag; } if (!is_start && token->type == GUMBO_TOKEN_END_TAG) { return token->v.end_tag.tag == tag; } return false; } static inline bool tagset_includes ( const TagSet* tagset, GumboNamespaceEnum ns, GumboTag tag ) { return ((*tagset)[(unsigned) tag] & (1u << (unsigned) ns)) != 0u; } // Like tag_in, but checks for the tag of a node, rather than a token. static bool node_tag_in_set(const GumboNode* node, const TagSet* tags) { assert(node != NULL); if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) { return false; } return tagset_includes ( tags, node->v.element.tag_namespace, node->v.element.tag ); } static bool node_qualified_tagname_is ( const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag, const char *name ) { assert(node); assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); assert(node->v.element.name); assert(tag != GUMBO_TAG_UNKNOWN || name); GumboTag element_tag = node->v.element.tag; const char *element_name = node->v.element.name; assert(element_tag != GUMBO_TAG_UNKNOWN || element_name); if (node->v.element.tag_namespace != ns || element_tag != tag) return false; if (tag != GUMBO_TAG_UNKNOWN) return true; return !gumbo_ascii_strcasecmp(element_name, name); } static bool node_html_tagname_is ( const GumboNode* node, GumboTag tag, const char *name ) { return node_qualified_tagname_is(node, GUMBO_NAMESPACE_HTML, tag, name); } static bool node_tagname_is ( const GumboNode* node, GumboTag tag, const char *name ) { assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); return node_qualified_tagname_is(node, node->v.element.tag_namespace, tag, name); } // Like node_tag_in, but for the single-tag case. static bool node_qualified_tag_is ( const GumboNode* node, GumboNamespaceEnum ns, GumboTag tag ) { assert(node); assert(tag != GUMBO_TAG_UNKNOWN); assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); return node->v.element.tag == tag && node->v.element.tag_namespace == ns; } // Like node_tag_in, but for the single-tag case in the HTML namespace static bool node_html_tag_is(const GumboNode* node, GumboTag tag) { return node_qualified_tag_is(node, GUMBO_NAMESPACE_HTML, tag); } // https://html.spec.whatwg.org/multipage/parsing.html#reset-the-insertion-mode-appropriately // This is a helper function that returns the appropriate insertion mode instead // of setting it. Returns GUMBO_INSERTION_MODE_INITIAL as a sentinel value to // indicate that there is no appropriate insertion mode, and the loop should // continue. static GumboInsertionMode get_appropriate_insertion_mode ( const GumboParser* parser, int index ) { const GumboVector* open_elements = &parser->_parser_state->_open_elements; const GumboNode* node = open_elements->data[index]; const bool is_last = index == 0; if (is_last && is_fragment_parser(parser)) { node = parser->_parser_state->_fragment_ctx; } assert(node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE); if (node->v.element.tag_namespace != GUMBO_NAMESPACE_HTML) { return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; } switch (node->v.element.tag) { case GUMBO_TAG_SELECT: { if (is_last) { return GUMBO_INSERTION_MODE_IN_SELECT; } for (int i = index; i > 0; --i) { const GumboNode* ancestor = open_elements->data[i]; if (node_html_tag_is(ancestor, GUMBO_TAG_TEMPLATE)) { return GUMBO_INSERTION_MODE_IN_SELECT; } if (node_html_tag_is(ancestor, GUMBO_TAG_TABLE)) { return GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE; } } return GUMBO_INSERTION_MODE_IN_SELECT; } case GUMBO_TAG_TD: case GUMBO_TAG_TH: if (!is_last) return GUMBO_INSERTION_MODE_IN_CELL; break; case GUMBO_TAG_TR: return GUMBO_INSERTION_MODE_IN_ROW; case GUMBO_TAG_TBODY: case GUMBO_TAG_THEAD: case GUMBO_TAG_TFOOT: return GUMBO_INSERTION_MODE_IN_TABLE_BODY; case GUMBO_TAG_CAPTION: return GUMBO_INSERTION_MODE_IN_CAPTION; case GUMBO_TAG_COLGROUP: return GUMBO_INSERTION_MODE_IN_COLUMN_GROUP; case GUMBO_TAG_TABLE: return GUMBO_INSERTION_MODE_IN_TABLE; case GUMBO_TAG_TEMPLATE: return get_current_template_insertion_mode(parser); case GUMBO_TAG_HEAD: if (!is_last) return GUMBO_INSERTION_MODE_IN_HEAD; break; case GUMBO_TAG_BODY: return GUMBO_INSERTION_MODE_IN_BODY; case GUMBO_TAG_FRAMESET: return GUMBO_INSERTION_MODE_IN_FRAMESET; case GUMBO_TAG_HTML: return parser->_parser_state->_head_element ? GUMBO_INSERTION_MODE_AFTER_HEAD : GUMBO_INSERTION_MODE_BEFORE_HEAD; default: break; } return is_last ? GUMBO_INSERTION_MODE_IN_BODY : GUMBO_INSERTION_MODE_INITIAL; } // This performs the actual "reset the insertion mode" loop. static void reset_insertion_mode_appropriately(GumboParser* parser) { const GumboVector* open_elements = &parser->_parser_state->_open_elements; for (int i = open_elements->length; --i >= 0;) { GumboInsertionMode mode = get_appropriate_insertion_mode(parser, i); if (mode != GUMBO_INSERTION_MODE_INITIAL) { set_insertion_mode(parser, mode); return; } } // Should never get here, because is_last will be set on the last iteration // and will force GUMBO_INSERTION_MODE_IN_BODY. assert(0); } static void parser_add_parse_error ( GumboParser* parser, const GumboToken* token ) { gumbo_debug("Adding parse error.\n"); GumboError* error = gumbo_add_error(parser); if (!error) { return; } error->type = GUMBO_ERR_PARSER; error->position = token->position; error->original_text = token->original_text; GumboParserError* extra_data = &error->v.parser; extra_data->input_type = token->type; extra_data->input_tag = GUMBO_TAG_UNKNOWN; if (token->type == GUMBO_TOKEN_START_TAG) { extra_data->input_tag = token->v.start_tag.tag; } else if (token->type == GUMBO_TOKEN_END_TAG) { extra_data->input_tag = token->v.end_tag.tag; } const GumboParserState* state = parser->_parser_state; extra_data->parser_state = state->_insertion_mode; gumbo_vector_init(state->_open_elements.length, &extra_data->tag_stack); for (unsigned int i = 0; i < state->_open_elements.length; ++i) { const GumboNode* node = state->_open_elements.data[i]; assert ( node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE ); gumbo_vector_add ( (void*) node->v.element.tag, &extra_data->tag_stack ); } } // https://html.spec.whatwg.org/multipage/parsing.html#mathml-text-integration-point static bool is_mathml_integration_point(const GumboNode* node) { static const TagSet mathml_integration_point_tags = { TAG_MATHML(MI), TAG_MATHML(MO), TAG_MATHML(MN), TAG_MATHML(MS), TAG_MATHML(MTEXT) }; return node_tag_in_set(node, &mathml_integration_point_tags); } // https://html.spec.whatwg.org/multipage/parsing.html#html-integration-point static bool is_html_integration_point(const GumboNode* node) { static const TagSet html_integration_point_svg_tags = { TAG_SVG(FOREIGNOBJECT), TAG_SVG(DESC), TAG_SVG(TITLE) }; if (node_tag_in_set(node, &html_integration_point_svg_tags)) { return true; } const bool is_mathml_annotation_xml_element = node_qualified_tag_is ( node, GUMBO_NAMESPACE_MATHML, GUMBO_TAG_ANNOTATION_XML ); const GumboVector* attributes = &node->v.element.attributes; if ( is_mathml_annotation_xml_element && ( attribute_matches(attributes, "encoding", "text/html") || attribute_matches(attributes, "encoding", "application/xhtml+xml") ) ) { return true; } return false; } // This represents a place to insert a node, consisting of a target parent and a // child index within that parent. If the node should be inserted at the end of // the parent's child, index will be -1. typedef struct { GumboNode* target; int index; } InsertionLocation; static InsertionLocation get_appropriate_insertion_location ( const GumboParser* parser, GumboNode* override_target ) { InsertionLocation retval = {override_target, -1}; if (retval.target == NULL) { // No override target; default to the current node, but special-case the // root node since get_current_node() assumes the stack of open elements is // non-empty. retval.target = (parser->_output->root != NULL) ? get_current_node(parser) : get_document_node(parser) ; } if ( !parser->_parser_state->_foster_parent_insertions || !node_tag_in_set(retval.target, &(const TagSet) { TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR) }) ) { return retval; } // Foster-parenting case. int last_template_index = -1; int last_table_index = -1; const GumboVector* open_elements = &parser->_parser_state->_open_elements; for (unsigned int i = 0; i < open_elements->length; ++i) { if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TEMPLATE)) { last_template_index = i; } if (node_html_tag_is(open_elements->data[i], GUMBO_TAG_TABLE)) { last_table_index = i; } } if ( last_template_index != -1 && (last_table_index == -1 || last_template_index > last_table_index) ) { retval.target = open_elements->data[last_template_index]; return retval; } if (last_table_index == -1) { retval.target = open_elements->data[0]; return retval; } const GumboNode* last_table = open_elements->data[last_table_index]; if (last_table->parent != NULL) { retval.target = last_table->parent; retval.index = last_table->index_within_parent; return retval; } retval.target = open_elements->data[last_table_index - 1]; return retval; } // Appends a node to the end of its parent, setting the "parent" and // "index_within_parent" fields appropriately. static void append_node(GumboNode* parent, GumboNode* node) { assert(node->parent == NULL); assert(node->index_within_parent == (unsigned int) -1); GumboVector* children; if ( parent->type == GUMBO_NODE_ELEMENT || parent->type == GUMBO_NODE_TEMPLATE ) { children = &parent->v.element.children; } else { assert(parent->type == GUMBO_NODE_DOCUMENT); children = &parent->v.document.children; } node->parent = parent; node->index_within_parent = children->length; gumbo_vector_add((void*) node, children); assert(node->index_within_parent < children->length); } // Inserts a node at the specified InsertionLocation, updating the // "parent" and "index_within_parent" fields of it and all its siblings. // If the index of the location is -1, this calls append_node. static void insert_node(GumboNode* node, InsertionLocation location) { assert(node->parent == NULL); assert(node->index_within_parent == (unsigned int) -1); GumboNode* parent = location.target; int index = location.index; if (index != -1) { GumboVector* children = NULL; if ( parent->type == GUMBO_NODE_ELEMENT || parent->type == GUMBO_NODE_TEMPLATE ) { children = &parent->v.element.children; } else if (parent->type == GUMBO_NODE_DOCUMENT) { children = &parent->v.document.children; assert(children->length == 0); } else { assert(0); } assert(index >= 0); assert((unsigned int) index < children->length); node->parent = parent; node->index_within_parent = index; gumbo_vector_insert_at((void*) node, index, children); assert(node->index_within_parent < children->length); for (unsigned int i = index + 1; i < children->length; ++i) { GumboNode* sibling = children->data[i]; sibling->index_within_parent = i; assert(sibling->index_within_parent < children->length); } } else { append_node(parent, node); } } static void maybe_flush_text_node_buffer(GumboParser* parser) { GumboParserState* state = parser->_parser_state; TextNodeBufferState* buffer_state = &state->_text_node; if (buffer_state->_buffer.length == 0) { return; } assert ( buffer_state->_type == GUMBO_NODE_WHITESPACE || buffer_state->_type == GUMBO_NODE_TEXT || buffer_state->_type == GUMBO_NODE_CDATA ); GumboNode* text_node = create_node(buffer_state->_type); GumboText* text_node_data = &text_node->v.text; text_node_data->text = gumbo_string_buffer_to_string(&buffer_state->_buffer); text_node_data->original_text.data = buffer_state->_start_original_text; text_node_data->original_text.length = state->_current_token->original_text.data - buffer_state->_start_original_text; text_node_data->start_pos = buffer_state->_start_position; gumbo_debug ( "Flushing text node buffer of %.*s.\n", (int) buffer_state->_buffer.length, buffer_state->_buffer.data ); InsertionLocation location = get_appropriate_insertion_location(parser, NULL); if (location.target->type == GUMBO_NODE_DOCUMENT) { // The DOM does not allow Document nodes to have Text children, so per the // spec, they are dropped on the floor. destroy_node(text_node); } else { insert_node(text_node, location); } gumbo_string_buffer_clear(&buffer_state->_buffer); buffer_state->_type = GUMBO_NODE_WHITESPACE; assert(buffer_state->_buffer.length == 0); } static void record_end_of_element ( const GumboToken* current_token, GumboElement* element ) { element->end_pos = current_token->position; element->original_end_tag = (current_token->type == GUMBO_TOKEN_END_TAG) ? current_token->original_text : kGumboEmptyString; } static GumboNode* pop_current_node(GumboParser* parser) { GumboParserState* state = parser->_parser_state; maybe_flush_text_node_buffer(parser); if (state->_open_elements.length > 0) { assert(node_html_tag_is(state->_open_elements.data[0], GUMBO_TAG_HTML)); gumbo_debug ( "Popping %s node.\n", gumbo_normalized_tagname(get_current_node(parser)->v.element.tag) ); } GumboNode* current_node = gumbo_vector_pop(&state->_open_elements); if (!current_node) { assert(state->_open_elements.length == 0); return NULL; } assert ( current_node->type == GUMBO_NODE_ELEMENT || current_node->type == GUMBO_NODE_TEMPLATE ); bool is_closed_body_or_html_tag = ( node_html_tag_is(current_node, GUMBO_TAG_BODY) && state->_closed_body_tag ) || ( node_html_tag_is(current_node, GUMBO_TAG_HTML) && state->_closed_html_tag ) ; if ( ( state->_current_token->type != GUMBO_TOKEN_END_TAG || !node_qualified_tagname_is ( current_node, GUMBO_NAMESPACE_HTML, state->_current_token->v.end_tag.tag, state->_current_token->v.end_tag.name ) ) && !is_closed_body_or_html_tag ) { current_node->parse_flags |= GUMBO_INSERTION_IMPLICIT_END_TAG; } if (!is_closed_body_or_html_tag) { record_end_of_element(state->_current_token, ¤t_node->v.element); } return current_node; } static void append_comment_node ( GumboParser* parser, GumboNode* node, const GumboToken* token ) { maybe_flush_text_node_buffer(parser); GumboNode* comment = create_node(GUMBO_NODE_COMMENT); comment->type = GUMBO_NODE_COMMENT; comment->parse_flags = GUMBO_INSERTION_NORMAL; comment->v.text.text = token->v.text; comment->v.text.original_text = token->original_text; comment->v.text.start_pos = token->position; append_node(node, comment); } // https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-row-context static void clear_stack_to_table_row_context(GumboParser* parser) { static const TagSet tags = {TAG(HTML), TAG(TR), TAG(TEMPLATE)}; while (!node_tag_in_set(get_current_node(parser), &tags)) { pop_current_node(parser); } } // https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-context static void clear_stack_to_table_context(GumboParser* parser) { static const TagSet tags = {TAG(HTML), TAG(TABLE), TAG(TEMPLATE)}; while (!node_tag_in_set(get_current_node(parser), &tags)) { pop_current_node(parser); } } // https://html.spec.whatwg.org/multipage/parsing.html#clear-the-stack-back-to-a-table-body-context static void clear_stack_to_table_body_context(GumboParser* parser) { static const TagSet tags = { TAG(HTML), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TEMPLATE) }; while (!node_tag_in_set(get_current_node(parser), &tags)) { pop_current_node(parser); } } // Creates a parser-inserted element in the HTML namespace and returns it. static GumboNode* create_element(GumboParser* parser, GumboTag tag) { // XXX: This will fail for creating fragments with an element with tag // GUMBO_TAG_UNKNOWN assert(tag != GUMBO_TAG_UNKNOWN); GumboNode* node = create_node(GUMBO_NODE_ELEMENT); GumboElement* element = &node->v.element; gumbo_vector_init(1, &element->children); gumbo_vector_init(0, &element->attributes); element->tag = tag; element->name = gumbo_normalized_tagname(tag); element->tag_namespace = GUMBO_NAMESPACE_HTML; element->original_tag = kGumboEmptyString; element->original_end_tag = kGumboEmptyString; element->start_pos = (parser->_parser_state->_current_token) ? parser->_parser_state->_current_token->position : kGumboEmptySourcePosition ; element->end_pos = kGumboEmptySourcePosition; return node; } // Constructs an element from the given start tag token. static GumboNode* create_element_from_token ( GumboToken* token, GumboNamespaceEnum tag_namespace ) { assert(token->type == GUMBO_TOKEN_START_TAG); GumboTokenStartTag* start_tag = &token->v.start_tag; GumboNodeType type = ( tag_namespace == GUMBO_NAMESPACE_HTML && start_tag->tag == GUMBO_TAG_TEMPLATE ) ? GUMBO_NODE_TEMPLATE : GUMBO_NODE_ELEMENT ; GumboNode* node = create_node(type); GumboElement* element = &node->v.element; gumbo_vector_init(1, &element->children); element->attributes = start_tag->attributes; element->tag = start_tag->tag; element->name = start_tag->name ? start_tag->name : gumbo_normalized_tagname(start_tag->tag); element->tag_namespace = tag_namespace; assert(token->original_text.length >= 2); assert(token->original_text.data[0] == '<'); assert(token->original_text.data[token->original_text.length - 1] == '>'); element->original_tag = token->original_text; element->start_pos = token->position; element->original_end_tag = kGumboEmptyString; element->end_pos = kGumboEmptySourcePosition; // The element takes ownership of the attributes and name from the token, so // any allocated-memory fields should be nulled out. start_tag->attributes = kGumboEmptyVector; start_tag->name = NULL; return node; } // https://html.spec.whatwg.org/multipage/parsing.html#insert-an-html-element static void insert_element ( GumboParser* parser, GumboNode* node, bool is_reconstructing_formatting_elements ) { GumboParserState* state = parser->_parser_state; // NOTE(jdtang): The text node buffer must always be flushed before inserting // a node, otherwise we're handling nodes in a different order than the spec // mandated. However, one clause of the spec (character tokens in the body) // requires that we reconstruct the active formatting elements *before* adding // the character, and reconstructing the active formatting elements may itself // result in the insertion of new elements (which should be pushed onto the // stack of open elements before the buffer is flushed). We solve this (for // the time being, the spec has been rewritten for