// Copyright 2010 Google Inc.
// Copyright 2018 Craig Barnes.
// Licensed under the Apache License, version 2.0.

// We use Gumbo as a prefix for types, gumbo_ as a prefix for functions,
// GUMBO_ as a prefix for enum constants and kGumbo as a prefix for
// static constants

/**
 * @file
 * @mainpage Gumbo HTML Parser
 *
 * This provides a conformant, no-dependencies implementation of the
 * [HTML5] parsing algorithm. It supports only UTF-8 -- if you need
 * to parse a different encoding, run a preprocessing step to convert
 * to UTF-8. It returns a parse tree made of the structs in this file.
 *
 * Example:
 * @code
 *    GumboOutput* output = gumbo_parse(input);
 *    do_something_with_doctype(output->document);
 *    do_something_with_html_tree(output->root);
 *    gumbo_destroy_output(output);
 * @endcode
 *
 * [HTML5]: https://html.spec.whatwg.org/multipage/
 */

#ifndef GUMBO_H
#define GUMBO_H

#include <stdbool.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * A struct representing a character position within the original text
 * buffer. Line and column numbers are 1-based and offsets are 0-based,
 * which matches how most editors and command-line tools work.
 */
typedef struct {
  size_t line;
  size_t column;
  size_t offset;
} GumboSourcePosition;

/**
 * A struct representing a string or part of a string. Strings within
 * the parser are represented by a `char*` and a length; the `char*`
 * points into an existing data buffer owned by some other code (often
 * the original input). `GumboStringPiece`s are assumed (by convention)
 * to be immutable, because they may share data. Clients should assume
 * that it is not NUL-terminated and should always use explicit lengths
 * when manipulating them.
 */
typedef struct {
  /** A pointer to the beginning of the string. `NULL` if `length == 0`. */
  const char* data;

  /** The length of the string fragment, in bytes (may be zero). */
  size_t length;
} GumboStringPiece;

/** Brace initializer for a 0-length null `GumboStringPiece`. */
#define GUMBO_EMPTY_STRING_INIT { .data = NULL, .length = 0 }

/** A constant to represent a 0-length null string. */
#define kGumboEmptyString (const GumboStringPiece)GUMBO_EMPTY_STRING_INIT

/**
 * Compares two `GumboStringPiece`s, and returns `true` if they're
 * equal or `false` otherwise.
 */
bool gumbo_string_equals(
  const GumboStringPiece* str1,
  const GumboStringPiece* str2
);

/**
 * Compares two `GumboStringPiece`s, ignoring case, and returns `true`
 * if they're equal or `false` otherwise.
 */
bool gumbo_string_equals_ignore_case(
  const GumboStringPiece* str1,
  const GumboStringPiece* str2
);

/**
 * Check if the first `GumboStringPiece` is a prefix of the second, ignoring
 * case.
 */
bool gumbo_string_prefix_ignore_case(
  const GumboStringPiece* prefix,
  const GumboStringPiece* str
);

/**
 * A simple vector implementation. This stores a pointer to a data array
 * and a length. All elements are stored as `void*`; client code must
 * cast to the appropriate type. Overflows upon addition result in
 * reallocation of the data array, with the size doubling to maintain
 * `O(1)` amortized cost. There is no removal function, as this isn't
 * needed for any of the operations within this library. Iteration can
 * be done through inspecting the structure directly in a `for` loop.
 */
typedef struct {
  /**
   * Data elements. This points to a dynamically-allocated array of
   * `capacity` elements, each a `void*` to the element itself.
   */
  void** data;

  /** Number of elements currently in the vector. */
  unsigned int length;

  /** Current array capacity. */
  unsigned int capacity;
} GumboVector;

/** Brace initializer for an empty `GumboVector`. */
#define GUMBO_EMPTY_VECTOR_INIT { .data = NULL, .length = 0, .capacity = 0 }

/** An empty (0-length, 0-capacity) `GumboVector`. */
#define kGumboEmptyVector (const GumboVector)GUMBO_EMPTY_VECTOR_INIT

/**
 * Returns the first index at which an element appears in this vector
 * (testing by pointer equality), or `-1` if it never does.
 */
int gumbo_vector_index_of(GumboVector* vector, const void* element);

/**
 * An `enum` for all the tags defined in the HTML5 standard. These
 * correspond to the tag names themselves. Enum constants exist only
 * for tags that appear in the spec itself (or for tags with special
 * handling in the SVG and MathML namespaces). Any other tags appear
 * as `GUMBO_TAG_UNKNOWN` and the actual tag name can be obtained
 * through `original_tag`.
 *
 * This is mostly for API convenience, so that clients of this library
 * don't need to perform a `strcasecmp` to find the normalized tag
 * name. It also has efficiency benefits, by letting the parser work
 * with enums instead of strings.
 *
 * The numeric values are implied by declaration order, so new tags
 * must not be inserted in a way that callers relying on the values
 * would not expect.
 */
typedef enum {
  GUMBO_TAG_HTML,
  GUMBO_TAG_HEAD,
  GUMBO_TAG_TITLE,
  GUMBO_TAG_BASE,
  GUMBO_TAG_LINK,
  GUMBO_TAG_META,
  GUMBO_TAG_STYLE,
  GUMBO_TAG_SCRIPT,
  GUMBO_TAG_NOSCRIPT,
  GUMBO_TAG_TEMPLATE,
  GUMBO_TAG_BODY,
  GUMBO_TAG_ARTICLE,
  GUMBO_TAG_SECTION,
  GUMBO_TAG_NAV,
  GUMBO_TAG_ASIDE,
  GUMBO_TAG_H1,
  GUMBO_TAG_H2,
  GUMBO_TAG_H3,
  GUMBO_TAG_H4,
  GUMBO_TAG_H5,
  GUMBO_TAG_H6,
  GUMBO_TAG_HGROUP,
  GUMBO_TAG_HEADER,
  GUMBO_TAG_FOOTER,
  GUMBO_TAG_ADDRESS,
  GUMBO_TAG_P,
  GUMBO_TAG_HR,
  GUMBO_TAG_PRE,
  GUMBO_TAG_BLOCKQUOTE,
  GUMBO_TAG_OL,
  GUMBO_TAG_UL,
  GUMBO_TAG_LI,
  GUMBO_TAG_DL,
  GUMBO_TAG_DT,
  GUMBO_TAG_DD,
  GUMBO_TAG_FIGURE,
  GUMBO_TAG_FIGCAPTION,
  GUMBO_TAG_MAIN,
  GUMBO_TAG_DIV,
  GUMBO_TAG_A,
  GUMBO_TAG_EM,
  GUMBO_TAG_STRONG,
  GUMBO_TAG_SMALL,
  GUMBO_TAG_S,
  GUMBO_TAG_CITE,
  GUMBO_TAG_Q,
  GUMBO_TAG_DFN,
  GUMBO_TAG_ABBR,
  GUMBO_TAG_DATA,
  GUMBO_TAG_TIME,
  GUMBO_TAG_CODE,
  GUMBO_TAG_VAR,
  GUMBO_TAG_SAMP,
  GUMBO_TAG_KBD,
  GUMBO_TAG_SUB,
  GUMBO_TAG_SUP,
  GUMBO_TAG_I,
  GUMBO_TAG_B,
  GUMBO_TAG_U,
  GUMBO_TAG_MARK,
  GUMBO_TAG_RUBY,
  GUMBO_TAG_RT,
  GUMBO_TAG_RP,
  GUMBO_TAG_BDI,
  GUMBO_TAG_BDO,
  GUMBO_TAG_SPAN,
  GUMBO_TAG_BR,
  GUMBO_TAG_WBR,
  GUMBO_TAG_INS,
  GUMBO_TAG_DEL,
  GUMBO_TAG_IMAGE,
  GUMBO_TAG_IMG,
  GUMBO_TAG_IFRAME,
  GUMBO_TAG_EMBED,
  GUMBO_TAG_OBJECT,
  GUMBO_TAG_PARAM,
  GUMBO_TAG_VIDEO,
  GUMBO_TAG_AUDIO,
  GUMBO_TAG_SOURCE,
  GUMBO_TAG_TRACK,
  GUMBO_TAG_CANVAS,
  GUMBO_TAG_MAP,
  GUMBO_TAG_AREA,
  GUMBO_TAG_MATH,
  GUMBO_TAG_MI,
  GUMBO_TAG_MO,
  GUMBO_TAG_MN,
  GUMBO_TAG_MS,
  GUMBO_TAG_MTEXT,
  GUMBO_TAG_MGLYPH,
  GUMBO_TAG_MALIGNMARK,
  GUMBO_TAG_ANNOTATION_XML,
  GUMBO_TAG_SVG,
  GUMBO_TAG_FOREIGNOBJECT,
  GUMBO_TAG_DESC,
  GUMBO_TAG_TABLE,
  GUMBO_TAG_CAPTION,
  GUMBO_TAG_COLGROUP,
  GUMBO_TAG_COL,
  GUMBO_TAG_TBODY,
  GUMBO_TAG_THEAD,
  GUMBO_TAG_TFOOT,
  GUMBO_TAG_TR,
  GUMBO_TAG_TD,
  GUMBO_TAG_TH,
  GUMBO_TAG_FORM,
  GUMBO_TAG_FIELDSET,
  GUMBO_TAG_LEGEND,
  GUMBO_TAG_LABEL,
  GUMBO_TAG_INPUT,
  GUMBO_TAG_BUTTON,
  GUMBO_TAG_SELECT,
  GUMBO_TAG_DATALIST,
  GUMBO_TAG_OPTGROUP,
  GUMBO_TAG_OPTION,
  GUMBO_TAG_TEXTAREA,
  GUMBO_TAG_KEYGEN,
  GUMBO_TAG_OUTPUT,
  GUMBO_TAG_PROGRESS,
  GUMBO_TAG_METER,
  GUMBO_TAG_DETAILS,
  GUMBO_TAG_SUMMARY,
  GUMBO_TAG_MENU,
  GUMBO_TAG_MENUITEM,
  GUMBO_TAG_APPLET,
  GUMBO_TAG_ACRONYM,
  GUMBO_TAG_BGSOUND,
  GUMBO_TAG_DIR,
  GUMBO_TAG_FRAME,
  GUMBO_TAG_FRAMESET,
  GUMBO_TAG_NOFRAMES,
  GUMBO_TAG_LISTING,
  GUMBO_TAG_XMP,
  GUMBO_TAG_NEXTID,
  GUMBO_TAG_NOEMBED,
  GUMBO_TAG_PLAINTEXT,
  GUMBO_TAG_RB,
  GUMBO_TAG_STRIKE,
  GUMBO_TAG_BASEFONT,
  GUMBO_TAG_BIG,
  GUMBO_TAG_BLINK,
  GUMBO_TAG_CENTER,
  GUMBO_TAG_FONT,
  GUMBO_TAG_MARQUEE,
  GUMBO_TAG_MULTICOL,
  GUMBO_TAG_NOBR,
  GUMBO_TAG_SPACER,
  GUMBO_TAG_TT,
  GUMBO_TAG_RTC,
  GUMBO_TAG_DIALOG,
  GUMBO_TAG_SEARCH,
  // Used for all tags that don't have special handling in HTML.
  GUMBO_TAG_UNKNOWN,
  // A marker value to indicate the end of the enum, for iterating over it.
  GUMBO_TAG_LAST,
} GumboTag;

/**
 * Returns the normalized (all lower case) tag name for a `GumboTag` enum. The
 * return value is static data owned by the library.
 */
const char* gumbo_normalized_tagname(GumboTag tag);

/**
 * Extracts the tag name from the `original_text` field of an element
 * or token by stripping off `</>` characters and attributes and
 * adjusting the passed-in `GumboStringPiece` appropriately. The tag
 * name is in the original case and shares a buffer with the original
 * text, to simplify memory management. Behavior is undefined if a
 * string piece that doesn't represent an HTML tag (`<tagname>` or
 * `</tagname>`) is passed in. If the string piece is completely
 * empty (`NULL` data pointer), then this function will exit
 * successfully as a no-op.
 */
void gumbo_tag_from_original_text(GumboStringPiece* text);

/**
 * Fixes the case of SVG elements that are not all lowercase. This is
 * not done at parse time because there's no place to store a mutated
 * tag name. `tag` is an enum (which will be `GUMBO_TAG_UNKNOWN` for
 * most SVG tags without special handling), while `original_tag` is a
 * pointer into the original buffer. Instead, we provide this helper
 * function that clients can use to rename SVG tags as appropriate.
 * Returns the case-normalized SVG tagname if a replacement is found, or
 * `NULL` if no normalization is called for. The return value is static
 * data and owned by the library.
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inforeign
 */
const char* gumbo_normalize_svg_tagname(const GumboStringPiece* tagname);

/**
 * Converts a tag name string (which may be in upper or mixed case) to a
 * tag enum. The name is given as a pointer plus an explicit byte
 * `length`.
 */
GumboTag gumbo_tagn_enum(const char* tagname, size_t length);

/**
 * Attribute namespaces.
 * HTML includes special handling for XLink, XML, and XMLNS namespaces
 * on attributes. Everything else goes in the generic "NONE" namespace.
 */
typedef enum {
  GUMBO_ATTR_NAMESPACE_NONE,
  GUMBO_ATTR_NAMESPACE_XLINK,
  GUMBO_ATTR_NAMESPACE_XML,
  GUMBO_ATTR_NAMESPACE_XMLNS,
} GumboAttributeNamespaceEnum;

/**
 * A struct representing a single attribute on a HTML tag. This is a
 * name-value pair, but also includes information about source locations
 * and original source text.
 */
typedef struct {
  /**
   * The namespace for the attribute. This will usually be
   * `GUMBO_ATTR_NAMESPACE_NONE`, but some XLink/XMLNS/XML attributes
   * take special values, per:
   * https://html.spec.whatwg.org/multipage/parsing.html#adjust-foreign-attributes
   */
  GumboAttributeNamespaceEnum attr_namespace;

  /**
   * The name of the attribute. This is in a freshly-allocated buffer to
   * deal with case-normalization and is null-terminated.
   */
  const char* name;

  /**
   * The original text of the attribute name, as a pointer into the
   * original source buffer.
   */
  GumboStringPiece original_name;

  /**
   * The value of the attribute. This is in a freshly-allocated buffer
   * to deal with unescaping and is null-terminated. It does not include
   * any quotes that surround the attribute. If the attribute has no
   * value (for example, `selected` on a checkbox) this will be an empty
   * string.
   */
  const char* value;

  /**
   * The original text of the value of the attribute. This points into
   * the original source buffer. It includes any quotes that surround
   * the attribute and you can look at `original_value.data[0]` and
   * `original_value.data[original_value.length - 1]` to determine what
   * the quote characters were. If the attribute has no value this will
   * be a 0-length string.
   */
  GumboStringPiece original_value;

  /** The starting position of the attribute name. */
  GumboSourcePosition name_start;

  /**
   * The ending position of the attribute name. This is not always derivable
   * from the starting position of the value because of the possibility of
   * whitespace around the `=` sign.
   */
  GumboSourcePosition name_end;

  /** The starting position of the attribute value. */
  GumboSourcePosition value_start;

  /** The ending position of the attribute value. */
  GumboSourcePosition value_end;
} GumboAttribute;

/**
 * Given a vector of `GumboAttribute`s, look up the one with the
 * specified name and return it, or `NULL` if no such attribute exists.
 * This uses a case-insensitive match, as HTML is case-insensitive.
 */
GumboAttribute* gumbo_get_attribute(const GumboVector* attrs, const char* name);

/**
 * Enum denoting the type of node. This determines the type of the
 * `node.v` union.
 */
typedef enum {
  /** Document node. `v` will be a `GumboDocument`. */
  GUMBO_NODE_DOCUMENT,
  /** Element node. `v` will be a `GumboElement`. */
  GUMBO_NODE_ELEMENT,
  /** Text node. `v` will be a `GumboText`. */
  GUMBO_NODE_TEXT,
  /** CDATA node. `v` will be a `GumboText`. */
  GUMBO_NODE_CDATA,
  /** Comment node. `v` will be a `GumboText`, excluding comment delimiters. */
  GUMBO_NODE_COMMENT,
  /** Text node, where all contents is whitespace. `v` will be a `GumboText`. */
  GUMBO_NODE_WHITESPACE,
  /**
   * Template node. This is separate from `GUMBO_NODE_ELEMENT` because
   * many client libraries will want to ignore the contents of template
   * nodes, as the spec suggests. Recursing on `GUMBO_NODE_ELEMENT` will
   * do the right thing here, while clients that want to include template
   * contents should also check for `GUMBO_NODE_TEMPLATE`. `v` will be a
   * `GumboElement`.
   */
  GUMBO_NODE_TEMPLATE
} GumboNodeType;

/**
 * Forward declaration of GumboNode so it can be used recursively in
 * GumboNode.parent.
 */
typedef struct GumboInternalNode GumboNode;

/** https://dom.spec.whatwg.org/#concept-document-quirks */
typedef enum {
  GUMBO_DOCTYPE_NO_QUIRKS,
  GUMBO_DOCTYPE_QUIRKS,
  GUMBO_DOCTYPE_LIMITED_QUIRKS
} GumboQuirksModeEnum;

/**
 * Namespaces.
 * Unlike in X(HT)ML, namespaces in HTML5 are not denoted by a prefix.
 * Rather, anything inside an `<svg>` tag is in the SVG namespace,
 * anything inside the `<math>` tag is in the MathML namespace, and
 * anything else is inside the HTML namespace. No other namespaces are
 * supported, so this can be an `enum`.
 */
typedef enum {
  GUMBO_NAMESPACE_HTML,
  GUMBO_NAMESPACE_SVG,
  GUMBO_NAMESPACE_MATHML
} GumboNamespaceEnum;

/**
 * Parse flags.
 * We track the reasons for parser insertion of nodes and store them in
 * a bitvector in the node itself. This lets client code optimize out
 * nodes that are implied by the HTML structure of the document, or flag
 * constructs that may not be allowed by a style guide, or track the
 * prevalence of incorrect or tricky HTML code.
 */
typedef enum {
  /**
   * A normal node -- both start and end tags appear in the source,
   * nothing has been reparented.
   */
  GUMBO_INSERTION_NORMAL = 0,

  /**
   * A node inserted by the parser to fulfill some implicit insertion
   * rule. This is usually set in addition to some other flag giving a
   * more specific insertion reason; it's a generic catch-all term
   * meaning "The start tag for this node did not appear in the document
   * source".
   */
  GUMBO_INSERTION_BY_PARSER = 1 << 0,

  /**
   * A flag indicating that the end tag for this node did not appear in
   * the document source. Note that in some cases, you can still have
   * parser-inserted nodes with an explicit end tag. For example,
   * `Text</html>` has `GUMBO_INSERTION_BY_PARSER` set on the `<html>`
   * node, but `GUMBO_INSERTION_IMPLICIT_END_TAG` is unset, as the
   * `</html>` tag actually exists.
   *
   * This flag will be set only if the end tag is completely missing.
   * In some cases, the end tag may be misplaced (e.g. a `</body>` tag
   * with text afterwards), which will leave this flag unset and require
   * clients to inspect the parse errors for that case.
   */
  GUMBO_INSERTION_IMPLICIT_END_TAG = 1 << 1,

  // Value 1 << 2 was for a flag that has since been removed.

  /**
   * A flag for nodes that are inserted because their presence is
   * implied by other tags, e.g. `<html>`, `<head>`, `<body>`,
   * `<tbody>`, etc.
   */
  GUMBO_INSERTION_IMPLIED = 1 << 3,

  /**
   * A flag for nodes that are converted from their end tag equivalents.
   * For example, `</p>` when no paragraph is open implies that the
   * parser should create a `<p>` tag and immediately close it, while
   * `</br>` means the same thing as `<br>`.
   */
  GUMBO_INSERTION_CONVERTED_FROM_END_TAG = 1 << 4,

  // Value 1 << 5 was for a flag that has since been removed.

  /** A flag for `<image>` tags that are rewritten as `<img>`. */
  GUMBO_INSERTION_FROM_IMAGE = 1 << 6,

  /**
   * A flag for nodes that are cloned as a result of the reconstruction
   * of active formatting elements. This is set only on the clone; the
   * initial portion of the formatting run is a NORMAL node with an
   * `IMPLICIT_END_TAG`.
   */
  GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT = 1 << 7,

  /** A flag for nodes that are cloned by the adoption agency algorithm. */
  GUMBO_INSERTION_ADOPTION_AGENCY_CLONED = 1 << 8,

  /** A flag for nodes that are moved by the adoption agency algorithm. */
  GUMBO_INSERTION_ADOPTION_AGENCY_MOVED = 1 << 9,

  /**
   * A flag for nodes that have been foster-parented out of a table (or
   * should've been foster-parented, if verbatim mode is set).
   */
  GUMBO_INSERTION_FOSTER_PARENTED = 1 << 10,
} GumboParseFlags;

/** Information specific to document nodes. */
typedef struct {
  /**
   * An array of `GumboNode`s, containing the children of this element.
   * This will normally consist of the `<html>` element and any comment
   * nodes found. Pointers are owned.
   */
  GumboVector /* GumboNode* */ children;

  /**
   * `true` if there was an explicit doctype token, as opposed to it
   * being omitted.
   */
  bool has_doctype;

  // Fields from the doctype token, copied verbatim.
  const char* name;
  const char* public_identifier;
  const char* system_identifier;

  /**
   * Whether or not the document is in QuirksMode, as determined by the
   * values in the GumboTokenDocType template.
   */
  GumboQuirksModeEnum doc_type_quirks_mode;
} GumboDocument;

/**
 * The struct used to represent TEXT, CDATA, COMMENT, and WHITESPACE
 * elements. This contains just a block of text and its position.
 */
typedef struct {
  /**
   * The text of this node, after entities have been parsed and decoded.
   * For comment and cdata nodes, this does not include the comment
   * delimiters.
   */
  const char* text;

  /**
   * The original text of this node, as a pointer into the original
   * buffer. For comment/cdata nodes, this includes the comment
   * delimiters.
   */
  GumboStringPiece original_text;

  /**
   * The starting position of this node. This corresponds to the
   * position of `original_text`, before entities are decoded.
   */
  GumboSourcePosition start_pos;
} GumboText;

/**
 * The struct used to represent all HTML elements. This contains
 * information about the tag, attributes, and child nodes.
 */
typedef struct {
  /**
   * An array of `GumboNode`s, containing the children of this element.
   * Pointers are owned.
   */
  GumboVector /* GumboNode* */ children;

  /** The GumboTag enum for this element. */
  GumboTag tag;

  /** The name for this element. */
  const char* name;

  /** The GumboNamespaceEnum for this element. */
  GumboNamespaceEnum tag_namespace;

  /**
   * A `GumboStringPiece` pointing to the original tag text for this
   * element, pointing directly into the source buffer. If the tag was
   * inserted algorithmically (for example, `<head>` or `<tbody>`
   * insertion), this will be a zero-length string.
   */
  GumboStringPiece original_tag;

  /**
   * A `GumboStringPiece` pointing to the original end tag text for this
   * element. If the end tag was inserted algorithmically, (for example,
   * closing a self-closing tag), this will be a zero-length string.
   */
  GumboStringPiece original_end_tag;

  /** The source position for the start of the start tag. */
  GumboSourcePosition start_pos;

  /** The source position for the start of the end tag. */
  GumboSourcePosition end_pos;

  /**
   * An array of `GumboAttribute`s, containing the attributes for this
   * tag in the order that they were parsed. Pointers are owned.
   */
  GumboVector /* GumboAttribute* */ attributes;
} GumboElement;

/**
 * A supertype for `GumboElement` and `GumboText`, so that we can
 * include one generic type in lists of children and cast as necessary
 * to subtypes.
 */
struct GumboInternalNode {
  /** The type of node that this is. */
  GumboNodeType type;

  /** Pointer back to parent node. Not owned. */
  GumboNode* parent;

  /** The index within the parent's children vector of this node. */
  unsigned int index_within_parent;

  /**
   * A bitvector of flags containing information about why this element
   * was inserted into the parse tree, including a variety of special
   * parse situations.
   */
  GumboParseFlags parse_flags;

  /** The actual node data. Which member is valid is selected by `type`. */
  union {
    GumboDocument document; // For GUMBO_NODE_DOCUMENT.
    GumboElement element;   // For GUMBO_NODE_ELEMENT and GUMBO_NODE_TEMPLATE.
    GumboText text;         // For all other node types.
  } v;
};

/**
 * Input struct containing configuration options for the parser.
 * These let you specify alternate memory managers, provide different
 * error handling, etc. Use `kGumboDefaultOptions` for sensible
 * defaults and only set what you need.
 */
typedef struct GumboInternalOptions {
  /**
   * The tab-stop size, for computing positions in HTML files that
   * use tabs. Default: `8`.
   */
  int tab_stop;

  /**
   * Whether or not to stop parsing when the first error is encountered.
   * Default: `false`.
   */
  bool stop_on_first_error;

  /**
   * Maximum allowed number of attributes per element. If this limit is
   * exceeded, the parser will return early with a partial document and
   * the returned `GumboOutput` will have its `status` field set to
   * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
   * Default: `400`.
   */
  int max_attributes;

  /**
   * Maximum allowed depth for the parse tree. If this limit is exceeded,
   * the parser will return early with a partial document and the returned
   * `GumboOutput` will have its `status` field set to
   * `GUMBO_STATUS_TREE_TOO_DEEP`.
   * Default: `400`.
   */
  unsigned int max_tree_depth;

  /**
   * The maximum number of errors before the parser stops recording
   * them. This is provided so that if the page is totally borked, we
   * don't completely fill up the errors vector and exhaust memory with
   * useless redundant errors. Set to `-1` to disable the limit.
   * Default: `-1`.
   */
  int max_errors;

  /**
   * The fragment context for parsing:
   * https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
   *
   * If `NULL` is passed here, it is assumed to be "no
   * fragment", i.e. the regular parsing algorithm. Otherwise, pass the
   * tag name for the intended parent of the parsed fragment. We use the
   * tag name, namespace, and encoding attribute which are sufficient to
   * set all of the parsing context needed for fragment parsing.
   *
   * Default: `NULL`.
   */
  const char* fragment_context;

  /**
   * The namespace for the fragment context. This lets client code
   * differentiate between, say, parsing a `<title>` tag in SVG vs.
   * parsing it in HTML.
   *
   * Default: `GUMBO_NAMESPACE_HTML`.
   */
  GumboNamespaceEnum fragment_namespace;

  /**
   * The value of the fragment context's `encoding` attribute, if any.
   * Set to `NULL` for no `encoding` attribute.
   *
   * Default: `NULL`.
   */
  const char* fragment_encoding;

  /**
   * Quirks mode for fragment parsing. The quirks mode for a given DOCTYPE can
   * be looked up using `gumbo_compute_quirks_mode()`.
   *
   * Default: `GUMBO_DOCTYPE_NO_QUIRKS`.
   */
  GumboQuirksModeEnum quirks_mode;

  /**
   * For fragment parsing. Set this to true if the context node has a form
   * element as an ancestor.
   *
   * Default: `false`.
   */
  bool fragment_context_has_form_ancestor;
} GumboOptions;

/** Default options struct; use this with gumbo_parse_with_options. */
extern const GumboOptions kGumboDefaultOptions;

/**
 * Status code indicating whether parsing finished successfully or
 * was stopped mid-document due to exceptional circumstances.
 */
typedef enum {
  /**
   * Indicates that parsing completed successfully. The resulting tree
   * will be a complete document.
   */
  GUMBO_STATUS_OK,

  /**
   * Indicates that the maximum element nesting limit
   * (`GumboOptions::max_tree_depth`) was reached during parsing. The
   * resulting tree will be a partial document, with no further nodes
   * created after the point where the limit was reached. The partial
   * document may be useful for constructing an error message but
   * typically shouldn't be used for other purposes.
   */
  GUMBO_STATUS_TREE_TOO_DEEP,

  /**
   * Indicates that the maximum number of attributes per element
   * (`GumboOptions::max_attributes`) was reached during parsing. The
   * resulting tree will be a partial document, with no further nodes
   * created after the point where the limit was reached. The partial
   * document may be useful for constructing an error message but
   * typically shouldn't be used for other purposes.
   */
  GUMBO_STATUS_TOO_MANY_ATTRIBUTES,

  // Currently unused
  GUMBO_STATUS_OUT_OF_MEMORY,
} GumboOutputStatus;

/** The output struct containing the results of the parse. */
typedef struct GumboInternalOutput {
  /**
   * Pointer to the document node. This is a `GumboNode` of type
   * `GUMBO_NODE_DOCUMENT` that contains the entire document as its child.
   */
  GumboNode* document;

  /**
   * Pointer to the root node. This is the `<html>` tag that forms the
   * root of the document.
   */
  GumboNode* root;

  /**
   * A list of errors that occurred during the parse.
   */
  GumboVector /* GumboError* */ errors;

  /**
   * True if the parser encountered an error.
   *
   * This can be true and `errors` an empty `GumboVector` if the `max_errors`
   * option was set to 0.
   */
  bool document_error;

  /**
   * A status code indicating whether parsing finished successfully or was
   * stopped mid-document due to exceptional circumstances.
   */
  GumboOutputStatus status;
} GumboOutput;

/**
 * Parses a buffer of UTF-8 text into a `GumboNode` parse tree. The
 * buffer must live at least as long as the parse tree, as some fields
 * (e.g. `original_text`) point directly into the original buffer.
 *
 * No length is passed, so the buffer is presumably expected to be
 * NUL-terminated (TODO confirm against the implementation); use
 * `gumbo_parse_with_options` to supply an explicit length.
 *
 * This doesn't support buffers longer than 4 gigabytes.
 */
GumboOutput* gumbo_parse(const char* buffer);

/**
 * Extended version of `gumbo_parse` that takes an explicit options
 * structure, buffer, and length.
 */
GumboOutput* gumbo_parse_with_options(
  const GumboOptions* options,
  const char* buffer,
  size_t buffer_length
);

/**
 * Compute the quirks mode based on the name, public identifier, and system
 * identifier. Any of these may be `NULL` to indicate a missing value.
 */
GumboQuirksModeEnum gumbo_compute_quirks_mode(
  const char* name,
  const char* pubid,
  const char* sysid
);

/** Convert a `GumboOutputStatus` code into a readable description. */
const char* gumbo_status_to_string(GumboOutputStatus status);

/** Release the memory used for the parse tree and parse errors. */
void gumbo_destroy_output(GumboOutput* output);

/** Opaque GumboError type */
typedef struct GumboInternalError GumboError;

/**
 * Returns the position of the error.
 */
GumboSourcePosition gumbo_error_position(const GumboError* error);

/**
 * Returns a constant string representation of the error's code. This is owned
 * by the library and should not be freed by the caller.
 */
const char* gumbo_error_code(const GumboError* error);

/**
 * Prints an error to a string. This stores a freshly-allocated buffer
 * containing the error message text in output. The caller is responsible for
 * freeing the buffer. The size of the error message is returned. The error
 * message itself may not be NUL-terminated and may contain NUL bytes so the
 * returned size must be used.
 */
size_t gumbo_error_to_string(const GumboError* error, char** output);

/**
 * Prints a caret diagnostic to a string. This stores a freshly-allocated
 * buffer containing the error message text in output. The caller is
 * responsible for freeing the buffer. The size of the error message is
 * returned. The error message itself may not be NUL-terminated and may
 * contain NUL bytes so the returned size must be used.
 */
size_t gumbo_caret_diagnostic_to_string(
  const GumboError* error,
  const char* source_text,
  size_t source_length,
  char** output
);

/**
 * Like gumbo_caret_diagnostic_to_string, but prints the text to stdout
 * instead of writing to a string.
 */
void gumbo_print_caret_diagnostic(
  const GumboError* error,
  const char* source_text,
  size_t source_length
);

#ifdef __cplusplus
}
#endif

#endif // GUMBO_H