#include #include #include #include "houdini.h" /** * & --> & * < --> < * > --> > * " --> " * ' --> ' */ static const char *LOOKUP_CODES[] = { "", /* reserved: use literal single character */ "", /* unused */ "", /* reserved: 2 character UTF-8 */ "", /* reserved: 3 character UTF-8 */ "", /* reserved: 4 character UTF-8 */ "?", /* invalid UTF-8 character */ """, "&", "'", "<", ">" }; static const int LOOKUP_CODES_LENGTHS[] = { 0, 0, 0, 0, 0, 1, 6, 5, 6, 4, 4 }; static const char CODE_INVALID = 5; static const char XML_LOOKUP_TABLE[] = { /* ASCII: 0xxxxxxx */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0, 0, 6, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Invalid UTF-8 char start: 10xxxxxx */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, /* Multibyte UTF-8 */ /* 2 bytes: 110xxxxx */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 3 bytes: 1110xxxx */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 4 bytes: 11110xxx */ 4, 4, 4, 4, 4, 4, 4, 4, /* Invalid UTF-8: 11111xxx */ 5, 5, 5, 5, 5, 5, 5, 5, }; int houdini_escape_xml(gh_buf *ob, const uint8_t *src, size_t size) { size_t i = 0; unsigned char code = 0; gh_buf_grow(ob, HOUDINI_ESCAPED_SIZE(size)); while (i < size) { size_t start, end; start = end = i; while (i < size) { unsigned int byte; byte = src[i++]; code = XML_LOOKUP_TABLE[byte]; if (!code) { /* single character used literally */ } else if (code >= CODE_INVALID) { break; /* insert lookup code string */ } else if (code > size - end) { code = CODE_INVALID; /* truncated UTF-8 character */ break; } else { unsigned int chr = byte & (0xff >> code); while (--code) { byte = src[i++]; if ((byte & 0xc0) != 0x80) { code = CODE_INVALID; break; } chr = (chr << 6) + (byte & 0x3f); } switch (i - end) { case 2: if (chr < 0x80) code = CODE_INVALID; break; case 3: if (chr < 0x800 || (chr > 0xd7ff && chr < 0xe000) || chr > 0xfffd) code = CODE_INVALID; break; case 4: if (chr < 0x10000 || chr > 0x10ffff) code = CODE_INVALID; break; default: break; } if (code == CODE_INVALID) break; } end = i; } if (end > start) gh_buf_put(ob, src + start, end - start); /* escaping */ if (end >= size) break; gh_buf_put(ob, LOOKUP_CODES[code], LOOKUP_CODES_LENGTHS[code]); } return 1; }