/* parse.c * Copyright (c) 2011, Peter Ohler * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * - Neither the name of Peter Ohler nor the names of its contributors may be * used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include "ruby.h" #include "ox.h" static void read_instruction(PInfo pi); static void read_doctype(PInfo pi); static void read_comment(PInfo pi); static void read_element(PInfo pi); static void read_text(PInfo pi); /*static void read_reduced_text(PInfo pi); */ static void read_cdata(PInfo pi); static char* read_name_token(PInfo pi); static char* read_quoted_value(PInfo pi); static char* read_hex_uint64(char *b, uint64_t *up); static char* read_10_uint64(char *b, uint64_t *up); static char* ucs_to_utf8_chars(char *text, uint64_t u); static char* read_coded_chars(PInfo pi, char *text); static void next_non_white(PInfo pi); static int collapse_special(PInfo pi, char *str); /* This XML parser is a single pass, destructive, callback parser. It is a * single pass parser since it makes only one pass over the characters in the * XML document string. It is destructive because it re-uses the content of * the string for values in the callback and places \0 characters at various * places to mark the end of tokens and strings. It is a callback parser like * a SAX parser because it uses callbacks when document elements are * encountered. * * Parsing is very tolerant. Lack of headers and even misspelled element * endings are passed over without raising an error. A best attempt is made in * all cases to parse the string. 
*/ inline static void next_non_white(PInfo pi) { for (; 1; pi->s++) { switch(*pi->s) { case ' ': case '\t': case '\f': case '\n': case '\r': break; default: return; } } } inline static void next_white(PInfo pi) { for (; 1; pi->s++) { switch(*pi->s) { case ' ': case '\t': case '\f': case '\n': case '\r': case '\0': return; default: break; } } } VALUE ox_parse(char *xml, ParseCallbacks pcb, char **endp, Options options) { struct _PInfo pi; int body_read = 0; if (0 == xml) { raise_error("Invalid arg, xml string can not be null", xml, 0); } if (DEBUG <= options->trace) { printf("Parsing xml:\n%s\n", xml); } /* initialize parse info */ pi.str = xml; pi.s = xml; pi.h = 0; pi.pcb = pcb; pi.obj = Qnil; pi.circ_array = 0; pi.options = options; while (1) { next_non_white(&pi); /* skip white space */ if ('\0' == *pi.s) { break; } if (body_read && 0 != endp) { *endp = pi.s; break; } if ('<' != *pi.s) { /* all top level entities start with < */ raise_error("invalid format, expected <", pi.str, pi.s); } pi.s++; /* past < */ switch (*pi.s) { case '?': /* prolog */ pi.s++; read_instruction(&pi); break; case '!': /* comment or doctype */ pi.s++; if ('\0' == *pi.s) { raise_error("invalid format, DOCTYPE or comment not terminated", pi.str, pi.s); } else if ('-' == *pi.s) { pi.s++; /* skip - */ if ('-' != *pi.s) { raise_error("invalid format, bad comment format", pi.str, pi.s); } else { pi.s++; /* skip second - */ read_comment(&pi); } } else if (0 == strncmp("DOCTYPE", pi.s, 7)) { pi.s += 7; read_doctype(&pi); } else { raise_error("invalid format, DOCTYPE or comment expected", pi.str, pi.s); } break; case '\0': raise_error("invalid format, document not terminated", pi.str, pi.s); default: read_element(&pi); body_read = 1; break; } } return pi.obj; } /* Entered after the "s; next_non_white(pi); c = *pi->s; *end = '\0'; /* terminate name */ if ('?' != c) { while ('?' 
!= *pi->s) {
            /* Remainder of the processing instruction reader: each pass of
             * this loop reads one name=value attribute into the slot a points
             * at. NOTE(review): the declarations of a, attrs, target, end and
             * c are not visible in this part of the file. */
            if ('\0' == *pi->s) {
                raise_error("invalid format, processing instruction not terminated", pi->str, pi->s);
            }
            next_non_white(pi);
            a->name = read_name_token(pi);
            end = pi->s;
            next_non_white(pi);
            /* an equals sign has to follow the attribute name */
            if ('=' != *pi->s++) {
                raise_error("invalid format, no attribute value", pi->str, pi->s);
            }
            *end = '\0'; /* terminate name */
            /* read value */
            next_non_white(pi);
            a->value = read_quoted_value(pi);
            a++;
            /* give up once MAX_ATTRS attributes have been stored */
            if (MAX_ATTRS <= (a - attrs)) {
                raise_error("too many attributes", pi->str, pi->s);
            }
            next_non_white(pi);
        }
        /* step over the closing question mark */
        if ('?' == *pi->s) {
            pi->s++;
        }
    } else {
        /* the question mark came straight after the target name */
        pi->s++;
    }
    /* the instruction has to end with a greater-than sign */
    if ('>' != *pi->s++) {
        raise_error("invalid format, processing instruction not terminated", pi->str, pi->s);
    }
    /* the instruct callback is optional */
    if (0 != pi->pcb->instruct) {
        pi->pcb->instruct(pi, target, attrs);
    }
}

/* NOTE(review): the two comments further down that begin with the words
 * Entered after look damaged. Each one runs straight into statements, and the
 * function headers and local declarations those statements rely on (c, depth,
 * docType, end, s, done, comment) are not present in this part of the file.
 * Their text is reproduced unchanged; restore it from a pristine copy of the
 * file rather than rebuilding it by hand. */
/* Entered after the "s; while (1) { c = *pi->s++; if ('\0' == c) { raise_error("invalid format, prolog not terminated", pi->str, pi->s); } else if ('<' == c) { depth++; } else if ('>' == c) { depth--; if (0 == depth) { /* done, at the end */
                pi->s--;
                break;
            }
        }
    }
    *pi->s = '\0';
    pi->s++;
    if (0 != pi->pcb->add_doctype) {
        pi->pcb->add_doctype(pi, docType);
    }
}

/* Entered after ""); if (0 == end) { raise_error("invalid format, comment not terminated", pi->str, pi->s); } for (s = end - 1; pi->s < s && !done; s--) { switch(*s) { case ' ': case '\t': case '\f': case '\n': case '\r': break; default: *(s + 1) = '\0'; done = 1; break; } } *end = '\0'; /* in case the comment was blank */
    pi->s = end + 3;
    if (0 != pi->pcb->add_comment) {
        pi->pcb->add_comment(pi, comment);
    }
}

/* Entered after the '<' and the first character after that. Returns status * code. 
*/
/* Read one element. On entry pi->s is on the element name (the < has already
 * been consumed by the caller).
 *
 * The name and every attribute are terminated in place with \0. Attributes
 * are gathered into a local array whose last used slot gets a null name, and
 * the array is handed to the add_element callback. A self-closing tag reports
 * add_element followed by end_element and returns. Otherwise the children
 * (comments, CDATA, nested elements and text) are read until the matching end
 * tag, for which end_element is reported. Nothing is returned; malformed
 * input is reported through raise_error.
 */
static void
read_element(PInfo pi) {
    struct _Attr attrs[MAX_ATTRS];
    Attr         attr = attrs;
    char        *tag;
    char        *close_name;
    char        *name_end;
    char        *text_start;
    char         ch;
    long         tag_len;

    tag = read_name_token(pi);
    name_end = pi->s;
    tag_len = name_end - tag;
    next_non_white(pi);
    ch = *pi->s;      /* remember it: the next line may overwrite this byte */
    *name_end = '\0'; /* terminate the element name */

    /* Attributes, up to the > that ends the start tag. */
    while ('>' != ch) {
        switch (ch) {
        case '\0':
            raise_error("invalid format, document not terminated", pi->str, pi->s);
            /* no break here, matching the historical control flow */
        case '/':
            /* self-closing element: no children */
            pi->s++;
            if ('>' != *pi->s) {
                raise_error("invalid format, element not closed", pi->str, pi->s);
            }
            pi->s++; /* past > */
            attr->name = 0;
            pi->pcb->add_element(pi, tag, attrs, 0);
            pi->pcb->end_element(pi, tag);
            return;
        default:
            /* an attribute: name, equals sign, value */
            attr->name = read_name_token(pi);
            name_end = pi->s;
            next_non_white(pi);
            if ('=' != *pi->s++) {
                raise_error("invalid format, no attribute value", pi->str, pi->s);
            }
            *name_end = '\0'; /* terminate the attribute name */
            next_non_white(pi);
            attr->value = read_quoted_value(pi);
            /* expand entity references inside the value, in place */
            if (0 != strchr(attr->value, '&') && 0 != collapse_special(pi, (char*)attr->value)) {
                raise_error("invalid format, special character does not end with a semicolon", pi->str, pi->s);
            }
            attr++;
            if (MAX_ATTRS <= (attr - attrs)) {
                raise_error("too many attributes", pi->str, pi->s);
            }
            break;
        }
        next_non_white(pi);
        ch = *pi->s;
    }

    /* Start tag closed with >, so children or a text value follow. */
    pi->s++;
    attr->name = 0;
    pi->pcb->add_element(pi, tag, attrs, 1);

    for (;;) {
        text_start = pi->s;
        next_non_white(pi);
        ch = *pi->s++;
        if ('\0' == ch) {
            raise_error("invalid format, document not terminated", pi->str, pi->s);
        }
        if ('<' != ch) {
            /* Text: rewind so that leading white space is part of it. */
            pi->s = text_start;
            read_text(pi);
            /* read_text stops on <; check for the matching end tag right away */
            if ('/' == pi->s[1] && 0 == strncmp(tag, pi->s + 2, tag_len) && '>' == pi->s[tag_len + 2]) {
                pi->s += tag_len + 3;
                pi->pcb->end_element(pi, tag);
                return;
            }
            continue;
        }
        if ('!' == *pi->s) {
            /* comment or CDATA section */
            pi->s++;
            if ('-' == pi->s[0] && '-' == pi->s[1]) {
                pi->s += 2;
                read_comment(pi);
            } else if (0 == strncmp("[CDATA[", pi->s, 7)) {
                pi->s += 7;
                read_cdata(pi);
            } else {
                raise_error("invalid format, invalid comment or CDATA format", pi->str, pi->s);
            }
            continue;
        }
        if ('/' == *pi->s) {
            /* end tag: its name has to match this element */
            pi->s++;
            close_name = read_name_token(pi);
            name_end = pi->s;
            next_non_white(pi);
            ch = *pi->s;
            *name_end = '\0';
            if (0 != strcmp(close_name, tag)) {
                raise_error("invalid format, elements overlap", pi->str, pi->s);
            }
            if ('>' != ch) {
                raise_error("invalid format, element not closed", pi->str, pi->s);
            }
            pi->s++;
            pi->pcb->end_element(pi, tag);
            return;
        }
        if ('\0' == *pi->s) {
            raise_error("invalid format, document not terminated", pi->str, pi->s);
        }
        /* anything else starts a child element */
        read_element(pi);
    }
}

static void
read_text(PInfo pi) { char buf[MAX_TEXT_LEN]; char *b = buf; char *alloc_buf = 0; char *end = b + sizeof(buf) - 2; char c; int done = 0; while (!done) { c = *pi->s++; switch(c) { case '<': done = 1; pi->s--; break; case '\0': raise_error("invalid format, document not terminated", pi->str, pi->s); default: if (end <= (b + (('&' == c) ? 7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */ unsigned long size; if (0 == alloc_buf) { size = sizeof(buf) * 2; alloc_buf = ALLOC_N(char, size); memcpy(alloc_buf, buf, b - buf); b = alloc_buf + (b - buf); } else { unsigned long pos = b - alloc_buf; size = (end - alloc_buf) * 2; REALLOC_N(alloc_buf, char, size); b = alloc_buf + pos; } end = alloc_buf + size - 2; } if ('&' == c) { b = read_coded_chars(pi, b); } else { *b++ = c; } break; } } *b = '\0'; if (0 != alloc_buf) { pi->pcb->add_text(pi, alloc_buf, ('/' == *(pi->s + 1))); xfree(alloc_buf); } else { pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1))); } } #if 0 static void read_reduced_text(PInfo pi) { char buf[MAX_TEXT_LEN]; char *b = buf; char *alloc_buf = 0; char *end = b + sizeof(buf) - 2; char c; int spc = 0; int done = 0; while (!done) { c = *pi->s++; switch(c) { case ' ': case '\t': case '\f': case '\n': case '\r': spc = 1; break; case '<': done = 1; pi->s--; break; case '\0': raise_error("invalid format, document not terminated", pi->str, pi->s); default: if (end <= (b + spc + (('&' == c) ? 
7 : 0))) { /* extra 8 for special just in case it is sequence of bytes */ unsigned long size; if (0 == alloc_buf) { size = sizeof(buf) * 2; alloc_buf = ALLOC_N(char, size); memcpy(alloc_buf, buf, b - buf); b = alloc_buf + (b - buf); } else { unsigned long pos = b - alloc_buf; size = (end - alloc_buf) * 2; REALLOC(alloc_buf, char, size); b = alloc_buf + pos; } end = alloc_buf + size - 2; } if (spc) { *b++ = ' '; } spc = 0; if ('&' == c) { b = read_coded_chars(pi, b); } else { *b++ = c; } break; } } *b = '\0'; if (0 != alloc_buf) { pi->pcb->add_text(pi, alloc_buf, ('/' == *(pi->s + 1))); xfree(alloc_buf); } else { pi->pcb->add_text(pi, buf, ('/' == *(pi->s + 1))); } } #endif static char* read_name_token(PInfo pi) { char *start; next_non_white(pi); start = pi->s; for (; 1; pi->s++) { switch (*pi->s) { case ' ': case '\t': case '\f': case '?': case '=': case '/': case '>': case '\n': case '\r': return start; case '\0': /* documents never terminate after a name token */ raise_error("invalid format, document not terminated", pi->str, pi->s); break; /* to avoid warnings */ default: break; } } return start; } static void read_cdata(PInfo pi) { char *start; char *end; start = pi->s; end = strstr(pi->s, "]]>"); if (end == 0) { raise_error("invalid format, CDATA not terminated", pi->str, pi->s); } *end = '\0'; pi->s = end + 3; if (0 != pi->pcb->add_cdata) { pi->pcb->add_cdata(pi, start, end - start); } } inline static void next_non_token(PInfo pi) { for (; 1; pi->s++) { switch(*pi->s) { case ' ': case '\t': case '\f': case '\n': case '\r': case '/': case '>': return; default: break; } } } /* Assume the value starts immediately and goes until the quote character is * reached again. Do not read the character after the terminating quote. 
*/
/* Read an attribute value starting at pi->s and return a pointer to it. The
 * value is terminated in place with \0 and pi->s is left just past the
 * terminator.
 *
 * A double quoted value is always accepted. A single quoted value is accepted
 * unless the effort option is StrictEffort. Without any quote, strict effort
 * raises an error; otherwise the value runs up to the next white space.
 */
static char*
read_quoted_value(PInfo pi) {
    char *value = 0;

    if ('"' == *pi->s || ('\'' == *pi->s && StrictEffort != pi->options->effort)) {
        char term = *pi->s;

        pi->s++; /* skip quote character */
        value = pi->s;
        for (; *pi->s != term; pi->s++) {
            if ('\0' == *pi->s) {
                raise_error("invalid format, document not terminated", pi->str, pi->s);
            }
        }
        *pi->s = '\0'; /* terminate value */
        pi->s++;       /* move past quote */
    } else if (StrictEffort == pi->options->effort) {
        raise_error("invalid format, expected a quote character", pi->str, pi->s);
    } else {
        value = pi->s;
        next_white(pi);
        if ('\0' == *pi->s) {
            raise_error("invalid format, document not terminated", pi->str, pi->s);
        }
        *pi->s++ = '\0'; /* terminate value */
    }
    return value;
}

/* Parse hexadecimal digits from b up to a semicolon and store the value in
 * *up. Returns a pointer to the semicolon, or 0 (leaving *up untouched) when
 * any other character, including \0, is met first. Overflow is not detected.
 */
static char*
read_hex_uint64(char *b, uint64_t *up) {
    uint64_t u = 0;
    char     c;

    for (; ';' != *b; b++) {
        c = *b;
        if ('0' <= c && c <= '9') {
            u = (u << 4) | (uint64_t)(c - '0');
        } else if ('a' <= c && c <= 'f') {
            u = (u << 4) | (uint64_t)(c - 'a' + 10);
        } else if ('A' <= c && c <= 'F') {
            u = (u << 4) | (uint64_t)(c - 'A' + 10);
        } else {
            return 0;
        }
    }
    *up = u;

    return b;
}

/* Parse decimal digits from b up to a semicolon and store the value in *up.
 * Returns a pointer to the semicolon, or 0 (leaving *up untouched) when any
 * other character, including \0, is met first. Overflow is not detected.
 */
static char*
read_10_uint64(char *b, uint64_t *up) {
    uint64_t u = 0;
    char     c;

    for (; ';' != *b; b++) {
        c = *b;
        if ('0' <= c && c <= '9') {
            u = (u * 10) + (uint64_t)(c - '0');
        } else {
            return 0;
        }
    }
    *up = u;

    return b;
}

/*
u0000..u007F                00000000000000xxxxxxx  0xxxxxxx
u0080..u07FF                0000000000yyyyyxxxxxx  110yyyyy 10xxxxxx
u0800..uD7FF, uE000..uFFFF  00000zzzzyyyyyyxxxxxx  1110zzzz 10yyyyyy 10xxxxxx
u10000..u10FFFF             uuuzzzzzzyyyyyyxxxxxx  11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
*/
/* Write the code point u at text as UTF-8 and return the position after the
 * last byte written (one to four bytes for the ranges in the table above).
 * Any other value is assumed to hold already encoded bytes: they are copied
 * most significant first with leading zero bytes skipped, which is at most
 * eight bytes. No terminator is written.
 */
static char*
ucs_to_utf8_chars(char *text, uint64_t u) {
    int           reading = 0;
    int           i;
    unsigned char c;

    if (u <= 0x000000000000007FULL) {
        /* 0xxxxxxx */
        *text++ = (char)u;
    } else if (u <= 0x00000000000007FFULL) {
        /* 110yyyyy 10xxxxxx */
        *text++ = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6)));
        *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
    } else if (u <= 0x000000000000D7FFULL || (0x000000000000E000ULL <= u && u <= 0x000000000000FFFFULL)) {
        /* 1110zzzz 10yyyyyy 10xxxxxx */
        *text++ = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12)));
        *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
        *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
    } else if (0x0000000000010000ULL <= u && u <= 0x000000000010FFFFULL) {
        /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
        *text++ = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18)));
        *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12)));
        *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
        *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
    } else {
        /* assume it is UTF-8 encoded directly and not UCS */
        for (i = 56; 0 <= i; i -= 8) {
            c = (unsigned char)((u >> i) & 0x00000000000000FFULL);
            if (reading) {
                *text++ = (char)c;
            } else if ('\0' != c) {
                *text++ = (char)c;
                reading = 1;
            }
        }
    }
    return text;
}

/* Expand the entity or character reference whose & has just been consumed
 * (pi->s is on the character after it), writing the result at text.
 *
 * Recognised are decimal and hexadecimal character references and the names
 * nbsp, lt, gt, amp, quot and apos (compared without regard to case). On
 * success pi->s is moved past the semicolon. Anything else, including a
 * reference with no semicolon within the next 31 characters, is not a
 * reference: a literal & is written and pi->s is left where it was so the
 * caller copies the following characters as ordinary text.
 *
 * Fixes over the previous version:
 *  - the fallback wrote the character after the & instead of the & itself,
 *    which dropped the ampersand and duplicated the next character;
 *  - a missing semicolon was tested with a condition that could never hold,
 *    so an unterminated scratch buffer went on to be parsed;
 *  - the copy loop did not stop at the end of the document.
 *
 * Returns the position after the last byte written.
 */
static char*
read_coded_chars(PInfo pi, char *text) {
    char  buf[32];
    char *b = buf;
    char *end = buf + sizeof(buf) - 1;
    char *s = pi->s;
    int   terminated = 0;

    /* Copy up to and including the semicolon, never past the document end. */
    for (; b < end && '\0' != *s; b++, s++) {
        *b = *s;
        if (';' == *s) {
            *(b + 1) = '\0';
            s++;
            terminated = 1;
            break;
        }
    }
    if (!terminated) {
        *text++ = '&';
    } else if ('#' == *buf) {
        uint64_t u = 0;

        b = buf + 1;
        if ('x' == *b || 'X' == *b) {
            b = read_hex_uint64(b + 1, &u);
        } else {
            b = read_10_uint64(b, &u);
        }
        if (0 == b) {
            /* not a number after all: keep the ampersand as text */
            *text++ = '&';
        } else {
            pi->s = s;
            if (u <= 0x000000000000007FULL) {
                *text++ = (char)u;
            } else if (ox_utf8_encoding == pi->options->rb_enc) {
                text = ucs_to_utf8_chars(text, u);
            } else if (0 == pi->options->rb_enc) {
                /* no encoding chosen yet: settle on UTF-8 */
                pi->options->rb_enc = ox_utf8_encoding;
                text = ucs_to_utf8_chars(text, u);
            } else {
                /*raise_error("Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
                raise_error("Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
            }
        }
    } else if (0 == strcasecmp(buf, "nbsp;")) {
        pi->s = s;
        *text++ = ' ';
    } else if (0 == strcasecmp(buf, "lt;")) {
        pi->s = s;
        *text++ = '<';
    } else if (0 == strcasecmp(buf, "gt;")) {
        pi->s = s;
        *text++ = '>';
    } else if (0 == strcasecmp(buf, "amp;")) {
        pi->s = s;
        *text++ = '&';
    } else if (0 == strcasecmp(buf, "quot;")) {
        pi->s = s;
        *text++ = '"';
    } else if (0 == strcasecmp(buf, "apos;")) {
        pi->s = s;
        *text++ = '\'';
    } else {
        /* unknown name: keep the ampersand as text */
        *text++ = '&';
    }
    return text;
}

/* Expand entity and character references in str in place. The result is never
 * longer than the input, so the write position never passes the read one.
 *
 * Character references and lt, gt, amp, quot and apos are replaced by the
 * character they stand for; any other named reference becomes a question
 * mark. Returns 0 on success, or EDOM when a reference is malformed or is not
 * closed by a semicolon.
 *
 * Fix over the previous version: after an unknown named reference the read
 * position was advanced one character too far, which swallowed the character
 * following the semicolon and could step over the terminating \0. A lone &
 * at the very end of the string is now rejected instead of being read past.
 */
static int
collapse_special(PInfo pi, char *str) {
    char *s = str;
    char *b = str;

    while ('\0' != *s) {
        if ('&' == *s) {
            int   c;
            char *end;

            s++;
            if ('#' == *s) {
                uint64_t u = 0;

                s++;
                if ('x' == *s || 'X' == *s) {
                    s++;
                    end = read_hex_uint64(s, &u);
                } else {
                    end = read_10_uint64(s, &u);
                }
                if (0 == end) {
                    return EDOM;
                }
                if (u <= 0x000000000000007FULL) {
                    *b++ = (char)u;
                } else if (ox_utf8_encoding == pi->options->rb_enc) {
                    b = ucs_to_utf8_chars(b, u);
                    /* TBD support UTF-16 */
                } else if (0 == pi->options->rb_enc) {
                    /* no encoding chosen yet: settle on UTF-8 */
                    pi->options->rb_enc = ox_utf8_encoding;
                    b = ucs_to_utf8_chars(b, u);
                } else {
                    /* raise_error("Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);*/
                    raise_error("Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
                }
                s = end + 1; /* past the semicolon */
            } else {
                if (0 == strncasecmp(s, "lt;", 3)) {
                    c = '<';
                    s += 3;
                } else if (0 == strncasecmp(s, "gt;", 3)) {
                    c = '>';
                    s += 3;
                } else if (0 == strncasecmp(s, "amp;", 4)) {
                    c = '&';
                    s += 4;
                } else if (0 == strncasecmp(s, "quot;", 5)) {
                    c = '"';
                    s += 5;
                } else if (0 == strncasecmp(s, "apos;", 5)) {
                    c = '\'';
                    s += 5;
                } else {
                    /* unknown name: replace the whole reference with ? */
                    c = '?';
                    while (';' != *s) {
                        if ('\0' == *s) {
                            return EDOM;
                        }
                        s++;
                    }
                    s++; /* past the semicolon, and no further */
                }
                *b++ = (char)c;
            }
        } else {
            *b++ = *s++;
        }
    }
    *b = '\0';

    return 0;
}