/* sax.c * Copyright (c) 2011, Peter Ohler * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * - Neither the name of Peter Ohler nor the names of its contributors may be * used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #if NEEDS_UIO #include #endif #include #include #include "ruby.h" #include "ox.h" #define NAME_MISMATCH 1 typedef struct _SaxDrive { char base_buf[0x00010000]; char *buf; char *buf_end; char *cur; char *read_end; /* one past last character read */ char *str; /* start of current string being read */ int line; int col; VALUE handler; VALUE value_obj; int (*read_func)(struct _SaxDrive *dr); int convert_special; int tolerant; union { int fd; VALUE io; const char *in_str; }; int has_instruct; int has_end_instruct; int has_attr; int has_attr_value; int has_doctype; int has_comment; int has_cdata; int has_text; int has_value; int has_start_element; int has_end_element; int has_error; int has_line; int has_column; #if HAS_ENCODING_SUPPORT rb_encoding *encoding; #elif HAS_PRIVATE_ENCODING VALUE encoding; #endif } *SaxDrive; #ifdef NEEDS_STPCPY char *stpncpy(char *dest, const char *src, size_t n); #endif static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert, int tolerant); static void sax_drive_cleanup(SaxDrive dr); static int sax_drive_read(SaxDrive dr); static void sax_drive_error(SaxDrive dr, const char *msg, int critical); static int read_children(SaxDrive dr, int first); static int read_instruction(SaxDrive dr); static int read_doctype(SaxDrive dr); static int read_cdata(SaxDrive dr); static int read_comment(SaxDrive dr); static int read_element(SaxDrive dr); static int read_text(SaxDrive dr); static const char* read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml); static char read_name_token(SaxDrive dr); static int read_quoted_value(SaxDrive dr, char *last); static int collapse_special(char *str, int tolerant); static VALUE rescue_cb(VALUE rdr, VALUE err); static VALUE io_cb(VALUE rdr); static VALUE partial_io_cb(VALUE rdr); static int read_from_io(SaxDrive dr); #ifndef JRUBY_RUBY static int read_from_fd(SaxDrive dr); #endif static int read_from_io_partial(SaxDrive dr); static int read_from_str(SaxDrive dr); static VALUE sax_value_class; /* This is only for CentOS 5.4 with Ruby 1.9.3-p0 and for OS X 10.6 and Solaris 10. */ #ifdef NEEDS_STPCPY char *stpncpy(char *dest, const char *src, size_t n) { size_t cnt = strlen(src) + 1; if (n < cnt) { cnt = n; } strncpy(dest, src, cnt); return dest + cnt - 1; } #endif static inline char sax_drive_get(SaxDrive dr) { if (dr->read_end <= dr->cur) { if (0 != sax_drive_read(dr)) { return 0; } } if ('\n' == *dr->cur) { dr->line++; dr->col = 0; } dr->col++; return *dr->cur++; } static inline void backup(SaxDrive dr) { dr->cur--; dr->col--; // should reverse wrap but not worth it } static inline void reset_reader(SaxDrive dr, char *cur, int line, int col) { dr->cur = cur; dr->line = line; dr->col = col; } /* Starts by reading a character so it is safe to use with an empty or * compacted buffer. */ inline static char next_non_white(SaxDrive dr) { char c; while ('\0' != (c = sax_drive_get(dr))) { switch(c) { case ' ': case '\t': case '\f': case '\n': case '\r': break; default: return c; } } return '\0'; } /* Starts by reading a character so it is safe to use with an empty or * compacted buffer. */ inline static char next_white(SaxDrive dr) { char c; while ('\0' != (c = sax_drive_get(dr))) { switch(c) { case ' ': case '\t': case '\f': case '\n': case '\r': case '\0': return c; default: break; } } return '\0'; } inline static int is_white(char c) { switch(c) { case ' ': case '\t': case '\f': case '\n': case '\r': return 1; default: break; } return 0; } inline static VALUE str2sym(const char *str, SaxDrive dr, char **strp) { VALUE *slot; VALUE sym; if (Qundef == (sym = ox_cache_get(ox_symbol_cache, str, &slot, strp))) { #if HAS_ENCODING_SUPPORT if (0 != dr->encoding) { VALUE rstr = rb_str_new2(str); rb_enc_associate(rstr, dr->encoding); sym = rb_funcall(rstr, ox_to_sym_id, 0); } else { sym = ID2SYM(rb_intern(str)); } #elif HAS_PRIVATE_ENCODING if (Qnil != dr->encoding) { VALUE rstr = rb_str_new2(str); rb_funcall(rstr, ox_force_encoding_id, 1, dr->encoding); sym = rb_funcall(rstr, ox_to_sym_id, 0); } else { sym = ID2SYM(rb_intern(str)); } #else sym = ID2SYM(rb_intern(str)); #endif *slot = sym; } return sym; } void ox_sax_parse(VALUE handler, VALUE io, int convert, int tolerant) { struct _SaxDrive dr; sax_drive_init(&dr, handler, io, convert, tolerant); #if 0 printf("*** sax_parse with these flags\n"); printf(" has_instruct = %s\n", dr.has_instruct ? "true" : "false"); printf(" has_end_instruct = %s\n", dr.has_end_instruct ? "true" : "false"); printf(" has_attr = %s\n", dr.has_attr ? "true" : "false"); printf(" has_attr_value = %s\n", dr.has_attr_value ? "true" : "false"); printf(" has_doctype = %s\n", dr.has_doctype ? "true" : "false"); printf(" has_comment = %s\n", dr.has_comment ? "true" : "false"); printf(" has_cdata = %s\n", dr.has_cdata ? "true" : "false"); printf(" has_text = %s\n", dr.has_text ? "true" : "false"); printf(" has_value = %s\n", dr.has_value ? "true" : "false"); printf(" has_start_element = %s\n", dr.has_start_element ? "true" : "false"); printf(" has_end_element = %s\n", dr.has_end_element ? "true" : "false"); printf(" has_error = %s\n", dr.has_error ? "true" : "false"); printf(" has_line = %s\n", dr.has_line ? "true" : "false"); printf(" has_column = %s\n", dr.has_column ? "true" : "false"); #endif read_children(&dr, 1); sax_drive_cleanup(&dr); } inline static int respond_to(VALUE obj, ID method) { #ifdef JRUBY_RUBY /* There is a bug in JRuby where rb_respond_to() returns true (1) even if * a method is private. */ { VALUE args[1]; *args = ID2SYM(method); return (Qtrue == rb_funcall2(obj, rb_intern("respond_to?"), 1, args)); } #else return rb_respond_to(obj, method); #endif } static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, int convert, int tolerant) { if (ox_stringio_class == rb_obj_class(io)) { VALUE s = rb_funcall2(io, ox_string_id, 0, 0); dr->read_func = read_from_str; dr->in_str = StringValuePtr(s); } else if (rb_respond_to(io, ox_readpartial_id)) { #ifdef JRUBY_RUBY dr->read_func = read_from_io_partial; dr->io = io; #else VALUE rfd; if (rb_respond_to(io, ox_fileno_id) && Qnil != (rfd = rb_funcall(io, ox_fileno_id, 0))) { dr->read_func = read_from_fd; dr->fd = FIX2INT(rfd); } else { dr->read_func = read_from_io_partial; dr->io = io; } #endif } else if (rb_respond_to(io, ox_read_id)) { #ifdef JRUBY_RUBY dr->read_func = read_from_io; dr->io = io; #else VALUE rfd; if (rb_respond_to(io, ox_fileno_id) && Qnil != (rfd = rb_funcall(io, ox_fileno_id, 0))) { dr->read_func = read_from_fd; dr->fd = FIX2INT(rfd); } else { dr->read_func = read_from_io; dr->io = io; } #endif } else { rb_raise(ox_arg_error_class, "sax_parser io argument must respond to readpartial() or read().\n"); } dr->buf = dr->base_buf; *dr->buf = '\0'; dr->buf_end = dr->buf + sizeof(dr->base_buf) - 1; /* 1 less to make debugging easier */ dr->cur = dr->buf; dr->read_end = dr->buf; dr->str = 0; dr->line = 1; dr->col = 0; dr->handler = handler; dr->value_obj = rb_data_object_alloc(sax_value_class, dr, 0, 0); rb_gc_register_address(&dr->value_obj); dr->convert_special = convert; dr->tolerant = tolerant; dr->has_instruct = respond_to(handler, ox_instruct_id); dr->has_end_instruct = respond_to(handler, ox_end_instruct_id); dr->has_attr = respond_to(handler, ox_attr_id); dr->has_attr_value = respond_to(handler, ox_attr_value_id); dr->has_doctype = respond_to(handler, ox_doctype_id); dr->has_comment = respond_to(handler, ox_comment_id); dr->has_cdata = respond_to(handler, ox_cdata_id); dr->has_text = respond_to(handler, ox_text_id); dr->has_value = respond_to(handler, ox_value_id); dr->has_start_element = respond_to(handler, ox_start_element_id); dr->has_end_element = respond_to(handler, ox_end_element_id); dr->has_error = respond_to(handler, ox_error_id); dr->has_line = (Qtrue == rb_ivar_defined(handler, ox_at_line_id)); dr->has_column = (Qtrue == rb_ivar_defined(handler, ox_at_column_id)); #if HAS_ENCODING_SUPPORT if ('\0' == *ox_default_options.encoding) { VALUE encoding; if (rb_respond_to(io, ox_external_encoding_id) && Qnil != (encoding = rb_funcall(io, ox_external_encoding_id, 0))) { dr->encoding = rb_enc_from_index(rb_enc_get_index(encoding)); } else { dr->encoding = 0; } } else { dr->encoding = rb_enc_find(ox_default_options.encoding); } #elif HAS_PRIVATE_ENCODING if ('\0' == *ox_default_options.encoding) { VALUE encoding; if (rb_respond_to(io, ox_external_encoding_id) && Qnil != (encoding = rb_funcall(io, ox_external_encoding_id, 0))) { dr->encoding = encoding; } else { dr->encoding = Qnil; } } else { dr->encoding = rb_str_new2(ox_default_options.encoding); } #endif } static void sax_drive_cleanup(SaxDrive dr) { rb_gc_unregister_address(&dr->value_obj); if (dr->base_buf != dr->buf) { xfree(dr->buf); } } static int sax_drive_read(SaxDrive dr) { int err; size_t shift = 0; if (dr->buf < dr->cur) { if (0 == dr->str) { shift = dr->cur - dr->buf; } else { shift = dr->str - dr->buf; } /*printf("\n*** shift: %lu\n", shift); */ if (0 == shift) { /* no space left so allocate more */ char *old = dr->buf; size_t size = dr->buf_end - dr->buf; if (dr->buf == dr->base_buf) { dr->buf = ALLOC_N(char, size * 2); memcpy(dr->buf, old, size); } else { REALLOC_N(dr->buf, char, size * 2); } dr->buf_end = dr->buf + size * 2; dr->cur = dr->buf + (dr->cur - old); dr->read_end = dr->buf + (dr->read_end - old); if (0 != dr->str) { dr->str = dr->buf + (dr->str - old); } } else { memmove(dr->buf, dr->buf + shift, dr->read_end - (dr->buf + shift)); dr->cur -= shift; dr->read_end -= shift; if (0 != dr->str) { dr->str -= shift; } } } err = dr->read_func(dr); *dr->read_end = '\0'; return err; } static void sax_drive_error(SaxDrive dr, const char *msg, int critical) { if (dr->has_error) { VALUE args[3]; args[0] = rb_str_new2(msg); args[1] = INT2FIX(dr->line); args[2] = INT2FIX(dr->col); if (dr->has_line) { rb_ivar_set(dr->handler, ox_at_line_id, args[1]); } if (dr->has_column) { rb_ivar_set(dr->handler, ox_at_column_id, args[2]); } rb_funcall2(dr->handler, ox_error_id, 3, args); } else if (critical) { sax_drive_cleanup(dr); rb_raise(ox_parse_error_class, "%s at line %d, column %d\n", msg, dr->line, dr->col); } } static int read_children(SaxDrive dr, int first) { int err = 0; int element_read = !first; char c; int line; int col; while (!err) { dr->str = dr->cur; /* protect the start */ c = sax_drive_get(dr); if (first) { if (0xEF == (uint8_t)c) { /* only UTF8 is supported */ if (0xBB == (uint8_t)sax_drive_get(dr) && 0xBF == (uint8_t)sax_drive_get(dr)) { #if HAS_ENCODING_SUPPORT dr->encoding = ox_utf8_encoding; #elif HAS_PRIVATE_ENCODING dr->encoding = ox_utf8_encoding; #endif c = sax_drive_get(dr); } else { sax_drive_error(dr, "invalid format, invalid BOM or a binary file.", 1); } } } if ('\0' == c || (is_white(c) && '\0' == (c = next_non_white(dr)))) { if (!first) { sax_drive_error(dr, "invalid format, element not terminated", 1); err = -1; } break; /* normal completion if first */ } if ('<' != c) { if (first) { /* all top level entities start with < */ sax_drive_error(dr, "invalid format, expected <", 1); break; /* unrecoverable */ } if (0 != (err = read_text(dr))) { /* finished when < is reached */ break; } } dr->str = dr->cur; /* protect the start for elements */ c = sax_drive_get(dr); switch (c) { case '?': /* instructions (xml or otherwise) */ err = read_instruction(dr); break; case '!': /* comment or doctype */ dr->str = dr->cur; c = sax_drive_get(dr); if ('\0' == c) { sax_drive_error(dr, "invalid format, DOCTYPE or comment not terminated", 1); err = 1; } else if ('-' == c) { c = sax_drive_get(dr); /* skip first - and get next character */ if ('-' != c) { sax_drive_error(dr, "invalid format, bad comment format", 1); err = 1; } else { c = sax_drive_get(dr); /* skip second - */ err = read_comment(dr); } } else { int i; for (i = 7; 0 < i; i--) { sax_drive_get(dr); } if ((dr->tolerant) ? 0 == strncasecmp("DOCTYPE", dr->str, 7) : 0 == strncmp("DOCTYPE", dr->str, 7)) { if (element_read || !first) { sax_drive_error(dr, "invalid format, DOCTYPE can not come after an element", 0); } err = read_doctype(dr); } else if ((dr->tolerant) ? 0 == strncasecmp("[CDATA[", dr->str, 7) : 0 == strncmp("[CDATA[", dr->str, 7)) { err = read_cdata(dr); } else { sax_drive_error(dr, "invalid format, DOCTYPE or comment expected", 1); err = 1; } } break; case '/': /* element end */ line = dr->line; col = dr->col; err = ('\0' == read_name_token(dr)); dr->line = line; dr->col = col; if (first && dr->tolerant) { sax_drive_error(dr, "invalid format, unmatched element end", 0); } else { return err; } break; case '\0': sax_drive_error(dr, "invalid format, document not terminated", 1); err = 1; break; default: backup(dr); /* safe since no read occurred after getting last character */ if (first && element_read && !dr->tolerant) { sax_drive_error(dr, "invalid format, multiple top level elements", 0); } err = read_element(dr); if (NAME_MISMATCH == err && dr->tolerant && first) { // must have been a end element with no matching start err = 0; } element_read = 1; break; } } return err; } static void read_content(SaxDrive dr, char *content, size_t len) { char c; char *end = content + len; while ('\0' != (c = sax_drive_get(dr))) { if (end < content) { sax_drive_error(dr, "processing instruction content too large", 1); } if ('?' == c) { if ('\0' == (c = sax_drive_get(dr))) { sax_drive_error(dr, "invalid format, document not terminated", 1); } if ('>' == c) { *content = '\0'; return; } else { *content++ = c; } } else { *content++ = c; } } *content = '\0'; } /* Entered after the "line; int col = dr->col - 1; if ('\0' == (c = read_name_token(dr))) { return -1; } is_xml = (0 == strcmp("xml", dr->str)); if (dr->has_instruct || dr->has_end_instruct) { target = rb_str_new2(dr->str); } if (dr->has_instruct) { VALUE args[1]; if (dr->has_line) { rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line)); } if (dr->has_column) { rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col)); } args[0] = target; rb_funcall2(dr->handler, ox_instruct_id, 1, args); } dr->str = dr->cur; /* make sure the start doesn't get compacted out */ line = dr->line; col = dr->col; read_content(dr, content, sizeof(content) - 1); cend = dr->cur; reset_reader(dr, dr->str, line, col); if (0 != (err = read_attrs(dr, c, '?', '?', is_xml))) { if (dr->has_text) { VALUE args[1]; if (dr->convert_special) { if (0 != collapse_special(content, dr->tolerant)) { sax_drive_error(dr, "invalid format, special character does not end with a semicolon", 0); } } args[0] = rb_str_new2(content); #if HAS_ENCODING_SUPPORT if (0 != dr->encoding) { rb_enc_associate(args[0], dr->encoding); } #elif HAS_PRIVATE_ENCODING if (Qnil != dr->encoding) { rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding); } #endif if (dr->has_line) { rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line)); } if (dr->has_column) { rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col)); } rb_funcall2(dr->handler, ox_text_id, 1, args); } dr->cur = cend; } else { line = dr->line; col = dr->col; c = next_non_white(dr); if ('>' != c) { sax_drive_error(dr, "invalid format, instruction not terminated", 1); return -1; } } if (dr->has_end_instruct) { VALUE args[1]; if (dr->has_line) { rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line)); } if (dr->has_column) { rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col)); } args[0] = target; rb_funcall2(dr->handler, ox_end_instruct_id, 1, args); } dr->str = 0; return 0; } /* Entered after the "line; int col = dr->col - 10; dr->str = dr->cur - 1; /* mark the start */ while ('>' != (c = sax_drive_get(dr))) { if ('\0' == c) { sax_drive_error(dr, "invalid format, doctype terminated unexpectedly", 1); return -1; } } *(dr->cur - 1) = '\0'; if (dr->has_doctype) { VALUE args[1]; if (dr->has_line) { rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line)); } if (dr->has_column) { rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col)); } args[0] = rb_str_new2(dr->str); rb_funcall2(dr->handler, ox_doctype_id, 1, args); } dr->str = 0; return 0; } /* Entered after the "line; int col = dr->col - 10; backup(dr); /* back up to the start in case the cdata is empty */ dr->str = dr->cur; /* mark the start */ while (1) { c = sax_drive_get(dr); if (']' == c) { end++; } else if ('>' == c) { if (2 <= end) { *(dr->cur - 3) = '\0'; break; } end = 0; } else if ('\0' == c) { sax_drive_error(dr, "invalid format, cdata terminated unexpectedly", 1); return -1; } else { end = 0; } } if (dr->has_cdata) { VALUE args[1]; args[0] = rb_str_new2(dr->str); #if HAS_ENCODING_SUPPORT if (0 != dr->encoding) { rb_enc_associate(args[0], dr->encoding); } #elif HAS_PRIVATE_ENCODING if (Qnil != dr->encoding) { rb_funcall(args[0], ox_force_encoding_id, 1, dr->encoding); } #endif if (dr->has_line) { rb_ivar_set(dr->handler, ox_at_line_id, INT2FIX(line)); } if (dr->has_column) { rb_ivar_set(dr->handler, ox_at_column_id, INT2FIX(col)); } rb_funcall2(dr->handler, ox_cdata_id, 1, args); } dr->str = 0; return 0; } /* Entered after the "