/* sax.c
 * Copyright (c) 2011, Peter Ohler
 * All rights reserved.
 */

/* NOTE(review): the bare #include directives below carry no header name in
 * this copy of the file (the <...> names appear to have been lost). Restore
 * them from version control; they are left untouched here rather than guessed.
 */
#include
#include
#include
#include
#include
#include
#if HAVE_SYS_UIO_H
#include
#endif
#include
#include

#include "intern.h"
#include "ox.h"
#include "ruby.h"
#include "ruby/encoding.h"
#include "sax.h"
#include "sax_buf.h"
#include "sax_stack.h"
#include "special.h"

/* NOTE(review): not referenced in the part of the file reviewed here. */
#define NAME_MISMATCH 1

/* Values for the local `state` variable of parse(), which starts out as
 * START_STATE. BODY_STATE and AFTER_STATE are presumably the later phases of
 * the document -- confirm against the body of parse(). */
#define START_STATE 1
#define BODY_STATE 2
#define AFTER_STATE 3

// error prefixes
/* Each prefix is joined to a message by string-literal concatenation at the
 * call site, e.g. ox_sax_drive_error(dr, BAD_BOM "invalid BOM or a binary file."). */
#define BAD_BOM "Bad BOM: "
#define NO_TERM "Not Terminated: "
#define INVALID_FORMAT "Invalid Format: "
#define CASE_ERROR "Case Error: "
#define OUT_OF_ORDER "Out of Order: "
#define WRONG_CHAR "Unexpected Character: "
#define EL_MISMATCH "Start End Mismatch: "
#define INV_ELEMENT "Invalid Element: "

#define UTF8_STR "UTF-8"

/* Forward declarations for static functions defined further down in this file. */
static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options);
static void parse(SaxDrive dr);
// All read functions should return the next character after the 'thing' that was read and leave dr->cur one after that.
static char read_instruction(SaxDrive dr); static char read_doctype(SaxDrive dr); static char read_cdata(SaxDrive dr); static char read_comment(SaxDrive dr); static char read_element_start(SaxDrive dr); static char read_element_end(SaxDrive dr); static char read_text(SaxDrive dr); static char read_jump(SaxDrive dr, const char *pat); static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req, Hint h); static char read_name_token(SaxDrive dr); static char read_quoted_value(SaxDrive dr, bool inst); static void hint_clear_empty(SaxDrive dr); static Nv hint_try_close(SaxDrive dr, const char *name); VALUE ox_sax_value_class = Qnil; const rb_data_type_t ox_sax_value_type = { "Ox/Sax/Value", { NULL, NULL, NULL, }, 0, 0, }; static VALUE protect_parse(VALUE drp) { parse((SaxDrive)drp); return Qnil; } VALUE str2sym(SaxDrive dr, const char *str, size_t len, const char **strp) { VALUE sym; if (dr->options.symbolize) { sym = ox_sym_intern(str, len, strp); } else { sym = dr->get_name(str, len, dr->encoding, strp); } return sym; } void ox_sax_parse(VALUE handler, VALUE io, SaxOptions options) { #if HAVE_RB_EXT_RACTOR_SAFE rb_ext_ractor_safe(true); #endif struct _saxDrive dr; int line = 0; sax_drive_init(&dr, handler, io, options); rb_protect(protect_parse, (VALUE)&dr, &line); ox_sax_drive_cleanup(&dr); if (0 != line) { rb_jump_tag(line); } } static void set_long_noop(VALUE handler, long pos) { } static void set_pos(VALUE handler, long pos) { rb_ivar_set(handler, ox_at_pos_id, LONG2NUM(pos)); } static void set_line(VALUE handler, long line) { rb_ivar_set(handler, ox_at_line_id, LONG2NUM(line)); } static void set_col(VALUE handler, long col) { rb_ivar_set(handler, ox_at_column_id, LONG2NUM(col)); } static void attr_noop(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) { } static void attr_text(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) { VALUE args[2]; args[0] = name; if (dr->options.convert_special && 
'\0' != value[0]) { ox_sax_collapse_special(dr, value, pos, line, col); } args[1] = rb_str_new2(value); if (0 != dr->encoding) { rb_enc_associate(args[1], dr->encoding); } dr->set_pos(dr->handler, pos); dr->set_line(dr->handler, line); dr->set_col(dr->handler, col); rb_funcall2(dr->handler, ox_attr_id, 2, args); } static void attr_value(SaxDrive dr, VALUE name, char *value, long pos, long line, long col) { VALUE args[2]; dr->set_pos(dr->handler, pos); dr->set_line(dr->handler, line); dr->set_col(dr->handler, col); args[0] = name; args[1] = dr->value_obj; rb_funcall2(dr->handler, ox_attr_value_id, 2, args); } static void attrs_done_noop(VALUE handler) { } static void attrs_done(VALUE handler) { rb_funcall(handler, ox_attrs_done_id, 0); } static VALUE instruct_noop(SaxDrive dr, const char *target, long pos, long line, long col) { return Qnil; } static VALUE instruct(SaxDrive dr, const char *target, long pos, long line, long col) { VALUE arg = rb_str_new2(target); dr->set_pos(dr->handler, pos); dr->set_line(dr->handler, line); dr->set_col(dr->handler, col); rb_funcall(dr->handler, ox_instruct_id, 1, arg); return arg; } static VALUE instruct_just_value(SaxDrive dr, const char *target, long pos, long line, long col) { return rb_str_new2(target); } static void end_instruct_noop(SaxDrive dr, VALUE target, long pos, long line, long col) { } static void end_instruct(SaxDrive dr, VALUE target, long pos, long line, long col) { dr->set_pos(dr->handler, pos); dr->set_line(dr->handler, line); dr->set_col(dr->handler, col); rb_funcall(dr->handler, ox_end_instruct_id, 1, target); } static void dr_loc_noop(SaxDrive dr, long pos, long line, long col) { } static void comment(SaxDrive dr, long pos, long line, long col) { if (!dr->blocked) { Nv parent = stack_peek(&dr->stack); Hint h = ox_hint_find(dr->options.hints, "!--"); if (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay || (NULL != h && (ActiveOverlay == h->overlay || ActiveOverlay == h->overlay))) { 
VALUE arg = rb_str_new2(dr->buf.str); if (0 != dr->encoding) { rb_enc_associate(arg, dr->encoding); } dr->set_pos(dr->handler, pos); dr->set_line(dr->handler, line); dr->set_col(dr->handler, col); rb_funcall(dr->handler, ox_comment_id, 1, arg); } } } static void cdata(SaxDrive dr, long pos, long line, long col) { Nv parent = stack_peek(&dr->stack); if (!dr->blocked && (NULL == parent || NULL == parent->hint || OffOverlay != parent->hint->overlay)) { VALUE arg = rb_str_new2(dr->buf.str); if (0 != dr->encoding) { rb_enc_associate(arg, dr->encoding); } dr->set_pos(dr->handler, pos); dr->set_line(dr->handler, line); dr->set_col(dr->handler, col); rb_funcall(dr->handler, ox_cdata_id, 1, arg); } } static void doctype(SaxDrive dr, long pos, long line, long col) { dr->set_pos(dr->handler, pos); dr->set_line(dr->handler, line); dr->set_col(dr->handler, col); rb_funcall(dr->handler, ox_doctype_id, 1, rb_str_new2(dr->buf.str)); } static void error_noop(SaxDrive dr, const char *msg, long pos, long line, long col) { } static void error(SaxDrive dr, const char *msg, long pos, long line, long col) { VALUE args[3]; args[0] = rb_str_new2(msg); args[1] = LONG2NUM(line); args[2] = LONG2NUM(col); dr->set_pos(dr->handler, pos); dr->set_line(dr->handler, line); dr->set_col(dr->handler, col); rb_funcall2(dr->handler, ox_error_id, 3, args); } static void end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h) { if (dr->has_end_element && 0 >= dr->blocked && (NULL == h || ActiveOverlay == h->overlay || NestOverlay == h->overlay)) { dr->set_pos(dr->handler, pos); dr->set_line(dr->handler, line); dr->set_col(dr->handler, col); rb_funcall(dr->handler, ox_end_element_id, 1, name); } if (NULL != h && BlockOverlay == h->overlay && 0 < dr->blocked) { dr->blocked--; } } static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) { ox_sax_buf_init(&dr->buf, io); dr->buf.dr = dr; stack_init(&dr->stack); dr->handler = handler; dr->value_obj = 
TypedData_Wrap_Struct(ox_sax_value_class, &ox_sax_value_type, dr); rb_gc_register_address(&dr->value_obj); dr->options = *options; dr->err = 0; dr->blocked = 0; dr->abort = false; dr->set_pos = (Qtrue == rb_ivar_defined(handler, ox_at_pos_id)) ? set_pos : set_long_noop; dr->set_line = (Qtrue == rb_ivar_defined(handler, ox_at_line_id)) ? set_line : set_long_noop; dr->set_col = (Qtrue == rb_ivar_defined(handler, ox_at_column_id)) ? set_col : set_long_noop; if (rb_respond_to(handler, ox_attr_value_id)) { dr->attr_cb = attr_value; dr->want_attr_name = true; } else if (rb_respond_to(handler, ox_attr_id)) { dr->attr_cb = attr_text; dr->want_attr_name = true; } else { dr->attr_cb = attr_noop; dr->want_attr_name = false; } dr->attrs_done = rb_respond_to(handler, ox_attrs_done_id) ? attrs_done : attrs_done_noop; dr->instruct = rb_respond_to(handler, ox_instruct_id) ? instruct : instruct_noop; dr->end_instruct = rb_respond_to(handler, ox_end_instruct_id) ? end_instruct : end_instruct_noop; if (rb_respond_to(handler, ox_end_instruct_id) && !rb_respond_to(handler, ox_instruct_id)) { dr->instruct = instruct_just_value; } dr->doctype = rb_respond_to(handler, ox_doctype_id) ? doctype : dr_loc_noop; dr->comment = rb_respond_to(handler, ox_comment_id) ? comment : dr_loc_noop; dr->cdata = rb_respond_to(handler, ox_cdata_id) ? cdata : dr_loc_noop; dr->error = rb_respond_to(handler, ox_error_id) ? 
error : error_noop; dr->has_text = rb_respond_to(handler, ox_text_id); dr->has_value = rb_respond_to(handler, ox_value_id); dr->has_start_element = rb_respond_to(handler, ox_start_element_id); dr->has_end_element = rb_respond_to(handler, ox_end_element_id); if ('\0' == *ox_default_options.encoding) { VALUE encoding; dr->encoding = 0; if (rb_respond_to(io, ox_external_encoding_id) && Qnil != (encoding = rb_funcall(io, ox_external_encoding_id, 0))) { int e = rb_enc_get_index(encoding); if (0 <= e) { dr->encoding = rb_enc_from_index(e); } } } else { dr->encoding = rb_enc_find(ox_default_options.encoding); } dr->utf8 = (NULL == dr->encoding || rb_utf8_encoding() == dr->encoding); if (NULL == dr->encoding || rb_utf8_encoding() == dr->encoding) { // UTF-8 dr->get_name = dr->options.symbolize ? ox_utf8_sym : ox_utf8_name; // TBD UTF8 sym? } else { dr->get_name = dr->options.symbolize ? ox_enc_sym : ox_enc_name; } } void ox_sax_drive_cleanup(SaxDrive dr) { rb_gc_unregister_address(&dr->value_obj); buf_cleanup(&dr->buf); stack_cleanup(&dr->stack); } static void ox_sax_drive_error_at(SaxDrive dr, const char *msg, off_t pos, off_t line, off_t col) { dr->error(dr, msg, pos, line, col); } void ox_sax_drive_error(SaxDrive dr, const char *msg) { ox_sax_drive_error_at(dr, msg, dr->buf.pos, dr->buf.line, dr->buf.col); } static char skipBOM(SaxDrive dr) { char c = buf_get(&dr->buf); if (0xEF == (uint8_t)c) { /* only UTF8 is supported */ if (0xBB == (uint8_t)buf_get(&dr->buf) && 0xBF == (uint8_t)buf_get(&dr->buf)) { dr->encoding = ox_utf8_encoding; c = buf_get(&dr->buf); } else { ox_sax_drive_error(dr, BAD_BOM "invalid BOM or a binary file."); c = '\0'; } } return c; } static void parse(SaxDrive dr) { char c = skipBOM(dr); int state = START_STATE; Nv parent; while ('\0' != c) { buf_protect(&dr->buf); if ('<' == c) { c = buf_get(&dr->buf); switch (c) { case '?': /* instructions (xml or otherwise) */ c = read_instruction(dr); break; case '!': /* comment or doctype */ 
buf_protect(&dr->buf); c = buf_get(&dr->buf); if ('\0' == c) { ox_sax_drive_error(dr, NO_TERM "DOCTYPE or comment not terminated"); goto DONE; } else if ('-' == c) { c = buf_get(&dr->buf); /* skip first - and get next character */ if ('-' != c) { ox_sax_drive_error(dr, INVALID_FORMAT "bad comment format, expected