/* sax.c * Copyright (c) 2011, Peter Ohler * All rights reserved. */ #include #include #include #include #include #include #if NEEDS_UIO #include #endif #include #include #include "ruby.h" #include "ox.h" #include "sax.h" #include "sax_stack.h" #include "sax_buf.h" #include "special.h" #define NAME_MISMATCH 1 #define START_STATE 1 #define BODY_STATE 2 #define AFTER_STATE 3 // error prefixes #define BAD_BOM "Bad BOM: " #define NO_TERM "Not Terminated: " #define INVALID_FORMAT "Invalid Format: " #define CASE_ERROR "Case Error: " #define OUT_OF_ORDER "Out of Order: " #define WRONG_CHAR "Unexpected Character: " #define EL_MISMATCH "Start End Mismatch: " #define INV_ELEMENT "Invalid Element: " #define UTF8_STR "UTF-8" static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options); static void parse(SaxDrive dr); // All read functions should return the next character after the 'thing' that was read and leave dr->cur one after that. static char read_instruction(SaxDrive dr); static char read_doctype(SaxDrive dr); static char read_cdata(SaxDrive dr); static char read_comment(SaxDrive dr); static char read_element_start(SaxDrive dr); static char read_element_end(SaxDrive dr); static char read_text(SaxDrive dr); static char read_jump(SaxDrive dr, const char *pat); static char read_attrs(SaxDrive dr, char c, char termc, char term2, int is_xml, int eq_req, Hint h); static char read_name_token(SaxDrive dr); static char read_quoted_value(SaxDrive dr); static void end_element_cb(SaxDrive dr, VALUE name, long pos, long line, long col, Hint h); static void hint_clear_empty(SaxDrive dr); static Nv hint_try_close(SaxDrive dr, const char *name); VALUE ox_sax_value_class = Qnil; static VALUE protect_parse(VALUE drp) { parse((SaxDrive)drp); return Qnil; } #if HAS_ENCODING_SUPPORT || HAS_PRIVATE_ENCODING static int strIsAscii(const char *s) { for (; '\0' != *s; s++) { if (*s < ' ' || '~' < *s) { return 0; } } return 1; } #endif VALUE str2sym(SaxDrive dr, const char *str, const char **strp) { VALUE *slot; VALUE sym; if (dr->options.symbolize) { if (Qundef == (sym = ox_cache_get(ox_symbol_cache, str, &slot, strp))) { #if HAS_ENCODING_SUPPORT if (0 != dr->encoding && !strIsAscii(str)) { VALUE rstr = rb_str_new2(str); // TBD if sym can be pinned down then use this all the time rb_enc_associate(rstr, dr->encoding); sym = rb_funcall(rstr, ox_to_sym_id, 0); *slot = Qundef; } else { sym = ID2SYM(rb_intern(str)); *slot = sym; } #elif HAS_PRIVATE_ENCODING if (Qnil != dr->encoding && !strIsAscii(str)) { VALUE rstr = rb_str_new2(str); rb_funcall(rstr, ox_force_encoding_id, 1, dr->encoding); sym = rb_funcall(rstr, ox_to_sym_id, 0); // Needed for Ruby 2.2 to get around the GC of symbols created // with to_sym which is needed for encoded symbols. rb_ary_push(ox_sym_bank, sym); *slot = Qundef; } else { sym = ID2SYM(rb_intern(str)); *slot = sym; } #else sym = ID2SYM(rb_intern(str)); *slot = sym; #endif } } else { sym = rb_str_new2(str); #if HAS_ENCODING_SUPPORT if (0 != dr->encoding) { rb_enc_associate(sym, dr->encoding); } #elif HAS_PRIVATE_ENCODING if (Qnil != dr->encoding) { rb_funcall(sym, ox_force_encoding_id, 1, dr->encoding); } #endif if (0 != strp) { *strp = StringValuePtr(sym); } } return sym; } void ox_sax_parse(VALUE handler, VALUE io, SaxOptions options) { struct _saxDrive dr; int line = 0; sax_drive_init(&dr, handler, io, options); #if 0 printf("*** sax_parse with these flags\n"); printf(" has_instruct = %s\n", dr.has.instruct ? "true" : "false"); printf(" has_end_instruct = %s\n", dr.has.end_instruct ? "true" : "false"); printf(" has_attr = %s\n", dr.has.attr ? "true" : "false"); printf(" has_attr_value = %s\n", dr.has.attr_value ? "true" : "false"); printf(" has_attrs_done = %s\n", dr.has.attrs_done ? "true" : "false"); printf(" has_doctype = %s\n", dr.has.doctype ? "true" : "false"); printf(" has_comment = %s\n", dr.has.comment ? "true" : "false"); printf(" has_cdata = %s\n", dr.has.cdata ? "true" : "false"); printf(" has_text = %s\n", dr.has.text ? "true" : "false"); printf(" has_value = %s\n", dr.has.value ? "true" : "false"); printf(" has_start_element = %s\n", dr.has.start_element ? "true" : "false"); printf(" has_end_element = %s\n", dr.has.end_element ? "true" : "false"); printf(" has_error = %s\n", dr.has.error ? "true" : "false"); printf(" has_pos = %s\n", dr.has.pos ? "true" : "false"); printf(" has_line = %s\n", dr.has.line ? "true" : "false"); printf(" has_column = %s\n", dr.has.column ? "true" : "false"); #endif //parse(&dr); rb_protect(protect_parse, (VALUE)&dr, &line); ox_sax_drive_cleanup(&dr); if (0 != line) { rb_jump_tag(line); } } static void sax_drive_init(SaxDrive dr, VALUE handler, VALUE io, SaxOptions options) { ox_sax_buf_init(&dr->buf, io); dr->buf.dr = dr; stack_init(&dr->stack); dr->handler = handler; dr->value_obj = Data_Wrap_Struct(ox_sax_value_class, 0, 0, dr); rb_gc_register_address(&dr->value_obj); dr->options = *options; dr->err = 0; dr->blocked = 0; dr->abort = false; has_init(&dr->has, handler); #if HAS_ENCODING_SUPPORT if ('\0' == *ox_default_options.encoding) { VALUE encoding; dr->encoding = 0; if (rb_respond_to(io, ox_external_encoding_id) && Qnil != (encoding = rb_funcall(io, ox_external_encoding_id, 0))) { int e = rb_enc_get_index(encoding); if (0 <= e) { dr->encoding = rb_enc_from_index(e); } } } else { dr->encoding = rb_enc_find(ox_default_options.encoding); } #elif HAS_PRIVATE_ENCODING if ('\0' == *ox_default_options.encoding) { VALUE encoding; if (rb_respond_to(io, ox_external_encoding_id) && Qnil != (encoding = rb_funcall(io, ox_external_encoding_id, 0))) { dr->encoding = encoding; } else { dr->encoding = Qnil; } } else { dr->encoding = rb_str_new2(ox_default_options.encoding); } #else dr->encoding = 0; #endif } void ox_sax_drive_cleanup(SaxDrive dr) { rb_gc_unregister_address(&dr->value_obj); buf_cleanup(&dr->buf); stack_cleanup(&dr->stack); } static void ox_sax_drive_error_at(SaxDrive dr, const char *msg, off_t pos, off_t line, off_t col) { if (dr->has.error) { VALUE args[3]; args[0] = rb_str_new2(msg); args[1] = LONG2NUM(line); args[2] = LONG2NUM(col); if (dr->has.pos) { rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos)); } if (dr->has.pos) { rb_ivar_set(dr->handler, ox_at_pos_id, LONG2NUM(pos)); } if (dr->has.line) { rb_ivar_set(dr->handler, ox_at_line_id, args[1]); } if (dr->has.column) { rb_ivar_set(dr->handler, ox_at_column_id, args[2]); } rb_funcall2(dr->handler, ox_error_id, 3, args); } } void ox_sax_drive_error(SaxDrive dr, const char *msg) { ox_sax_drive_error_at(dr, msg, dr->buf.pos, dr->buf.line, dr->buf.col); } static char skipBOM(SaxDrive dr) { char c = buf_get(&dr->buf); if (0xEF == (uint8_t)c) { /* only UTF8 is supported */ if (0xBB == (uint8_t)buf_get(&dr->buf) && 0xBF == (uint8_t)buf_get(&dr->buf)) { #if HAS_ENCODING_SUPPORT dr->encoding = ox_utf8_encoding; #elif HAS_PRIVATE_ENCODING dr->encoding = ox_utf8_encoding; #else dr->encoding = UTF8_STR; #endif c = buf_get(&dr->buf); } else { ox_sax_drive_error(dr, BAD_BOM "invalid BOM or a binary file."); c = '\0'; } } return c; } static void parse(SaxDrive dr) { char c = skipBOM(dr); int state = START_STATE; Nv parent; while ('\0' != c) { buf_protect(&dr->buf); if ('<' == c) { c = buf_get(&dr->buf); switch (c) { case '?': /* instructions (xml or otherwise) */ c = read_instruction(dr); break; case '!': /* comment or doctype */ buf_protect(&dr->buf); c = buf_get(&dr->buf); if ('\0' == c) { ox_sax_drive_error(dr, NO_TERM "DOCTYPE or comment not terminated"); goto DONE; } else if ('-' == c) { c = buf_get(&dr->buf); /* skip first - and get next character */ if ('-' != c) { ox_sax_drive_error(dr, INVALID_FORMAT "bad comment format, expected