ext/yajl/yajl_ext.c in yajl-ruby-1.3.1 vs ext/yajl/yajl_ext.c in yajl-ruby-1.4.0

- old
+ new

@@ -20,33 +20,58 @@ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "yajl_ext.h" +#include "yajl_lex.h" +#include "yajl_alloc.h" +#include "yajl_buf.h" +#include "yajl_encode.h" +#include "api/yajl_common.h" +#include "assert.h" #define YAJL_RB_TO_JSON \ VALUE rb_encoder, cls; \ rb_scan_args(argc, argv, "01", &rb_encoder); \ cls = rb_obj_class(rb_encoder); \ if (rb_encoder == Qnil || cls != cEncoder) { \ rb_encoder = rb_yajl_encoder_new(0, NULL, cEncoder); \ } \ return rb_yajl_encoder_encode(1, &self, rb_encoder); \ +static void *rb_internal_malloc(void *ctx, unsigned int sz) { + return xmalloc(sz); +} + +static void *rb_internal_realloc(void *ctx, void *previous, unsigned int sz) { + return xrealloc(previous, sz); +} + +static void rb_internal_free(void *ctx, void *ptr) { + xfree(ptr); +} + +static yajl_alloc_funcs rb_alloc_funcs = { + rb_internal_malloc, + rb_internal_realloc, + rb_internal_free, + NULL +}; + /* Helpers for building objects */ static void yajl_check_and_fire_callback(void * ctx) { yajl_parser_wrapper * wrapper; GetParser((VALUE)ctx, wrapper); /* No need to do any of this if the callback isn't even setup */ if (wrapper->parse_complete_callback != Qnil) { - int len = RARRAY_LEN(wrapper->builderStack); + long len = RARRAY_LEN(wrapper->builderStack); if (len == 1 && wrapper->nestedArrayLevel == 0 && wrapper->nestedHashLevel == 0) { rb_funcall(wrapper->parse_complete_callback, intern_call, 1, rb_ary_pop(wrapper->builderStack)); } } else { - int len = RARRAY_LEN(wrapper->builderStack); + long len = RARRAY_LEN(wrapper->builderStack); if (len == 1 && wrapper->nestedArrayLevel == 0 && wrapper->nestedHashLevel == 0) { wrapper->objectsFound++; if (wrapper->objectsFound > 1) { rb_raise(cParseError, "%s", "Found multiple JSON objects in the stream but no block or the on_parse_complete callback was assigned to handle them."); } @@ -74,11 +99,11 @@ } static void yajl_set_static_value(void * ctx, VALUE val) { yajl_parser_wrapper * wrapper; VALUE lastEntry, hash; - int len; + long len; GetParser((VALUE)ctx, wrapper); len = RARRAY_LEN(wrapper->builderStack); if (len > 0) { @@ -196,33 +221,33 @@ case T_FIXNUM: case T_FLOAT: case T_BIGNUM: str = rb_funcall(obj, intern_to_s, 0); cptr = RSTRING_PTR(str); - len = RSTRING_LEN(str); + len = (unsigned int)RSTRING_LEN(str); if (memcmp(cptr, "NaN", 3) == 0 || memcmp(cptr, "Infinity", 8) == 0 || memcmp(cptr, "-Infinity", 9) == 0) { rb_raise(cEncodeError, "'%s' is an invalid number", cptr); } CHECK_STATUS(yajl_gen_number(w->encoder, cptr, len)); break; case T_STRING: cptr = RSTRING_PTR(obj); - len = RSTRING_LEN(obj); + len = (unsigned int)RSTRING_LEN(obj); CHECK_STATUS(yajl_gen_string(w->encoder, (const unsigned char *)cptr, len)); break; default: if (rb_respond_to(obj, intern_to_json)) { str = rb_funcall(obj, intern_to_json, 0); Check_Type(str, T_STRING); cptr = RSTRING_PTR(str); - len = RSTRING_LEN(str); + len = (unsigned int)RSTRING_LEN(str); CHECK_STATUS(yajl_gen_number(w->encoder, cptr, len)); } else { str = rb_funcall(obj, intern_to_s, 0); Check_Type(str, T_STRING); cptr = RSTRING_PTR(str); - len = RSTRING_LEN(str); + len = (unsigned int)RSTRING_LEN(str); CHECK_STATUS(yajl_gen_string(w->encoder, (const unsigned char *)cptr, len)); } break; } @@ -418,11 +443,11 @@ } } cfg = (yajl_parser_config){allowComments, checkUTF8}; obj = Data_Make_Struct(klass, yajl_parser_wrapper, yajl_parser_wrapper_mark, yajl_parser_wrapper_free, wrapper); - wrapper->parser = yajl_alloc(&callbacks, &cfg, NULL, (void *)obj); + wrapper->parser = yajl_alloc(&callbacks, &cfg, &rb_alloc_funcs, (void *)obj); wrapper->nestedArrayLevel = 0; wrapper->nestedHashLevel = 0; wrapper->objectsFound = 0; wrapper->symbolizeKeys = symbolizeKeys; wrapper->builderStack = rb_ary_new(); @@ -487,17 +512,17 @@ rb_yajl_parser_set_complete_cb(self, blk); } if (TYPE(input) == T_STRING) { cptr = RSTRING_PTR(input); - len = RSTRING_LEN(input); + len = (unsigned int)RSTRING_LEN(input); yajl_parse_chunk((const unsigned char*)cptr, len, wrapper->parser); } else if (rb_respond_to(input, intern_io_read)) { VALUE parsed = rb_str_new(0, FIX2LONG(rbufsize)); while (rb_funcall(input, intern_io_read, 2, rbufsize, parsed) != Qnil) { cptr = RSTRING_PTR(parsed); - len = RSTRING_LEN(parsed); + len = (unsigned int)RSTRING_LEN(parsed); yajl_parse_chunk((const unsigned char*)cptr, len, wrapper->parser); } } else { rb_raise(cParseError, "input must be a string or IO"); } @@ -533,11 +558,11 @@ rb_raise(cParseError, "Can't parse a nil string."); } if (wrapper->parse_complete_callback != Qnil) { const char * cptr = RSTRING_PTR(chunk); - len = RSTRING_LEN(chunk); + len = (unsigned int)RSTRING_LEN(chunk); yajl_parse_chunk((const unsigned char*)cptr, len, wrapper->parser); } else { rb_raise(cParseError, "The on_parse_complete callback isn't setup, parsing useless."); } @@ -559,10 +584,406 @@ wrapper->parse_complete_callback = callback; return Qnil; } /* + * An event stream pulls data off the IO source into the buffer, + * then runs the lexer over that stream. + */ +struct yajl_event_stream_s { + yajl_alloc_funcs *funcs; + + VALUE stream; // source + + VALUE buffer; + unsigned int offset; + + yajl_lexer lexer; // event source +}; + +typedef struct yajl_event_stream_s *yajl_event_stream_t; + +struct yajl_event_s { + yajl_tok token; + const char *buf; + unsigned int len; +}; +typedef struct yajl_event_s yajl_event_t; + +static yajl_event_t yajl_event_stream_next(yajl_event_stream_t parser, int pop) { + assert(parser->stream); + assert(parser->buffer); + + while (1) { + if (parser->offset >= RSTRING_LEN(parser->buffer)) { + //printf("reading offset %d size %ld\n", parser->offset, RSTRING_LEN(parser->buffer)); + + // Refill the buffer + rb_funcall(parser->stream, intern_io_read, 2, INT2FIX(RSTRING_LEN(parser->buffer)), parser->buffer); + if (RSTRING_LEN(parser->buffer) == 0) { + yajl_event_t event = { + .token = yajl_tok_eof, + }; + return event; + } + + parser->offset = 0; + } + + // Try to pull an event off the lexer + yajl_event_t event; + + yajl_tok token; + if (pop == 0) { + //printf("peeking %p %ld %d\n", RSTRING_PTR(parser->buffer), RSTRING_LEN(parser->buffer), parser->offset); + token = yajl_lex_peek(parser->lexer, (const unsigned char *)RSTRING_PTR(parser->buffer), (unsigned int)RSTRING_LEN(parser->buffer), parser->offset); + //printf("peeked event %d\n", token); + + if (token == yajl_tok_eof) { + parser->offset = (unsigned int)RSTRING_LEN(parser->buffer); + continue; + } + + event.token = token; + + return event; + } + + //printf("popping\n"); + token = yajl_lex_lex(parser->lexer, (const unsigned char *)RSTRING_PTR(parser->buffer), (unsigned int)RSTRING_LEN(parser->buffer), &parser->offset, (const unsigned char **)&event.buf, &event.len); + //printf("popped event %d\n", token); + + if (token == yajl_tok_eof) { + continue; + } + + event.token = token; + + return event; + } + + return (yajl_event_t){}; +} + +static VALUE rb_yajl_projector_filter_array_subtree(yajl_event_stream_t parser, VALUE schema, yajl_event_t event); +static VALUE rb_yajl_projector_filter_object_subtree(yajl_event_stream_t parser, VALUE schema, yajl_event_t event); +static void rb_yajl_projector_ignore_value(yajl_event_stream_t parser); +static void rb_yajl_projector_ignore_container(yajl_event_stream_t parser); +static VALUE rb_yajl_projector_build_simple_value(yajl_event_stream_t parser, yajl_event_t event); +static VALUE rb_yajl_projector_build_string(yajl_event_stream_t parser, yajl_event_t event); + +static VALUE rb_yajl_projector_filter(yajl_event_stream_t parser, VALUE schema, yajl_event_t event) { + assert(parser->stream); + + switch(event.token) { + case yajl_tok_left_brace: + return rb_yajl_projector_filter_array_subtree(parser, schema, event); + break; + case yajl_tok_left_bracket: + return rb_yajl_projector_filter_object_subtree(parser, schema, event); + break; + default: + return rb_yajl_projector_build_simple_value(parser, event); + } +} + +static VALUE rb_yajl_projector_filter_array_subtree(yajl_event_stream_t parser, VALUE schema, yajl_event_t event) { + assert(event.token == yajl_tok_left_brace); + + VALUE ary = rb_ary_new(); + + while (1) { + event = yajl_event_stream_next(parser, 1); + + if (event.token == yajl_tok_right_brace) { + break; + } + + VALUE val = rb_yajl_projector_filter(parser, schema, event); + rb_ary_push(ary, val); + + event = yajl_event_stream_next(parser, 0); + if (event.token == yajl_tok_comma) { + event = yajl_event_stream_next(parser, 1); + assert(event.token == yajl_tok_comma); + + event = yajl_event_stream_next(parser, 0); + if (!(event.token == yajl_tok_string || event.token == yajl_tok_integer || event.token == yajl_tok_double || event.token == yajl_tok_null || event.token == yajl_tok_bool || event.token == yajl_tok_left_bracket || event.token == yajl_tok_left_brace)) { + rb_raise(cParseError, "read a comma, expected a value to follow, actually read %s", yajl_tok_name(event.token)); + } + } else if (event.token != yajl_tok_right_brace) { + rb_raise(cParseError, "didn't read a comma, expected closing array, actually read %s", yajl_tok_name(event.token)); + } + } + + return ary; +} + +static VALUE rb_yajl_projector_filter_object_subtree(yajl_event_stream_t parser, VALUE schema, yajl_event_t event) { + assert(event.token == yajl_tok_left_bracket); + + VALUE hsh = rb_hash_new(); + + while (1) { + event = yajl_event_stream_next(parser, 1); + + if (event.token == yajl_tok_right_bracket) { + break; + } + + if (!(event.token == yajl_tok_string || event.token == yajl_tok_string_with_escapes)) { + rb_raise(cParseError, "Expected string, unexpected stream event %s", yajl_tok_name(event.token)); + } + + VALUE key = rb_yajl_projector_build_string(parser, event); + + event = yajl_event_stream_next(parser, 1); + if (!(event.token == yajl_tok_colon)) { + rb_raise(cParseError, "Expected colon, unexpected stream event %s", yajl_tok_name(event.token)); + } + + // nil schema means reify the subtree from here on + // otherwise if the schema has a key for this we want it + int interesting = (schema == Qnil || rb_funcall(schema, rb_intern("key?"), 1, key) == Qtrue); + if (!interesting) { + rb_yajl_projector_ignore_value(parser); + goto peek_comma; + } + + yajl_event_t value_event = yajl_event_stream_next(parser, 1); + + VALUE key_schema; + if (schema == Qnil) { + key_schema = Qnil; + } else { + key_schema = rb_hash_aref(schema, key); + } + + VALUE val = rb_yajl_projector_filter(parser, key_schema, value_event); + + rb_str_freeze(key); + rb_hash_aset(hsh, key, val); + + peek_comma: + + event = yajl_event_stream_next(parser, 0); + if (event.token == yajl_tok_comma) { + event = yajl_event_stream_next(parser, 1); + assert(event.token == yajl_tok_comma); + + event = yajl_event_stream_next(parser, 0); + if (!(event.token == yajl_tok_string || event.token == yajl_tok_string_with_escapes)) { + rb_raise(cParseError, "read a comma, expected a key to follow, actually read %s", yajl_tok_name(event.token)); + } + } else if (event.token != yajl_tok_right_bracket) { + rb_raise(cParseError, "read a value without tailing comma, expected closing bracket, actually read %s", yajl_tok_name(event.token)); + } + } + + return hsh; +} + +/* +# After reading a key if we know we are not interested in the next value, + # read and discard all its stream events. + # + # Values can be simple (string, numeric, boolean, null) or compound (object + # or array). + # + # Returns nothing. +*/ +static void rb_yajl_projector_ignore_value(yajl_event_stream_t parser) { + yajl_event_t value_event = yajl_event_stream_next(parser, 1); + + switch (value_event.token) { + case yajl_tok_null: + case yajl_tok_bool: + case yajl_tok_integer: + case yajl_tok_double: + case yajl_tok_string: + case yajl_tok_string_with_escapes: + return; + default: + break; + } + + if (value_event.token == yajl_tok_left_brace || value_event.token == yajl_tok_left_bracket) { + rb_yajl_projector_ignore_container(parser); + return; + } + + rb_raise(cParseError, "unknown value type to ignore %s", yajl_tok_name(value_event.token)); +} + +/* +# Given the start of an array or object, read until the closing event. +# Object structures can nest and this is considered. +# +# Returns nothing. +*/ +static void rb_yajl_projector_ignore_container(yajl_event_stream_t parser) { + int depth = 1; + + while (depth > 0) { + yajl_event_t event = yajl_event_stream_next(parser, 1); + + if (event.token == yajl_tok_eof) { + return; + } + + if (event.token == yajl_tok_left_bracket || event.token == yajl_tok_left_brace) { + depth += 1; + } else if (event.token == yajl_tok_right_bracket || event.token == yajl_tok_right_brace) { + depth -= 1; + } + } +} + +static VALUE rb_yajl_projector_build_simple_value(yajl_event_stream_t parser, yajl_event_t event) { + assert(parser->stream); + + switch (event.token) { + case yajl_tok_null:; + return Qnil; + case yajl_tok_bool:; + if (memcmp(event.buf, "true", 4) == 0) { + return Qtrue; + } else if (memcmp(event.buf, "false", 4) == 0) { + return Qfalse; + } else { + rb_raise(cStandardError, "unknown boolean token %s", event.buf); + } + case yajl_tok_integer:; + case yajl_tok_double:; + char *buf = (char *)malloc(event.len + 1); + buf[event.len] = 0; + memcpy(buf, event.buf, event.len); + + VALUE val; + if (memchr(buf, '.', event.len) || + memchr(buf, 'e', event.len) || + memchr(buf, 'E', event.len)) { + val = rb_float_new(strtod(buf, NULL)); + } else { + val = rb_cstr2inum(buf, 10); + } + free(buf); + + return val; + + case yajl_tok_string:; + case yajl_tok_string_with_escapes:; + return rb_yajl_projector_build_string(parser, event); + + case yajl_tok_eof:; + rb_raise(cParseError, "unexpected eof while constructing value"); + + case yajl_tok_comma: + rb_raise(cParseError, "unexpected comma while constructing value"); + + case yajl_tok_colon: + rb_raise(cParseError, "unexpected colon while constructing value"); + + default:; + assert(0); + } +} + +static VALUE rb_yajl_projector_build_string(yajl_event_stream_t parser, yajl_event_t event) { + switch (event.token) { + case yajl_tok_string:; { + VALUE str = rb_str_new(event.buf, event.len); + rb_enc_associate(str, utf8Encoding); + + rb_encoding *default_internal_enc = rb_default_internal_encoding(); + if (default_internal_enc) { + str = rb_str_export_to_enc(str, default_internal_enc); + } + + return str; + } + + case yajl_tok_string_with_escapes:; { + //printf("decoding string with escapes\n"); + + yajl_buf strBuf = yajl_buf_alloc(parser->funcs); + yajl_string_decode(strBuf, (const unsigned char *)event.buf, event.len); + + VALUE str = rb_str_new((const char *)yajl_buf_data(strBuf), yajl_buf_len(strBuf)); + rb_enc_associate(str, utf8Encoding); + + yajl_buf_free(strBuf); + + rb_encoding *default_internal_enc = rb_default_internal_encoding(); + if (default_internal_enc) { + str = rb_str_export_to_enc(str, default_internal_enc); + } + + return str; + } + + default:; { + assert(0); + } + } +} + +static VALUE rb_protected_yajl_projector_filter(VALUE pointer) { + VALUE *args = (VALUE *)pointer; + return rb_yajl_projector_filter((struct yajl_event_stream_s *)args[0], + args[1], + *(yajl_event_t *)args[2]); +} + +/* + * Document-method: project + */ +static VALUE rb_yajl_projector_project(VALUE self, VALUE schema) { + VALUE stream = rb_iv_get(self, "@stream"); + + long buffer_size = FIX2LONG(rb_iv_get(self, "@buffer_size")); + VALUE buffer = rb_str_new(0, buffer_size); + + struct yajl_event_stream_s parser = { + .funcs = &rb_alloc_funcs, + + .stream = stream, + + .buffer = buffer, + .offset = (unsigned int)buffer_size, + + .lexer = yajl_lex_alloc(&rb_alloc_funcs, 0, 1), + }; + + yajl_event_t event = yajl_event_stream_next(&parser, 1); + + RB_GC_GUARD(stream); + RB_GC_GUARD(buffer); + + VALUE result; + int state = 0; + + if (event.token == yajl_tok_left_brace || event.token == yajl_tok_left_bracket) { + VALUE args[3]; + args[0] = (VALUE)&parser; + args[1] = schema; + args[2] = (VALUE)&event; + result = rb_protect(rb_protected_yajl_projector_filter, + (VALUE)args, + &state); + } else { + yajl_lex_free(parser.lexer); + rb_raise(cParseError, "expected left bracket or brace, actually read %s", yajl_tok_name(event.token)); + } + + yajl_lex_free(parser.lexer); + if (state) rb_jump_tag(state); + + return result; +} + +/* * Document-class: Yajl::Encoder * * This class contains methods for encoding a Ruby object into JSON, streaming it's output into an IO object. * The IO object need only respond to #write(str) * The JSON stream created is written to the IO in chunks, as it's being created. @@ -618,11 +1039,11 @@ } cfg = (yajl_gen_config){beautify, (const char *)indentString, htmlSafe}; obj = Data_Make_Struct(klass, yajl_encoder_wrapper, yajl_encoder_wrapper_mark, yajl_encoder_wrapper_free, wrapper); wrapper->indentString = actualIndent; - wrapper->encoder = yajl_gen_alloc(&cfg, NULL); + wrapper->encoder = yajl_gen_alloc(&cfg, &rb_alloc_funcs); wrapper->on_progress_callback = Qnil; if (opts != Qnil && rb_funcall(opts, intern_has_key, 1, sym_terminator) == Qtrue) { wrapper->terminator = rb_hash_aref(opts, sym_terminator); #ifdef HAVE_RUBY_ENCODING_H if (TYPE(wrapper->terminator) == T_STRING) { @@ -898,17 +1319,21 @@ rb_define_const(mYajl, "MAX_DEPTH", INT2NUM(YAJL_MAX_DEPTH)); cParseError = rb_define_class_under(mYajl, "ParseError", rb_eStandardError); cEncodeError = rb_define_class_under(mYajl, "EncodeError", rb_eStandardError); + cStandardError = rb_const_get(rb_cObject, rb_intern("StandardError")); cParser = rb_define_class_under(mYajl, "Parser", rb_cObject); rb_define_singleton_method(cParser, "new", rb_yajl_parser_new, -1); rb_define_method(cParser, "initialize", rb_yajl_parser_init, -1); rb_define_method(cParser, "parse", rb_yajl_parser_parse, -1); rb_define_method(cParser, "parse_chunk", rb_yajl_parser_parse_chunk, 1); rb_define_method(cParser, "<<", rb_yajl_parser_parse_chunk, 1); rb_define_method(cParser, "on_parse_complete=", rb_yajl_parser_set_complete_cb, 1); + + cProjector = rb_define_class_under(mYajl, "Projector", rb_cObject); + rb_define_method(cProjector, "project", rb_yajl_projector_project, 1); cEncoder = rb_define_class_under(mYajl, "Encoder", rb_cObject); rb_define_singleton_method(cEncoder, "new", rb_yajl_encoder_new, -1); rb_define_method(cEncoder, "initialize", rb_yajl_encoder_init, -1); rb_define_method(cEncoder, "encode", rb_yajl_encoder_encode, -1);