vendor/json_pure/ext/json/ext/parser/parser.rl in scout-5.1.2 vs vendor/json_pure/ext/json/ext/parser/parser.rl in scout-5.1.3

- old
+ new

@@ -1,62 +1,86 @@ -#include "ruby.h" -#include "unicode.h" -#if HAVE_RE_H -#include "re.h" -#endif -#if HAVE_RUBY_ST_H -#include "ruby/st.h" -#endif -#if HAVE_ST_H -#include "st.h" -#endif +#include "parser.h" -#define EVIL 0x666 +/* unicode */ -#ifndef RHASH_TBL -#define RHASH_TBL(hsh) (RHASH(hsh)->tbl) -#endif +static const char digit_values[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, + -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1 +}; +static UTF32 unescape_unicode(const unsigned char *p) +{ + char b; + UTF32 result = 0; + b = digit_values[p[0]]; + if (b < 0) return UNI_REPLACEMENT_CHAR; + result = (result << 4) | b; + b = digit_values[p[1]]; + result = (result << 4) | b; + if (b < 0) return UNI_REPLACEMENT_CHAR; + b = digit_values[p[2]]; + result = (result << 4) | b; + if (b < 0) return UNI_REPLACEMENT_CHAR; + b = digit_values[p[3]]; + result = (result << 4) | b; + if (b < 0) return UNI_REPLACEMENT_CHAR; + return result; +} + +static int convert_UTF32_to_UTF8(char *buf, UTF32 ch) +{ + int len = 1; + if (ch <= 0x7F) { + buf[0] = (char) ch; + } else if (ch <= 0x07FF) { + buf[0] = (char) ((ch >> 6) | 0xC0); + buf[1] = (char) ((ch & 0x3F) | 0x80); + len++; + } else if (ch <= 0xFFFF) { + buf[0] = (char) ((ch >> 12) | 0xE0); + buf[1] = (char) (((ch >> 6) & 0x3F) | 0x80); + buf[2] = (char) ((ch & 0x3F) | 0x80); + len += 2; + } else if (ch <= 0x1fffff) { + buf[0] =(char) ((ch >> 18) | 0xF0); + buf[1] =(char) (((ch >> 12) & 0x3F) | 0x80); + buf[2] =(char) (((ch >> 6) & 0x3F) | 0x80); + buf[3] =(char) ((ch & 0x3F) | 0x80); + len += 3; + } else { + buf[0] = '?'; + } + return len; +} + #ifdef HAVE_RUBY_ENCODING_H -#include "ruby/encoding.h" -#define FORCE_UTF8(obj) rb_enc_associate((obj), rb_utf8_encoding()) +static VALUE CEncoding_ASCII_8BIT, CEncoding_UTF_8, CEncoding_UTF_16BE, + CEncoding_UTF_16LE, CEncoding_UTF_32BE, CEncoding_UTF_32LE; +static ID i_encoding, i_encode, i_encode_bang, i_force_encoding; #else -#define FORCE_UTF8(obj) +static ID i_iconv; #endif static VALUE mJSON, mExt, cParser, eParserError, eNestingError; static VALUE CNaN, CInfinity, CMinusInfinity; static ID i_json_creatable_p, i_json_create, i_create_id, i_create_additions, - i_chr, i_max_nesting, i_allow_nan, i_object_class, i_array_class; + i_chr, i_max_nesting, i_allow_nan, i_symbolize_names, i_object_class, + i_array_class, i_key_p, i_deep_const_get; -#define MinusInfinity "-Infinity" - -typedef struct JSON_ParserStruct { - VALUE Vsource; - char *source; - long len; - char *memo; - VALUE create_id; - int max_nesting; - int current_nesting; - int allow_nan; - VALUE object_class; - VALUE array_class; -} JSON_Parser; - -static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *result); -static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *result); -static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *result); -static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *result); -static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result); -static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *result); - -#define GET_STRUCT \ - JSON_Parser *json; \ - Data_Get_Struct(self, JSON_Parser, json); - %%{ machine JSON_common; cr = '\n'; cr_neg = [^\n]; @@ -99,11 +123,14 @@ fexec np; } } action parse_name { - char *np = JSON_parse_string(json, fpc, pe, &last_name); + char *np; + json->parsing_name = 1; + np = JSON_parse_string(json, fpc, pe, &last_name); + json->parsing_name = 0; if (np == NULL) { fhold; fbreak; } else fexec np; } action exit { fhold; fbreak; } @@ -121,11 +148,11 @@ int cs = EVIL; VALUE last_name = Qnil; VALUE object_class = json->object_class; if (json->max_nesting && json->current_nesting > json->max_nesting) { - rb_raise(eNestingError, "nesting of %d is to deep", json->current_nesting); + rb_raise(eNestingError, "nesting of %d is too deep", json->current_nesting); } *result = NIL_P(object_class) ? rb_hash_new() : rb_class_new_instance(0, 0, object_class); %% write init; @@ -133,11 +160,11 @@ if (cs >= JSON_object_first_final) { if (RTEST(json->create_id)) { VALUE klassname = rb_hash_aref(*result, json->create_id); if (!NIL_P(klassname)) { - VALUE klass = rb_path2class(StringValueCStr(klassname)); + VALUE klass = rb_funcall(mJSON, i_deep_const_get, 1, klassname); if RTEST(rb_funcall(klass, i_json_creatable_p, 0)) { *result = rb_funcall(klass, i_json_create, 1, *result); } } } @@ -334,88 +361,103 @@ { int cs = EVIL; VALUE array_class = json->array_class; if (json->max_nesting && json->current_nesting > json->max_nesting) { - rb_raise(eNestingError, "nesting of %d is to deep", json->current_nesting); + rb_raise(eNestingError, "nesting of %d is too deep", json->current_nesting); } *result = NIL_P(array_class) ? rb_ary_new() : rb_class_new_instance(0, 0, array_class); %% write init; %% write exec; if(cs >= JSON_array_first_final) { return p + 1; } else { rb_raise(eParserError, "%u: unexpected token at '%s'", __LINE__, p); + return NULL; } } -static VALUE json_string_unescape(char *p, char *pe) +static VALUE json_string_unescape(VALUE result, char *string, char *stringEnd) { - VALUE result = rb_str_buf_new(pe - p + 1); + char *p = string, *pe = string, *unescape; + int unescape_len; - while (p < pe) { - if (*p == '\\') { - p++; - if (p >= pe) return Qnil; /* raise an exception later, \ at end */ - switch (*p) { + while (pe < stringEnd) { + if (*pe == '\\') { + unescape = (char *) "?"; + unescape_len = 1; + if (pe > p) rb_str_buf_cat(result, p, pe - p); + switch (*++pe) { + case 'n': + unescape = (char *) "\n"; + break; + case 'r': + unescape = (char *) "\r"; + break; + case 't': + unescape = (char *) "\t"; + break; case '"': + unescape = (char *) "\""; + break; case '\\': - rb_str_buf_cat(result, p, 1); - p++; + unescape = (char *) "\\"; break; case 'b': - rb_str_buf_cat2(result, "\b"); - p++; + unescape = (char *) "\b"; break; case 'f': - rb_str_buf_cat2(result, "\f"); - p++; + unescape = (char *) "\f"; break; - case 'n': - rb_str_buf_cat2(result, "\n"); - p++; - break; - case 'r': - rb_str_buf_cat2(result, "\r"); - p++; - break; - case 't': - rb_str_buf_cat2(result, "\t"); - p++; - break; case 'u': - if (p > pe - 4) { + if (pe > stringEnd - 4) { return Qnil; } else { - p = JSON_convert_UTF16_to_UTF8(result, p, pe, strictConversion); + char buf[4]; + UTF32 ch = unescape_unicode((unsigned char *) ++pe); + pe += 3; + if (UNI_SUR_HIGH_START == (ch & 0xFC00)) { + pe++; + if (pe > stringEnd - 6) return Qnil; + if (pe[0] == '\\' && pe[1] == 'u') { + UTF32 sur = unescape_unicode((unsigned char *) pe + 2); + ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) + | (sur & 0x3FF)); + pe += 5; + } else { + unescape = (char *) "?"; + break; + } + } + unescape_len = convert_UTF32_to_UTF8(buf, ch); + unescape = buf; } break; default: - rb_str_buf_cat(result, p, 1); - p++; - break; + p = pe; + continue; } + rb_str_buf_cat(result, unescape, unescape_len); + p = ++pe; } else { - char *q = p; - while (*q != '\\' && q < pe) q++; - rb_str_buf_cat(result, p, q - p); - p = q; + pe++; } } + rb_str_buf_cat(result, p, pe - p); return result; } %%{ machine JSON_string; include JSON_common; write data; action parse_string { - *result = json_string_unescape(json->memo + 1, p); + *result = json_string_unescape(*result, json->memo + 1, p); if (NIL_P(*result)) { fhold; fbreak; } else { FORCE_UTF8(*result); @@ -430,15 +472,18 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *result) { int cs = EVIL; - *result = rb_str_new("", 0); + *result = rb_str_buf_new(0); %% write init; json->memo = p; %% write exec; + if (json->symbolize_names && json->parsing_name) { + *result = rb_str_intern(*result); + } if (cs >= JSON_string_first_final) { return p + 1; } else { return NULL; } @@ -482,10 +527,58 @@ * * with the method parser= in JSON. * */ +static VALUE convert_encoding(VALUE source) +{ + char *ptr = RSTRING_PTR(source); + long len = RSTRING_LEN(source); + if (len < 2) { + rb_raise(eParserError, "A JSON text must at least contain two octets!"); + } +#ifdef HAVE_RUBY_ENCODING_H + { + VALUE encoding = rb_funcall(source, i_encoding, 0); + if (encoding == CEncoding_ASCII_8BIT) { + if (len >= 4 && ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) { + source = rb_str_dup(source); + rb_funcall(source, i_force_encoding, 1, CEncoding_UTF_32BE); + source = rb_funcall(source, i_encode_bang, 1, CEncoding_UTF_8); + } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) { + source = rb_str_dup(source); + rb_funcall(source, i_force_encoding, 1, CEncoding_UTF_16BE); + source = rb_funcall(source, i_encode_bang, 1, CEncoding_UTF_8); + } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) { + source = rb_str_dup(source); + rb_funcall(source, i_force_encoding, 1, CEncoding_UTF_32LE); + source = rb_funcall(source, i_encode_bang, 1, CEncoding_UTF_8); + } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) { + source = rb_str_dup(source); + rb_funcall(source, i_force_encoding, 1, CEncoding_UTF_16LE); + source = rb_funcall(source, i_encode_bang, 1, CEncoding_UTF_8); + } else { + FORCE_UTF8(source); + } + } else { + source = rb_funcall(source, i_encode, 1, CEncoding_UTF_8); + } + } +#else + if (len >= 4 && ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) { + source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-32be"), source); + } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) { + source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-16be"), source); + } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) { + source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-32le"), source); + } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) { + source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2("utf-16le"), source); + } +#endif + return source; +} + /* * call-seq: new(source, opts => {}) * * Creates a new JSON::Ext::Parser instance for the string _source_. * @@ -499,10 +592,13 @@ * structures. Disable depth checking with :max_nesting => false|nil|0, it * defaults to 19. * * *allow_nan*: If set to true, allow NaN, Infinity and -Infinity in * defiance of RFC 4627 to be parsed by the Parser. This option defaults to * false. + * * *symbolize_names*: If set to true, returns symbols for the names + * (keys) in a JSON object. Otherwise strings are returned, which is also + * the default. * * *create_additions*: If set to false, the Parser doesn't create * additions even if a matchin class and create_id was found. This option * defaults to true. * * *object_class*: Defaults to Hash * * *array_class*: Defaults to Array @@ -510,25 +606,22 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self) { char *ptr; long len; VALUE source, opts; - GET_STRUCT; + GET_PARSER; rb_scan_args(argc, argv, "11", &source, &opts); - source = StringValue(source); + source = convert_encoding(StringValue(source)); ptr = RSTRING_PTR(source); len = RSTRING_LEN(source); - if (len < 2) { - rb_raise(eParserError, "A JSON text must at least contain two octets!"); - } if (!NIL_P(opts)) { opts = rb_convert_type(opts, T_HASH, "Hash", "to_hash"); if (NIL_P(opts)) { rb_raise(rb_eArgError, "opts needs to be like a hash"); } else { VALUE tmp = ID2SYM(i_max_nesting); - if (st_lookup(RHASH_TBL(opts), tmp, 0)) { + if (option_given_p(opts, tmp)) { VALUE max_nesting = rb_hash_aref(opts, tmp); if (RTEST(max_nesting)) { Check_Type(max_nesting, T_FIXNUM); json->max_nesting = FIX2INT(max_nesting); } else { @@ -536,35 +629,42 @@ } } else { json->max_nesting = 19; } tmp = ID2SYM(i_allow_nan); - if (st_lookup(RHASH_TBL(opts), tmp, 0)) { + if (option_given_p(opts, tmp)) { VALUE allow_nan = rb_hash_aref(opts, tmp); json->allow_nan = RTEST(allow_nan) ? 1 : 0; } else { json->allow_nan = 0; } + tmp = ID2SYM(i_symbolize_names); + if (option_given_p(opts, tmp)) { + VALUE symbolize_names = rb_hash_aref(opts, tmp); + json->symbolize_names = RTEST(symbolize_names) ? 1 : 0; + } else { + json->symbolize_names = 0; + } tmp = ID2SYM(i_create_additions); - if (st_lookup(RHASH_TBL(opts), tmp, 0)) { + if (option_given_p(opts, tmp)) { VALUE create_additions = rb_hash_aref(opts, tmp); if (RTEST(create_additions)) { json->create_id = rb_funcall(mJSON, i_create_id, 0); } else { json->create_id = Qnil; } } else { json->create_id = rb_funcall(mJSON, i_create_id, 0); } tmp = ID2SYM(i_object_class); - if (st_lookup(RHASH_TBL(opts), tmp, 0)) { + if (option_given_p(opts, tmp)) { json->object_class = rb_hash_aref(opts, tmp); } else { json->object_class = Qnil; } tmp = ID2SYM(i_array_class); - if (st_lookup(RHASH_TBL(opts), tmp, 0)) { + if (option_given_p(opts, tmp)) { json->array_class = rb_hash_aref(opts, tmp); } else { json->array_class = Qnil; } } @@ -574,22 +674,10 @@ json->create_id = rb_funcall(mJSON, i_create_id, 0); json->object_class = Qnil; json->array_class = Qnil; } json->current_nesting = 0; - /* - Convert these? - if (len >= 4 && ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) { - rb_raise(eParserError, "Only UTF8 octet streams are supported atm!"); - } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) { - rb_raise(eParserError, "Only UTF8 octet streams are supported atm!"); - } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) { - rb_raise(eParserError, "Only UTF8 octet streams are supported atm!"); - } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) { - rb_raise(eParserError, "Only UTF8 octet streams are supported atm!"); - } - */ json->len = len; json->source = ptr; json->Vsource = source; return self; } @@ -603,25 +691,26 @@ static VALUE cParser_parse(VALUE self) { char *p, *pe; int cs = EVIL; VALUE result = Qnil; - GET_STRUCT; + GET_PARSER; %% write init; p = json->source; pe = p + json->len; %% write exec; if (cs >= JSON_first_final && p == pe) { return result; } else { rb_raise(eParserError, "%u: unexpected token at '%s'", __LINE__, p); + return Qnil; } } -inline static JSON_Parser *JSON_allocate() +static JSON_Parser *JSON_allocate() { JSON_Parser *json = ALLOC(JSON_Parser); MEMZERO(json, JSON_Parser, 1); return json; } @@ -651,11 +740,11 @@ * Returns a copy of the current _source_ string, that was used to construct * this Parser. */ static VALUE cParser_source(VALUE self) { - GET_STRUCT; + GET_PARSER; return rb_str_dup(json->Vsource); } void Init_parser() { @@ -679,8 +768,25 @@ i_create_id = rb_intern("create_id"); i_create_additions = rb_intern("create_additions"); i_chr = rb_intern("chr"); i_max_nesting = rb_intern("max_nesting"); i_allow_nan = rb_intern("allow_nan"); + i_symbolize_names = rb_intern("symbolize_names"); i_object_class = rb_intern("object_class"); i_array_class = rb_intern("array_class"); + i_key_p = rb_intern("key?"); + i_deep_const_get = rb_intern("deep_const_get"); +#ifdef HAVE_RUBY_ENCODING_H + CEncoding_UTF_8 = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-8")); + CEncoding_UTF_16BE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-16be")); + CEncoding_UTF_16LE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-16le")); + CEncoding_UTF_32BE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-32be")); + CEncoding_UTF_32LE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-32le")); + CEncoding_ASCII_8BIT = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("ascii-8bit")); + i_encoding = rb_intern("encoding"); + i_encode = rb_intern("encode"); + i_encode_bang = rb_intern("encode!"); + i_force_encoding = rb_intern("force_encoding"); +#else + i_iconv = rb_intern("iconv"); +#endif }