parser.rl in scout-5.1.3

- old
+ new

@@ -1,62 +1,86 @@
-#include "ruby.h"
-#include "unicode.h"
-#if HAVE_RE_H
-#include "re.h"
-#endif
-#if HAVE_RUBY_ST_H
-#include "ruby/st.h"
-#endif
-#if HAVE_ST_H
-#include "st.h"
-#endif
+#include "parser.h"
 
-#define EVIL 0x666
+/* unicode */
 
-#ifndef RHASH_TBL
-#define RHASH_TBL(hsh) (RHASH(hsh)->tbl)
-#endif
+/* Hex value of each byte: '0'-'9' give 0-9, 'A'-'F' and 'a'-'f' give 10-15, and
+ * every other byte gives -1. The element type is explicitly signed so that the
+ * -1 marker stays negative on platforms where plain char is unsigned. */
+static const signed char digit_values[256] = {
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1,
+    -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1
+};
 
+/* Decodes the four bytes at p as hexadecimal digits, most significant first,
+ * and returns the resulting value (at most 0xFFFF). Returns
+ * UNI_REPLACEMENT_CHAR at the first byte that is not a hex digit; the bytes
+ * after it are not read. */
+static UTF32 unescape_unicode(const unsigned char *p)
+{
+    UTF32 result = 0;
+    int i;
+    for (i = 0; i < 4; i++) {
+        signed char b = digit_values[p[i]];
+        if (b < 0) return UNI_REPLACEMENT_CHAR;
+        result = (result << 4) | b;
+    }
+    return result;
+}
+
+/* Stores ch as UTF-8 in buf (1-4 bytes, no terminator); returns the byte count. */
+static int convert_UTF32_to_UTF8(char *buf, UTF32 ch)
+{
+    if (ch <= 0x7F) {                   /* one byte, stored as-is */
+        buf[0] = (char) ch;
+        return 1;
+    } else if (ch <= 0x07FF) {          /* lead 0xC0 + one continuation byte */
+        buf[0] = (char) (0xC0 | (ch >> 6));
+        buf[1] = (char) (0x80 | (ch & 0x3F));
+        return 2;
+    } else if (ch <= 0xFFFF) {          /* lead 0xE0 + two continuation bytes */
+        buf[0] = (char) (0xE0 | (ch >> 12));
+        buf[1] = (char) (0x80 | ((ch >> 6) & 0x3F));
+        buf[2] = (char) (0x80 | (ch & 0x3F));
+        return 3;
+    } else if (ch <= 0x1fffff) {        /* lead 0xF0 + three continuation bytes */
+        buf[0] = (char) (0xF0 | (ch >> 18));
+        buf[1] = (char) (0x80 | ((ch >> 12) & 0x3F));
+        buf[2] = (char) (0x80 | ((ch >> 6) & 0x3F));
+        buf[3] = (char) (0x80 | (ch & 0x3F));
+        return 4;
+    }
+    buf[0] = '?';                       /* anything larger becomes a lone '?' */
+    return 1;
+}
+
 #ifdef HAVE_RUBY_ENCODING_H
-#include "ruby/encoding.h"
-#define FORCE_UTF8(obj) rb_enc_associate((obj), rb_utf8_encoding())
+static VALUE CEncoding_ASCII_8BIT, CEncoding_UTF_8, CEncoding_UTF_16BE,
+    CEncoding_UTF_16LE, CEncoding_UTF_32BE, CEncoding_UTF_32LE;
+static ID i_encoding, i_encode, i_encode_bang, i_force_encoding;
 #else
-#define FORCE_UTF8(obj)
+static ID i_iconv;
 #endif
 
 static VALUE mJSON, mExt, cParser, eParserError, eNestingError;
 static VALUE CNaN, CInfinity, CMinusInfinity;
 
 static ID i_json_creatable_p, i_json_create, i_create_id, i_create_additions,
-          i_chr, i_max_nesting, i_allow_nan, i_object_class, i_array_class; 
+          i_chr, i_max_nesting, i_allow_nan, i_symbolize_names, i_object_class,
+          i_array_class, i_key_p, i_deep_const_get;
 
-#define MinusInfinity "-Infinity"
-
-typedef struct JSON_ParserStruct {
-    VALUE Vsource;
-    char *source;
-    long len;
-    char *memo;
-    VALUE create_id;
-    int max_nesting;
-    int current_nesting;
-    int allow_nan;
-    VALUE object_class;
-    VALUE array_class;
-} JSON_Parser;
-
-static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *result);
-static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *result);
-static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *result);
-static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *result);
-static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result);
-static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *result);
-
-#define GET_STRUCT                          \
-    JSON_Parser *json;                      \
-    Data_Get_Struct(self, JSON_Parser, json);
-
 %%{
     machine JSON_common;
 
     cr                  = '\n';
     cr_neg              = [^\n];
@@ -99,11 +123,14 @@
             fexec np;
         }
     }
 
     action parse_name {
-        char *np = JSON_parse_string(json, fpc, pe, &last_name);
+        char *np;
+        json->parsing_name = 1;
+        np = JSON_parse_string(json, fpc, pe, &last_name);
+        json->parsing_name = 0;
         if (np == NULL) { fhold; fbreak; } else fexec np;
     }
 
     action exit { fhold; fbreak; }
 
@@ -121,11 +148,11 @@
     int cs = EVIL;
     VALUE last_name = Qnil;
     VALUE object_class = json->object_class;
 
     if (json->max_nesting && json->current_nesting > json->max_nesting) {
-        rb_raise(eNestingError, "nesting of %d is to deep", json->current_nesting);
+        rb_raise(eNestingError, "nesting of %d is too deep", json->current_nesting);
     }
 
     *result = NIL_P(object_class) ? rb_hash_new() : rb_class_new_instance(0, 0, object_class);
 
     %% write init;
@@ -133,11 +160,11 @@
 
     if (cs >= JSON_object_first_final) {
         if (RTEST(json->create_id)) {
             VALUE klassname = rb_hash_aref(*result, json->create_id);
             if (!NIL_P(klassname)) {
-                VALUE klass = rb_path2class(StringValueCStr(klassname));
+                VALUE klass = rb_funcall(mJSON, i_deep_const_get, 1, klassname);
                 if RTEST(rb_funcall(klass, i_json_creatable_p, 0)) {
                     *result = rb_funcall(klass, i_json_create, 1, *result);
                 }
             }
         }
@@ -334,88 +361,103 @@
 {
     int cs = EVIL;
     VALUE array_class = json->array_class;
 
     if (json->max_nesting && json->current_nesting > json->max_nesting) {
-        rb_raise(eNestingError, "nesting of %d is to deep", json->current_nesting);
+        rb_raise(eNestingError, "nesting of %d is too deep", json->current_nesting);
     }
     *result = NIL_P(array_class) ? rb_ary_new() : rb_class_new_instance(0, 0, array_class);
 
     %% write init;
     %% write exec;
 
     if(cs >= JSON_array_first_final) {
         return p + 1;
     } else {
         rb_raise(eParserError, "%u: unexpected token at '%s'", __LINE__, p);
+        return NULL;
     }
 }
 
-static VALUE json_string_unescape(char *p, char *pe)
+static VALUE json_string_unescape(VALUE result, char *string, char *stringEnd)
 {
-    VALUE result = rb_str_buf_new(pe - p + 1);
+    char *p = string, *pe = string, *unescape;
+    int unescape_len;
 
-    while (p < pe) {
-        if (*p == '\\') {
-            p++;
-            if (p >= pe) return Qnil; /* raise an exception later, \ at end */
-            switch (*p) {
+    while (pe < stringEnd) {
+        if (*pe == '\\') {
+            unescape = (char *) "?";
+            unescape_len = 1;
+            if (pe > p) rb_str_buf_cat(result, p, pe - p);
+            switch (*++pe) {
+                case 'n':
+                    unescape = (char *) "\n";
+                    break;
+                case 'r':
+                    unescape = (char *) "\r";
+                    break;
+                case 't':
+                    unescape = (char *) "\t";
+                    break;
                 case '"':
+                    unescape = (char *) "\"";
+                    break;
                 case '\\':
-                    rb_str_buf_cat(result, p, 1);
-                    p++;
+                    unescape = (char *) "\\";
                     break;
                 case 'b':
-                    rb_str_buf_cat2(result, "\b");
-                    p++;
+                    unescape = (char *) "\b";
                     break;
                 case 'f':
-                    rb_str_buf_cat2(result, "\f");
-                    p++;
+                    unescape = (char *) "\f";
                     break;
-                case 'n':
-                    rb_str_buf_cat2(result, "\n");
-                    p++;
-                    break;
-                case 'r':
-                    rb_str_buf_cat2(result, "\r");
-                    p++;
-                    break;
-                case 't':
-                    rb_str_buf_cat2(result, "\t");
-                    p++;
-                    break;
                 case 'u':
-                    if (p > pe - 4) { 
+                    if (pe > stringEnd - 4) { 
                         return Qnil;
                     } else {
-                        p = JSON_convert_UTF16_to_UTF8(result, p, pe, strictConversion);
+                        char buf[4];
+                        UTF32 ch = unescape_unicode((unsigned char *) ++pe);
+                        pe += 3;
+                        if (UNI_SUR_HIGH_START == (ch & 0xFC00)) {
+                            pe++;
+                            if (pe > stringEnd - 6) return Qnil;
+                            if (pe[0] == '\\' && pe[1] == 'u') {
+                                UTF32 sur = unescape_unicode((unsigned char *) pe + 2);
+                                ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
+                                        | (sur & 0x3FF));
+                                pe += 5;
+                            } else {
+                                unescape = (char *) "?";
+                                break;
+                            }
+                        }
+                        unescape_len = convert_UTF32_to_UTF8(buf, ch);
+                        unescape = buf;
                     }
                     break;
                 default:
-                    rb_str_buf_cat(result, p, 1);
-                    p++;
-                    break;
+                    p = pe;
+                    continue;
             }
+            rb_str_buf_cat(result, unescape, unescape_len);
+            p = ++pe;
         } else {
-            char *q = p;
-            while (*q != '\\' && q < pe) q++;
-            rb_str_buf_cat(result, p, q - p);
-            p = q;
+            pe++;
         }
     }
+    rb_str_buf_cat(result, p, pe - p);
     return result;
 }
 
 %%{
     machine JSON_string;
     include JSON_common;
 
     write data;
 
     action parse_string {
-        *result = json_string_unescape(json->memo + 1, p);
+        *result = json_string_unescape(*result, json->memo + 1, p);
         if (NIL_P(*result)) {
 			fhold;
 			fbreak;
 		} else {
 			FORCE_UTF8(*result);
@@ -430,15 +472,18 @@
 
 static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *result)
 {
     int cs = EVIL;
 
-    *result = rb_str_new("", 0);
+    *result = rb_str_buf_new(0);
     %% write init;
     json->memo = p;
     %% write exec;
 
+    if (json->symbolize_names && json->parsing_name) {
+      *result = rb_str_intern(*result);
+    }
     if (cs >= JSON_string_first_final) {
         return p + 1;
     } else {
         return NULL;
     }
@@ -482,10 +527,58 @@
  *
  * with the method parser= in JSON.
  *
  */
 
+/*
+ * Returns the string the parser should read: _source_ transcoded to (or, for
+ * binary data, labelled as) UTF-8. UTF-32/UTF-16 data is recognised by where
+ * NUL octets fall among the first four. Raises eParserError when _source_ has
+ * fewer than two octets.
+ */
+static VALUE convert_encoding(VALUE source)
+{
+    char *ptr = RSTRING_PTR(source);
+    long len = RSTRING_LEN(source);
+    if (len < 2) {
+        rb_raise(eParserError, "A JSON text must at least contain two octets!");
+    }
+#ifdef HAVE_RUBY_ENCODING_H
+    if (rb_funcall(source, i_encoding, 0) == CEncoding_ASCII_8BIT) {
+        VALUE guess = Qnil;
+        if (len >= 4 && ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) guess = CEncoding_UTF_32BE;
+        else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) guess = CEncoding_UTF_16BE;
+        else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) guess = CEncoding_UTF_32LE;
+        else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) guess = CEncoding_UTF_16LE;
+        /* FORCE_UTF8 and force_encoding relabel their receiver, so always work
+         * on a copy: the caller's string must keep its own encoding. */
+        source = rb_str_dup(source);
+        if (NIL_P(guess)) {
+            FORCE_UTF8(source);
+        } else {
+            rb_funcall(source, i_force_encoding, 1, guess);
+            source = rb_funcall(source, i_encode_bang, 1, CEncoding_UTF_8);
+        }
+    } else {
+        source = rb_funcall(source, i_encode, 1, CEncoding_UTF_8);
+    }
+#else
+    /* No string encodings in this build: convert through mJSON's iconv method. */
+    {
+        const char *from = NULL;
+        if (len >= 4 && ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) from = "utf-32be";
+        else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) from = "utf-16be";
+        else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) from = "utf-32le";
+        else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) from = "utf-16le";
+        if (from != NULL) {
+            source = rb_funcall(mJSON, i_iconv, 3, rb_str_new2("utf-8"), rb_str_new2(from), source);
+        }
+    }
+#endif
+    return source;
+}
+
 /*
  * call-seq: new(source, opts => {})
  *
  * Creates a new JSON::Ext::Parser instance for the string _source_.
  *
@@ -499,10 +592,13 @@
  *   structures. Disable depth checking with :max_nesting => false|nil|0, it
  *   defaults to 19.
  * * *allow_nan*: If set to true, allow NaN, Infinity and -Infinity in
  *   defiance of RFC 4627 to be parsed by the Parser. This option defaults to
  *   false.
+ * * *symbolize_names*: If set to true, returns symbols for the names
+ *   (keys) in a JSON object. Otherwise strings are returned, which is also
+ *   the default.
  * * *create_additions*: If set to false, the Parser doesn't create
  *   additions even if a matching class and create_id were found. This option
  *   defaults to true.
  * * *object_class*: Defaults to Hash
  * * *array_class*: Defaults to Array
@@ -510,25 +606,22 @@
 static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
 {
     char *ptr;
     long len;
     VALUE source, opts;
-    GET_STRUCT;
+    GET_PARSER;
     rb_scan_args(argc, argv, "11", &source, &opts);
-    source = StringValue(source);
+    source = convert_encoding(StringValue(source));
     ptr = RSTRING_PTR(source);
     len = RSTRING_LEN(source);
-    if (len < 2) {
-        rb_raise(eParserError, "A JSON text must at least contain two octets!");
-    }
     if (!NIL_P(opts)) {
         opts = rb_convert_type(opts, T_HASH, "Hash", "to_hash");
         if (NIL_P(opts)) {
             rb_raise(rb_eArgError, "opts needs to be like a hash");
         } else {
             VALUE tmp = ID2SYM(i_max_nesting);
-            if (st_lookup(RHASH_TBL(opts), tmp, 0)) {
+            if (option_given_p(opts, tmp)) {
                 VALUE max_nesting = rb_hash_aref(opts, tmp);
                 if (RTEST(max_nesting)) {
                     Check_Type(max_nesting, T_FIXNUM);
                     json->max_nesting = FIX2INT(max_nesting);
                 } else {
@@ -536,35 +629,42 @@
                 }
             } else {
                 json->max_nesting = 19;
             }
             tmp = ID2SYM(i_allow_nan);
-            if (st_lookup(RHASH_TBL(opts), tmp, 0)) {
+            if (option_given_p(opts, tmp)) {
                 VALUE allow_nan = rb_hash_aref(opts, tmp);
                 json->allow_nan = RTEST(allow_nan) ? 1 : 0;
             } else {
                 json->allow_nan = 0;
             }
+            tmp = ID2SYM(i_symbolize_names);
+            if (option_given_p(opts, tmp)) {
+                VALUE symbolize_names = rb_hash_aref(opts, tmp);
+                json->symbolize_names = RTEST(symbolize_names) ? 1 : 0;
+            } else {
+                json->symbolize_names = 0;
+            }
             tmp = ID2SYM(i_create_additions);
-            if (st_lookup(RHASH_TBL(opts), tmp, 0)) {
+            if (option_given_p(opts, tmp)) {
                 VALUE create_additions = rb_hash_aref(opts, tmp);
                 if (RTEST(create_additions)) {
                     json->create_id = rb_funcall(mJSON, i_create_id, 0);
                 } else {
                     json->create_id = Qnil;
                 }
             } else {
                 json->create_id = rb_funcall(mJSON, i_create_id, 0);
             }
             tmp = ID2SYM(i_object_class);
-            if (st_lookup(RHASH_TBL(opts), tmp, 0)) {
+            if (option_given_p(opts, tmp)) {
                 json->object_class = rb_hash_aref(opts, tmp);
             } else {
                 json->object_class = Qnil;
             }
             tmp = ID2SYM(i_array_class);
-            if (st_lookup(RHASH_TBL(opts), tmp, 0)) {
+            if (option_given_p(opts, tmp)) {
                 json->array_class = rb_hash_aref(opts, tmp);
             } else {
                 json->array_class = Qnil;
             }
         }
@@ -574,22 +674,10 @@
         json->create_id = rb_funcall(mJSON, i_create_id, 0);
         json->object_class = Qnil;
         json->array_class = Qnil;
     }
     json->current_nesting = 0;
-    /*
-       Convert these?
-    if (len >= 4 &&  ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0) {
-        rb_raise(eParserError, "Only UTF8 octet streams are supported atm!");
-    } else if (len >= 4 && ptr[0] == 0 && ptr[2] == 0) {
-        rb_raise(eParserError, "Only UTF8 octet streams are supported atm!");
-    } else if (len >= 4 && ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0) {
-        rb_raise(eParserError, "Only UTF8 octet streams are supported atm!");
-    } else if (len >= 4 && ptr[1] == 0 && ptr[3] == 0) {
-        rb_raise(eParserError, "Only UTF8 octet streams are supported atm!");
-    }
-    */
     json->len = len;
     json->source = ptr;
     json->Vsource = source;
     return self;
 }
@@ -603,25 +691,26 @@
 static VALUE cParser_parse(VALUE self)
 {
     char *p, *pe;
     int cs = EVIL;
     VALUE result = Qnil;
-    GET_STRUCT;
+    GET_PARSER;
 
     %% write init;
     p = json->source;
     pe = p + json->len;
     %% write exec;
 
     if (cs >= JSON_first_final && p == pe) {
         return result;
     } else {
         rb_raise(eParserError, "%u: unexpected token at '%s'", __LINE__, p);
+        return Qnil;
     }
 }
 
-inline static JSON_Parser *JSON_allocate()
+static JSON_Parser *JSON_allocate()
 {
     JSON_Parser *json = ALLOC(JSON_Parser);
     MEMZERO(json, JSON_Parser, 1);
     return json;
 }
@@ -651,11 +740,11 @@
  * Returns a copy of the current _source_ string, that was used to construct
  * this Parser.
  */
 static VALUE cParser_source(VALUE self)
 {
-    GET_STRUCT;
+    GET_PARSER;
     return rb_str_dup(json->Vsource);
 }
 
 void Init_parser()
 {
@@ -679,8 +768,25 @@
     i_create_id = rb_intern("create_id");
     i_create_additions = rb_intern("create_additions");
     i_chr = rb_intern("chr");
     i_max_nesting = rb_intern("max_nesting");
     i_allow_nan = rb_intern("allow_nan");
+    i_symbolize_names = rb_intern("symbolize_names");
     i_object_class = rb_intern("object_class");
     i_array_class = rb_intern("array_class");
+    i_key_p = rb_intern("key?");
+    i_deep_const_get = rb_intern("deep_const_get");
+#ifdef HAVE_RUBY_ENCODING_H
+    CEncoding_UTF_8 = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-8"));
+    CEncoding_UTF_16BE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-16be"));
+    CEncoding_UTF_16LE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-16le"));
+    CEncoding_UTF_32BE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-32be"));
+    CEncoding_UTF_32LE = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("utf-32le"));
+    CEncoding_ASCII_8BIT = rb_funcall(rb_path2class("Encoding"), rb_intern("find"), 1, rb_str_new2("ascii-8bit"));
+    i_encoding = rb_intern("encoding");
+    i_encode = rb_intern("encode");
+    i_encode_bang = rb_intern("encode!");
+    i_force_encoding = rb_intern("force_encoding");
+#else
+    i_iconv = rb_intern("iconv");
+#endif
 }