ext/riconv.c in rjb-1.5.9 vs ext/riconv.c in rjb-1.6.0

- old
+ new

@@ -165,20 +165,142 @@ else { objIconvR2J = objIconvJ2R = Qnil; } } +#else +VALUE cEncoding = Qnil; +VALUE encoding_utf8 = Qnil; +static void init_encoding_vars() +{ + cEncoding = rb_const_get(rb_cObject, rb_intern("Encoding")); + encoding_utf8 = rb_const_get(cEncoding, rb_intern("UTF_8")); +} +static int contains_surrogate_pair(const unsigned char* p) +{ + while (*p) + { + switch (*p & 0xf0) + { + case 0xf0: + return 1; + case 0xe0: + p += 3; + break; + default: + p += (*p & 0x80) ? 2 : 1; + } + } + return 0; +} +static int contains_auxchar(const unsigned char* p) +{ + while (*p) + { + if (*p == 0xed) + { +#if defined(DEBUG) + printf("find %02x %02x %02x %02x %02x %02x\n", *p, *(p + 1), *(p + 2), *(p + 3), *(p + 4), *(p + 5)); #endif + return 1; + } + switch (*p & 0xe0) + { + case 0xe0: + p++; + case 0xc0: + p++; + default: + p++; + } + } + return 0; +} +static VALUE encode_to_cesu8(const unsigned char* p) +{ + size_t len = strlen(p); + char* newstr = ALLOCA_N(char, len + (len + 1) / 2); + char* dest = newstr; + int sval, i; + while (*p) + { + switch (*p & 0xf0) + { + case 0xf0: + sval = *p++ & 7; + for (i = 0; i < 3; i++) + { + sval <<= 6; + sval |= (*p++ & 0x3f); + } + *dest++ = '\xed'; + *dest++ = 0xa0 | (((sval >> 16) - 1) & 0x0f); + *dest++ = 0x80 | ((sval >> 10) & 0x3f); + *dest++ = '\xed'; + *dest++ = 0xb0 | ((sval >> 6) & 0x0f); + *dest++ = 0x80 | (sval & 0x3f); + break; + case 0xe0: + *dest++ = *p++; + case 0xc0: + case 0xc1: + *dest++ = *p++; + default: + *dest++ = *p++; + } + } + return rb_str_new(newstr, dest - newstr); +} +static VALUE encode_to_utf8(const unsigned char* p) +{ + size_t len = strlen(p); + char* newstr = ALLOCA_N(char, len); + char* dest = newstr; + int sval, i; + while (*p) + { + if (*p == 0xed) + { + char v = *(p + 1); + char w = *(p + 2); + char y = *(p + 4); + char z = *(p + 5); + p += 6; + sval = 0x10000 + ((v & 0x0f) << 16) + ((w & 0x3f) << 10) + ((y & 0x0f) << 6) + (z & 0x3f); + sval = (((v + 1) & 0x0f) << 16) + ((w & 0x3f) << 10) + ((y & 0x0f) << 6) + (z & 0x3f); + *dest++ = 0xf0 | ((sval >> 18)); + *dest++ = 0x80 | ((sval >> 12) & 0x3f); + *dest++ = 0x80 | ((sval >> 6) & 0x3f); + *dest++ = 0x80 | (sval & 0x3f); + continue; + } + switch (*p & 0xe0) + { + case 0xe0: + *dest++ = *p++; + case 0xc0: + case 0xc1: + *dest++ = *p++; + default: + *dest++ = *p++; + } + } + return rb_str_new(newstr, dest - newstr); +} +#endif + #if defined(DEBUG) static void debug_out(VALUE v) { char* p = StringValuePtr(v); printf("-- %d, %d, %s\n", rb_num2long(rb_funcall(v, rb_intern("size"), 0)), strlen(p), p); fflush(stdout); } +#else +#define debug_out(n) #endif VALUE exticonv_local_to_utf8(VALUE local_string) { #if RJB_RUBY_VERSION_CODE < 190 @@ -190,27 +312,28 @@ else { return local_string; } #else - VALUE cEncoding, encoding, utf8; - cEncoding = rb_const_get(rb_cObject, rb_intern("Encoding")); + VALUE encoding; + if (NIL_P(cEncoding)) + { + init_encoding_vars(); + } encoding = rb_funcall(local_string, rb_intern("encoding"), 0); - utf8 = rb_const_get(cEncoding, rb_intern("UTF_8")); - if (encoding != utf8) + if (encoding != encoding_utf8) { - VALUE ret = rb_funcall(local_string, rb_intern("encode"), 2, utf8, encoding); -#if defined(DEBUG) + VALUE ret = rb_funcall(local_string, rb_intern("encode"), 2, encoding_utf8, encoding); debug_out(local_string); debug_out(ret); -#endif - return ret; + local_string = ret; } - else + if (contains_surrogate_pair(StringValuePtr(local_string))) { - return local_string; + local_string = encode_to_cesu8(StringValuePtr(local_string)); } + return local_string; #endif } VALUE exticonv_utf8_to_local(VALUE utf8_string) { @@ -223,8 +346,16 @@ else { return utf8_string; } #else - return rb_funcall(utf8_string, rb_intern("force_encoding"), 1, rb_const_get(rb_cEncoding, rb_intern("UTF_8"))); + if (NIL_P(cEncoding)) + { + init_encoding_vars(); + } + if (contains_auxchar(StringValuePtr(utf8_string))) + { + utf8_string = encode_to_utf8(StringValuePtr(utf8_string)); + } + return rb_funcall(utf8_string, rb_intern("force_encoding"), 1, encoding_utf8); #endif }