ext/utf8/string_utf8.c in utf8-0.1.3 vs ext/utf8/string_utf8.c in utf8-0.1.4

- old
+ new

@@ -1,10 +1,12 @@
 #include "ext.h"
 #include "utf8.h"

 extern VALUE intern_as_utf8;

+#define REPLACEMENT_CHAR '?' /* byte that clean stores in place of each byte it rejects */
+
 /*
  * Document-class: String::UTF-8
  */

 /*
@@ -110,11 +112,11 @@
     Check_Type(rb_cp_max, T_FIXNUM);
     cp_max = FIX2INT(rb_cp_max);
   }

   for(; i<len; i+=lastCharLen) {
-    lastCharLen = utf8CharLen(str, len);
+    lastCharLen = utf8CharLen(str+i, len);
     if (lastCharLen < 0) {
       return Qfalse;
     }
     cp = utf8CharToCodepoint(str+i, lastCharLen);
     if (cp_max >= 0 && cp > cp_max) {
@@ -325,14 +327,51 @@
     AS_UTF8(utf8Str);
     return utf8Str;
   }
 }

+/*
+ * call-seq: clean
+ *
+ * Iterates over the string, replacing invalid UTF-8 characters with '?'
+ * (REPLACEMENT_CHAR).
+ *
+ * The receiver is only read: bytes are written into a scratch buffer of the
+ * same byte length and a new String is built from that buffer, so the
+ * result always has as many bytes as the receiver. AS_UTF8 (defined
+ * elsewhere) is applied to the result before it is returned.
+ *
+ * For each byte offset i, utf8CharLen(str+i, len) is called; a negative
+ * result stores the replacement byte at offset i, any other result copies
+ * the single byte at offset i unchanged.
+ *
+ * NOTE(review): the loop steps by one byte, not by the reported character
+ * length, so utf8CharLen is also asked about offsets inside a multi-byte
+ * sequence - confirm it accepts those offsets, otherwise bytes of valid
+ * characters get replaced too.
+ * NOTE(review): len is passed as-is for every offset rather than len-i -
+ * confirm whether utf8CharLen expects the remaining byte count.
+ * NOTE(review): the xmalloc buffer is never released in this function -
+ * presumably rb_str_new copies from it, in which case an xfree is missing;
+ * verify.
+ *
+ * Returns: a new String
+ */
+static VALUE rb_cString_UTF8_clean(VALUE self) {
+  unsigned char *str;
+  unsigned char *out;
+  unsigned char replace;
+  size_t len;
+  int8_t curCharLen;
+  size_t i;
+  VALUE rb_out;
+
+  str = (unsigned char *)RSTRING_PTR(self);
+  len = RSTRING_LEN(self);
+  replace = REPLACEMENT_CHAR;
+  out = xmalloc(len); /* scratch copy, same byte length as the receiver */
+
+  for(i=0; i<len; i++) { /* one byte per iteration, not one character */
+    curCharLen = utf8CharLen(str+i, len);
+    if (curCharLen < 0) {
+      *(out+i) = replace;
+    } else {
+      *(out+i) = *(str+i);
+    }
+  }
+
+  rb_out = rb_str_new((const char*)out, len);
+  AS_UTF8(rb_out);
+
+  return rb_out;
+}
+
 void init_String_UTF8() { /* defines UTF8 under rb_cString, with rb_cString as superclass, and registers its methods */
   VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);

   rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
   rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
   rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
   rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
   rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
+  rb_define_method(rb_cString_UTF8, "clean", rb_cString_UTF8_clean, 0); /* added by this diff; registered with argc 0 */
 }
\ No newline at end of file