ext/utf8/string_utf8.c in utf8-0.1.3 vs ext/utf8/string_utf8.c in utf8-0.1.4
- old
+ new
@@ -1,10 +1,12 @@
#include "ext.h"
#include "utf8.h"
extern VALUE intern_as_utf8;
+#define REPLACEMENT_CHAR '?'
+
/*
* Document-class: String::UTF8
*/
/*
@@ -110,11 +112,11 @@
Check_Type(rb_cp_max, T_FIXNUM);
cp_max = FIX2INT(rb_cp_max);
}
for(; i<len; i+=lastCharLen) {
- lastCharLen = utf8CharLen(str, len);
+ lastCharLen = utf8CharLen(str+i, len);
if (lastCharLen < 0) {
return Qfalse;
}
cp = utf8CharToCodepoint(str+i, lastCharLen);
if (cp_max >= 0 && cp > cp_max) {
@@ -325,14 +327,51 @@
AS_UTF8(utf8Str);
return utf8Str;
}
}
+/*
+ * call-seq: clean
+ *
+ * Iterates over the string, replacing invalid UTF-8 characters with '?'
+ *
+ * Returns: a new String
+ */
+static VALUE rb_cString_UTF8_clean(VALUE self) {
+ unsigned char *str;
+ unsigned char *out;
+ unsigned char replace;
+ size_t len;
+ int8_t curCharLen;
+ size_t i;
+ VALUE rb_out;
+
+ str = (unsigned char *)RSTRING_PTR(self);
+ len = RSTRING_LEN(self);
+ replace = REPLACEMENT_CHAR;
+ out = xmalloc(len);
+
+ for(i=0; i<len; i++) {
+ curCharLen = utf8CharLen(str+i, len);
+ if (curCharLen < 0) {
+ *(out+i) = replace;
+ } else {
+ *(out+i) = *(str+i);
+ }
+ }
+
+ rb_out = rb_str_new((const char*)out, len);
+ AS_UTF8(rb_out);
+
+ return rb_out;
+}
+
/*
 * Registers the UTF8 class, nested under String and using String as its
 * superclass, and attaches its instance methods.
 *
 * The last argument of each rb_define_method call is the arity passed to the
 * Ruby C API: 0 for methods that take no arguments, -1 for methods that
 * receive a variable argument list.
 */
void init_String_UTF8() {
VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
+ rb_define_method(rb_cString_UTF8, "clean", rb_cString_UTF8_clean, 0);
}
\ No newline at end of file