string_utf8.c in utf8-0.1.4

- old
+ new

@@ -1,10 +1,12 @@
 #include "ext.h"
 #include "utf8.h"
 
 extern VALUE intern_as_utf8;
 
+#define REPLACEMENT_CHAR '?'
+
 /*
  * Document-class: String::UTF-8
  */
 
 /*
@@ -110,11 +112,11 @@
     Check_Type(rb_cp_max, T_FIXNUM);
     cp_max = FIX2INT(rb_cp_max);
   }
 
   for(; i<len; i+=lastCharLen) {
-    lastCharLen = utf8CharLen(str, len);
+    lastCharLen = utf8CharLen(str+i, len);
     if (lastCharLen < 0) {
       return Qfalse;
     }
     cp = utf8CharToCodepoint(str+i, lastCharLen);
     if (cp_max >= 0 && cp > cp_max) {
@@ -325,14 +327,51 @@
     AS_UTF8(utf8Str);
     return utf8Str;
   }
 }
 
+/*
+ * call-seq: clean
+ *
+ * Copies the string byte by byte, writing '?' (REPLACEMENT_CHAR) at every
+ * byte offset where utf8CharLen reports an invalid sequence.
+ *
+ * Returns: a new String of the same byte length
+ */
+static VALUE rb_cString_UTF8_clean(VALUE self) {
+  unsigned char *str;
+  unsigned char *out;
+  size_t len, i;
+  int8_t curCharLen;
+  VALUE rb_out;
+
+  str = (unsigned char *)RSTRING_PTR(self);
+  len = RSTRING_LEN(self);
+  out = xmalloc(len);
+
+  for(i=0; i<len; i++) {
+    /* NOTE(review): passes the full len although str is advanced by i; confirm utf8CharLen does not expect len-i */
+    curCharLen = utf8CharLen(str+i, len);
+    if (curCharLen < 0) {
+      *(out+i) = REPLACEMENT_CHAR;
+    } else {
+      *(out+i) = *(str+i);
+    }
+  }
+
+  rb_out = rb_str_new((const char*)out, len);
+  xfree(out); /* rb_str_new copied the bytes; release the scratch buffer */
+  AS_UTF8(rb_out);
+
+  return rb_out;
+}
+
 void init_String_UTF8() {
   VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
 
   rb_define_method(rb_cString_UTF8, "length",    rb_cString_UTF8_length, 0);
   rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
   rb_define_method(rb_cString_UTF8, "[]",        rb_cString_UTF8_slice, -1);
   rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
   rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
+  rb_define_method(rb_cString_UTF8, "clean", rb_cString_UTF8_clean, 0); /* String::UTF8#clean, registered with argc 0 (no arguments) */
 }
\ No newline at end of file