ext/utf8/string_utf8.c in utf8-0.1.6 vs ext/utf8/string_utf8.c in utf8-0.1.7
- old
+ new
@@ -91,11 +91,11 @@
}
/*
* call-seq: valid?(max_codepoint=nil)
*
- * Iterates over the string, yielding one UTF-8 codepoint at a time
+ * Iterates over the string, returning true if it is valid UTF-8 and false otherwise
*
* max_codepoint - an optional Fixnum used to declare this string invalid
* if a codepoint higher than that value is found.
* If nothing is passed, the UTF-8 maximum of 0x10FFFF is assumed.
*
@@ -335,46 +335,75 @@
* Iterates over the string, replacing invalid UTF-8 characters with '?'
*
* Returns: a new String
*/
static VALUE rb_cString_UTF8_clean(VALUE self) {
- unsigned char *str;
- unsigned char *out;
- unsigned char replace;
- size_t len;
+ unsigned char *inBuf, *inBufCur;
+ unsigned char *outBuf, *outBufCur;
+ size_t len, i;
int8_t curCharLen;
- size_t i;
VALUE rb_out;
- str = (unsigned char *)RSTRING_PTR(self);
+ inBuf = (unsigned char *)RSTRING_PTR(self);
+ inBufCur = inBuf;
len = RSTRING_LEN(self);
- replace = REPLACEMENT_CHAR;
- out = xmalloc(len);
+ outBuf = malloc(len);
+ outBufCur = outBuf;
for(i=0; i<len; i+=curCharLen) {
- curCharLen = utf8CharLen(str+i, len);
+ curCharLen = utf8CharLen(inBufCur, len);
if (curCharLen < 0) {
- *(out+i) = replace;
+ if (inBufCur-inBuf > 0) {
+ memcpy(outBufCur, inBuf, inBufCur-inBuf);
+ outBufCur += inBufCur-inBuf;
+ }
+ *outBufCur++ = REPLACEMENT_CHAR;
+ inBuf += (inBufCur-inBuf)+1;
curCharLen = 1;
- } else {
- memcpy(out+i, str+i, curCharLen);
}
+
+ inBufCur += curCharLen;
}
- rb_out = rb_str_new((const char*)out, len);
+ if (inBufCur-inBuf > 0) {
+ memcpy(outBufCur, inBuf, inBufCur-inBuf);
+ }
+
+ rb_out = rb_str_new((const char*)outBuf, len);
AS_UTF8(rb_out);
- xfree(out);
+ free(outBuf);
return rb_out;
}
+/*
+ * call-seq: ascii_only?
+ *
+ * Iterates over the string's bytes, returning true if every byte is within the low ASCII range (0x00-0x7F) and false otherwise
+ *
+ * Returns: a Boolean - true if the string is within the low ASCII range, false if not
+ */
+static VALUE rb_cString_UTF8_ascii_only(VALUE self) {
+ unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+ size_t len = RSTRING_LEN(self), i=0;
+
+ for(; i<len; i+=1) {
+ if (str[i] > 0x7f) {
+ return Qfalse;
+ }
+ }
+
+ return Qtrue;
+}
+
void init_String_UTF8() {
VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
- rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
+ rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
- rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
+ rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
rb_define_method(rb_cString_UTF8, "clean", rb_cString_UTF8_clean, 0);
+ rb_define_method(rb_cString_UTF8, "ascii_only?", rb_cString_UTF8_ascii_only, 0);
}
\ No newline at end of file