ext/utf8/string_utf8.c in utf8-0.1.6 vs ext/utf8/string_utf8.c in utf8-0.1.7

- old
+ new

@@ -91,11 +91,11 @@
 }

 /*
  * call-seq: valid?(max_codepoint=nil)
  *
- * Iterates over the string, yielding one UTF-8 codepoint at a time
+ * Iterates over the string, returning true/false if it's valid UTF-8
  *
  * max_codepoint - an optional Fixnum used to declare this string invalid
  * if a codepoint higher than that value is found
  * if nothing is passed, the UTF-8 maximum of 0x10FFFF is assumed
  *
@@ -335,46 +335,75 @@
  * Iterates over the string, replacing invalid UTF-8 characters with '?'
  *
  * Returns: a new String
  */
 static VALUE rb_cString_UTF8_clean(VALUE self) {
-  unsigned char *str;
-  unsigned char *out;
-  unsigned char replace;
-  size_t len;
+  unsigned char *inBuf, *inBufCur;
+  unsigned char *outBuf, *outBufCur;
+  size_t len, i;
   int8_t curCharLen;
-  size_t i;
   VALUE rb_out;

-  str = (unsigned char *)RSTRING_PTR(self);
+  inBuf = (unsigned char *)RSTRING_PTR(self);
+  inBufCur = inBuf;
   len = RSTRING_LEN(self);
-  replace = REPLACEMENT_CHAR;
-  out = xmalloc(len);
+  outBuf = malloc(len);
+  outBufCur = outBuf;

   for(i=0; i<len; i+=curCharLen) {
-    curCharLen = utf8CharLen(str+i, len);
+    curCharLen = utf8CharLen(inBufCur, len);
     if (curCharLen < 0) {
-      *(out+i) = replace;
+      if (inBufCur-inBuf > 0) {
+        memcpy(outBufCur, inBuf, inBufCur-inBuf);
+        outBufCur += inBufCur-inBuf;
+      }
+      *outBufCur++ = REPLACEMENT_CHAR;
+      inBuf += (inBufCur-inBuf)+1;
       curCharLen = 1;
-    } else {
-      memcpy(out+i, str+i, curCharLen);
     }
+
+    inBufCur += curCharLen;
   }

-  rb_out = rb_str_new((const char*)out, len);
+  if (inBufCur-inBuf > 0) {
+    memcpy(outBufCur, inBuf, inBufCur-inBuf);
+  }
+
+  rb_out = rb_str_new((const char*)outBuf, len);
   AS_UTF8(rb_out);

-  xfree(out);
+  free(outBuf);

   return rb_out;
 }

+/*
+ * call-seq: clean
+ *
+ * Iterates over the string, returning true/false if it's within the low ASCII range
+ *
+ * Returns: a Boolean - true if the string is within the low ASCII range, false if not
+ */
+static VALUE rb_cString_UTF8_ascii_only(VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self), i=0;
+
+  for(; i<len; i+=1) {
+    if (str[i] > 0x7f) {
+      return Qfalse;
+    }
+  }
+
+  return Qtrue;
+}
+
 void init_String_UTF8() {
   VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);

-  rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
+  rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
   rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
-  rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
+  rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
   rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
   rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
   rb_define_method(rb_cString_UTF8, "clean", rb_cString_UTF8_clean, 0);
+  rb_define_method(rb_cString_UTF8, "ascii_only?", rb_cString_UTF8_ascii_only, 0);
 }
\ No newline at end of file