ext/utf8/string_utf8.c in utf8-0.1.2 vs ext/utf8/string_utf8.c in utf8-0.1.3

- old
+ new

@@ -2,17 +2,17 @@
 #include "utf8.h"

 extern VALUE intern_as_utf8;

 /*
- * Document-class: String::UTF8
+ * Document-class: String::UTF-8
  */

 /*
  * call-seq: length
  *
- * Returns the number of UTF8 characters in this string
+ * Returns: a Fixnum - the number of UTF-8 characters in this string
  */
 static VALUE rb_cString_UTF8_length(VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self);
   size_t len = RSTRING_LEN(self);
   int64_t utf8_len = 0;
@@ -26,11 +26,13 @@
 }

 /*
  * call-seq: each_char {|utf8_char| ...}
  *
- * Iterates over the string, yielding one UTF8 character at a time
+ * Iterates over the string, yielding one UTF-8 character at a time
+ *
+ * Returns: self
  */
 static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self);
   size_t len = RSTRING_LEN(self), i=0;
   int8_t lastCharLen=0;
@@ -54,12 +56,80 @@

   return self;
 }

 /*
- * Works like String#[] but taking into account UTF8 character boundaries
+ * call-seq: each_codepoint {|utf8_codepoint| ...}
  *
+ * Iterates over the string, yielding one UTF-8 codepoint at a time
+ *
+ * Returns: self
+ */
+static VALUE rb_cString_UTF8_each_codepoint(int argc, VALUE *argv, VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self), i=0;
+  int8_t lastCharLen=0;
+  int32_t cp;
+
+  // this will return an Enumerator wrapping this string, yielding this method
+  // when Enumerator#each is called
+  if (!rb_block_given_p()) {
+    return rb_funcall(self, rb_intern("to_enum"), 1, ID2SYM(rb_intern("each_codepoint")));
+  }
+
+  for(; i<len; i+=lastCharLen) {
+    lastCharLen = utf8CharLen(str, len);
+    if (lastCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
+    cp = utf8CharToCodepoint(str+i, lastCharLen);
+    rb_yield(INT2FIX(cp));
+  }
+
+  return self;
+}
+
+/*
+ * call-seq: valid?(max_codepoint=nil)
+ *
+ * Iterates over the string, yielding one UTF-8 codepoint at a time
+ *
+ * max_codepoint - an optional Fixnum used to declare this string invalid
+ *                 if a codepoint higher than that value is found
+ *                 if nothing is passed, the UTF-8 maximum of 0x10FFFF is assumed
+ *
+ * Returns: a Boolean - true if the string is valid, false if not
+ */
+static VALUE rb_cString_UTF8_valid(int argc, VALUE *argv, VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self), i=0;
+  int8_t lastCharLen=0;
+  int32_t cp, cp_max = -1;
+  VALUE rb_cp_max;
+
+  if (rb_scan_args(argc, argv, "01", &rb_cp_max) == 1) {
+    Check_Type(rb_cp_max, T_FIXNUM);
+    cp_max = FIX2INT(rb_cp_max);
+  }
+
+  for(; i<len; i+=lastCharLen) {
+    lastCharLen = utf8CharLen(str, len);
+    if (lastCharLen < 0) {
+      return Qfalse;
+    }
+    cp = utf8CharToCodepoint(str+i, lastCharLen);
+    if (cp_max >= 0 && cp > cp_max) {
+      return Qfalse;
+    }
+  }
+
+  return Qtrue;
+}
+
+/*
+ * Works like String#[] but taking into account UTF-8 character boundaries
+ *
  * This method doesn't currently (and may never) support Regexp parameters
  * It also doesn't support a String parameter (yet)
  */
 static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self), *start = str;
@@ -261,6 +331,8 @@
   VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);

   rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
   rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
   rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
+  rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
+  rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
 }
\ No newline at end of file