ext/utf8/string_utf8.c in utf8-0.1.2 vs ext/utf8/string_utf8.c in utf8-0.1.3
- old
+ new
@@ -2,17 +2,17 @@
#include "utf8.h"
extern VALUE intern_as_utf8;
/*
- * Document-class: String::UTF8
+ * Document-class: String::UTF8
*/
/*
* call-seq: length
*
- * Returns the number of UTF8 characters in this string
+ * Returns: a Fixnum - the number of UTF-8 characters in this string
*/
static VALUE rb_cString_UTF8_length(VALUE self) {
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
size_t len = RSTRING_LEN(self);
int64_t utf8_len = 0;
@@ -26,11 +26,13 @@
}
/*
* call-seq: each_char {|utf8_char| ...}
*
- * Iterates over the string, yielding one UTF8 character at a time
+ * Iterates over the string, yielding one UTF-8 character at a time
+ *
+ * Returns: self
*/
static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
size_t len = RSTRING_LEN(self), i=0;
int8_t lastCharLen=0;
@@ -54,12 +56,80 @@
return self;
}
/*
- * Works like String#[] but taking into account UTF8 character boundaries
+ * call-seq: each_codepoint {|utf8_codepoint| ...}
*
+ * Iterates over the string, yielding one UTF-8 codepoint at a time
+ *
+ * Returns: self
+ */
+static VALUE rb_cString_UTF8_each_codepoint(int argc, VALUE *argv, VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self), i=0;
+  int8_t lastCharLen=0;
+  int32_t cp;
+
+  // without a block, hand back the result of to_enum(:each_codepoint) so that
+  // this method is invoked again once Enumerator#each supplies a block
+  if (!rb_block_given_p()) {
+    return rb_funcall(self, rb_intern("to_enum"), 1, ID2SYM(rb_intern("each_codepoint")));
+  }
+
+  // i is a byte offset into the string; it advances by the byte length of the
+  // character that was just decoded
+  for(; i<len; i+=lastCharLen) {
+    // measure the character that starts at the current offset (not at the start
+    // of the string), so the length agrees with the bytes decoded below
+    // NOTE(review): assumes utf8CharLen's second argument is the number of
+    // bytes available at the pointer it is given - confirm against utf8.h
+    lastCharLen = utf8CharLen(str+i, len-i);
+    if (lastCharLen < 0) {
+      // a negative length is treated as a malformed sequence
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
+    cp = utf8CharToCodepoint(str+i, lastCharLen);
+    rb_yield(INT2FIX(cp));
+  }
+
+  return self;
+}
+
+/*
+ * call-seq: valid?(max_codepoint=nil)
+ *
+ * Walks the string one UTF-8 character at a time and reports whether every
+ * character is well formed
+ *
+ * max_codepoint - an optional Fixnum used to declare this string invalid
+ *                 if a codepoint higher than that value is found
+ *                 if nothing (or nil) is passed, or the value is negative,
+ *                 this method applies no codepoint ceiling of its own
+ *
+ * Raises if max_codepoint is given and is neither nil nor a Fixnum
+ *
+ * Returns: a Boolean - true if the string is valid, false if not
+ */
+static VALUE rb_cString_UTF8_valid(int argc, VALUE *argv, VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self), i=0;
+  int8_t lastCharLen=0;
+  // cp_max stays negative when no ceiling was requested, which disables the
+  // comparison inside the loop
+  int32_t cp, cp_max = -1;
+  VALUE rb_cp_max;
+
+  // an explicit nil is treated the same as omitting the argument, matching the
+  // max_codepoint=nil default advertised in the call-seq
+  if (rb_scan_args(argc, argv, "01", &rb_cp_max) == 1 && !NIL_P(rb_cp_max)) {
+    Check_Type(rb_cp_max, T_FIXNUM);
+    cp_max = FIX2INT(rb_cp_max);
+  }
+
+  // i is a byte offset into the string; it advances by the byte length of the
+  // character that was just checked
+  for(; i<len; i+=lastCharLen) {
+    // measure the character that starts at the current offset (not at the start
+    // of the string), so the length agrees with the bytes decoded below
+    // NOTE(review): assumes utf8CharLen's second argument is the number of
+    // bytes available at the pointer it is given - confirm against utf8.h
+    lastCharLen = utf8CharLen(str+i, len-i);
+    if (lastCharLen < 0) {
+      // a negative length is treated as a malformed sequence
+      return Qfalse;
+    }
+    cp = utf8CharToCodepoint(str+i, lastCharLen);
+    if (cp_max >= 0 && cp > cp_max) {
+      return Qfalse;
+    }
+  }
+
+  return Qtrue;
+}
+
+/*
+ * Works like String#[] but taking into account UTF-8 character boundaries
+ *
* This method doesn't currently (and may never) support Regexp parameters
* It also doesn't support a String parameter (yet)
*/
static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
unsigned char *str = (unsigned char *)RSTRING_PTR(self), *start = str;
@@ -261,6 +331,8 @@
VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
rb_define_method(rb_cString_UTF8, "length", rb_cString_UTF8_length, 0);
rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
rb_define_method(rb_cString_UTF8, "[]", rb_cString_UTF8_slice, -1);
+ rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
+ rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
}
\ No newline at end of file