string_utf8.c in utf8-0.1.3

- old
+ new

@@ -2,17 +2,17 @@
 #include "utf8.h"
 
 extern VALUE intern_as_utf8;
 
 /*
- * Document-class: String::UTF8
+ * Document-class: String::UTF8
  */
 
 /*
  * call-seq: length
  *
- * Returns the number of UTF8 characters in this string
+ * Returns: a Fixnum - the number of UTF-8 characters in this string
  */
 static VALUE rb_cString_UTF8_length(VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self);
   size_t len = RSTRING_LEN(self);
   int64_t utf8_len = 0;
@@ -26,11 +26,13 @@
 }
 
 /*
  * call-seq: each_char {|utf8_char| ...}
  *
- * Iterates over the string, yielding one UTF8 character at a time
+ * Iterates over the string, yielding one UTF-8 character at a time
+ *
+ * Returns: self
  */
 static VALUE rb_cString_UTF8_each_char(int argc, VALUE *argv, VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self);
   size_t len = RSTRING_LEN(self), i=0;
   int8_t lastCharLen=0;
@@ -54,12 +56,80 @@
 
   return self;
 }
 
 /*
- * Works like String#[] but taking into account UTF8 character boundaries
+ * call-seq: each_codepoint {|utf8_codepoint| ...}
  *
+ * Iterates over the string, yielding one UTF-8 codepoint at a time
+ *
+ * Returns: self
+ */
+static VALUE rb_cString_UTF8_each_codepoint(int argc, VALUE *argv, VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self), i=0;
+  int8_t lastCharLen=0;
+  int32_t cp;
+
+  // without a block, hand back an Enumerator that calls this method again
+  // when Enumerator#each is invoked
+  if (!rb_block_given_p()) {
+    return rb_funcall(self, rb_intern("to_enum"), 1, ID2SYM(rb_intern("each_codepoint")));
+  }
+
+  for(; i<len; i+=lastCharLen) {
+    // measure the character at offset i (not the first one in the string)
+    lastCharLen = utf8CharLen(str+i, len-i);
+    if (lastCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
+    cp = utf8CharToCodepoint(str+i, lastCharLen);
+    rb_yield(INT2FIX(cp));
+  }
+  return self;
+}
+
+/*
+ * call-seq: valid?(max_codepoint=nil)
+ *
+ * Checks that the string decodes as UTF-8 from its first byte to its last
+ *
+ * max_codepoint - an optional Fixnum used to declare this string invalid
+ *                 if a codepoint higher than that value is found
+ *                 if nothing is passed, the UTF-8 maximum of 0x10FFFF is assumed
+ *
+ * Returns: a Boolean - true if the string is valid, false if not
+ */
+static VALUE rb_cString_UTF8_valid(int argc, VALUE *argv, VALUE self) {
+  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
+  size_t len = RSTRING_LEN(self), i=0;
+  int8_t lastCharLen=0;
+  int32_t cp, cp_max = 0x10FFFF; // documented default: the UTF-8 maximum
+  VALUE rb_cp_max;
+
+  if (rb_scan_args(argc, argv, "01", &rb_cp_max) == 1) {
+    Check_Type(rb_cp_max, T_FIXNUM);
+    cp_max = FIX2INT(rb_cp_max);
+  }
+
+  for(; i<len; i+=lastCharLen) {
+    lastCharLen = utf8CharLen(str+i, len-i); // char at offset i, not the first
+    if (lastCharLen < 0) {
+      return Qfalse;
+    }
+    cp = utf8CharToCodepoint(str+i, lastCharLen);
+    if (cp_max >= 0 && cp > cp_max) { // a negative max disables this check
+      return Qfalse;
+    }
+  }
+
+  return Qtrue;
+}
+
+/*
+ * Works like String#[] but taking into account UTF-8 character boundaries
+ *
  * This method doesn't currently (and may never) support Regexp parameters
  * It also doesn't support a String parameter (yet)
  */
 static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self), *start = str;
@@ -261,6 +331,8 @@
   VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);
 
   rb_define_method(rb_cString_UTF8, "length",    rb_cString_UTF8_length, 0);
   rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, -1);
   rb_define_method(rb_cString_UTF8, "[]",        rb_cString_UTF8_slice, -1);
+  rb_define_method(rb_cString_UTF8, "each_codepoint", rb_cString_UTF8_each_codepoint, -1);
+  rb_define_method(rb_cString_UTF8, "valid?", rb_cString_UTF8_valid, -1);
 }
\ No newline at end of file