string_utf8.c in utf8-0.1.1

- old
+ new

@@ -13,13 +13,16 @@
  * Returns the number of UTF8 characters in this string
  */
 static VALUE rb_cString_UTF8_length(VALUE self) {
   unsigned char *str = (unsigned char *)RSTRING_PTR(self);
   size_t len = RSTRING_LEN(self);
-  size_t utf8_len = 0;
+  int64_t utf8_len = 0; // signed: with the old size_t the "< 0" test below could never be true
 
   utf8_len = utf8CharCount(str, len);
+  if (utf8_len < 0) { // a negative count is reported to the caller as malformed input
+    rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+  }
 
   return INT2FIX(utf8_len);
 }
 
 /*
@@ -37,10 +40,13 @@
   // when Enumerator#each is called
   RETURN_ENUMERATOR(self, 0, 0);
 
   for(; i<len; i+=lastCharLen) {
     lastCharLen = utf8CharLen(str, len);
+    if (lastCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     utf8Str = rb_str_new((char *)str+i, lastCharLen);
     AS_UTF8(utf8Str);
     rb_yield(utf8Str);
   }
 
@@ -77,43 +83,58 @@
       AS_UTF8(utf8Str);
       return utf8Str;
     }
 
     if (wantPos < 0) {
-      long char_cnt = utf8CharCount(str, len);
+      int64_t char_cnt = utf8CharCount(str, len);
+      if (char_cnt < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       if ((wantPos * -1) > char_cnt) {
         return Qnil;
       }
       wantPos = char_cnt + wantPos;
     }
 
     // scan until starting position
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantPos) {
       // if we're about to step out of bounds, return nil
       if ((size_t)(str-start) >= len) {
         return Qnil;
       }
 
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
 
     // now scan until we have the number of chars asked for
     curPos = 1;
     offset = str;
     str += curCharLen;
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantLen) {
       // if we're about to step out of bounds, stop
       if ((size_t)(str-start) >= len) {
         break;
       }
 
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
 
     utf8Str = rb_str_new((char *)offset, str-offset);
     AS_UTF8(utf8Str);
@@ -128,26 +149,32 @@
   if (TYPE(argv[0]) == T_FIXNUM) {
     long wantPos = NUM2LONG(argv[0]), curPos = 0;
     int8_t curCharLen = 0;
 
     if (wantPos < 0) {
-      long char_cnt = utf8CharCount(str, len);
+      int64_t char_cnt = utf8CharCount(str, len);
       if ((wantPos * -1) > char_cnt) {
         return Qnil;
       }
       wantPos = char_cnt + wantPos;
     }
 
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantPos) {
       // if we're about to step out of bounds, return nil
       if ((size_t)(str-start) >= len) {
         return Qnil;
       }
 
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
 
     utf8Str = rb_str_new((char *)str, curCharLen);
     AS_UTF8(utf8Str);
@@ -156,11 +183,12 @@
     if (TYPE(argv[0]) == T_REGEXP) {
       rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
     }
 
     // [Range] syntax
-    long wantPos, curPos = 0, wantLen, char_cnt = 0;
+    long wantPos, curPos = 0, wantLen;
+    int64_t char_cnt = 0;
     int8_t curCharLen = 0;
     unsigned char *offset = str;
     VALUE ret;
 
     char_cnt = utf8CharCount(str, len);
@@ -178,35 +206,47 @@
       return utf8Str;
     }
 
     // scan until starting position
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantPos) {
       // if we're about to step out of bounds, return ""
       if ((size_t)(str-start) >= len) {
         utf8Str = rb_str_new("", 0);
         AS_UTF8(utf8Str);
         return utf8Str;
       }
 
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
 
     // now scan until we have the number of chars asked for
     curPos = 1;
     offset = str;
     str += curCharLen;
     curCharLen = utf8CharLen(str, len);
+    if (curCharLen < 0) {
+      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+    }
     while (curPos < wantLen) {
       // if we're about to step out of bounds, stop
       if ((size_t)(str-start) >= len) {
         break;
       }
 
       str += curCharLen;
       curCharLen = utf8CharLen(str, len);
+      if (curCharLen < 0) {
+        rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+      }
       curPos++;
     }
 
     utf8Str = rb_str_new((char *)offset, str-offset);
     AS_UTF8(utf8Str);
\ No newline at end of file