ext/utf8/string_utf8.c in utf8-0.1.0 vs ext/utf8/string_utf8.c in utf8-0.1.1
- old
+ new
@@ -13,13 +13,16 @@
* Returns the number of UTF8 characters in this string
*/
static VALUE rb_cString_UTF8_length(VALUE self) {
// Raw byte pointer and byte length of the receiver string.
unsigned char *str = (unsigned char *)RSTRING_PTR(self);
size_t len = RSTRING_LEN(self);
// 0.1.1 changes the counter from unsigned size_t to signed int64_t so that
// the negative-result check added below can actually evaluate to true.
- size_t utf8_len = 0;
+ int64_t utf8_len = 0;
utf8_len = utf8CharCount(str, len);
// A negative count is reported to the caller as an ArgumentError.
// Presumably utf8CharCount returns a negative value when it encounters
// malformed bytes; its definition is not visible here -- TODO confirm.
+ if (utf8_len < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
// NOTE(review): INT2FIX performs no range check, so a count outside the
// Fixnum range would not convert correctly; LONG2NUM/LL2NUM may be the
// safer conversion for an int64_t -- confirm against the Ruby C API.
return INT2FIX(utf8_len);
}
/*
@@ -37,10 +40,13 @@
// when Enumerator#each is called
RETURN_ENUMERATOR(self, 0, 0);
for(; i<len; i+=lastCharLen) {
lastCharLen = utf8CharLen(str, len);
+ if (lastCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
utf8Str = rb_str_new((char *)str+i, lastCharLen);
AS_UTF8(utf8Str);
rb_yield(utf8Str);
}
@@ -77,43 +83,58 @@
AS_UTF8(utf8Str);
return utf8Str;
}
if (wantPos < 0) {
- long char_cnt = utf8CharCount(str, len);
+ int64_t char_cnt = utf8CharCount(str, len);
+ if (char_cnt < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
if ((wantPos * -1) > char_cnt) {
return Qnil;
}
wantPos = char_cnt + wantPos;
}
// scan until starting position
curCharLen = utf8CharLen(str, len);
+ if (curCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
while (curPos < wantPos) {
// if we're about to step out of bounds, return nil
if ((size_t)(str-start) >= len) {
return Qnil;
}
str += curCharLen;
curCharLen = utf8CharLen(str, len);
+ if (curCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
curPos++;
}
// now scan until we have the number of chars asked for
curPos = 1;
offset = str;
str += curCharLen;
curCharLen = utf8CharLen(str, len);
+ if (curCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
while (curPos < wantLen) {
// if we're about to step out of bounds, stop
if ((size_t)(str-start) >= len) {
break;
}
str += curCharLen;
curCharLen = utf8CharLen(str, len);
+ if (curCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
curPos++;
}
utf8Str = rb_str_new((char *)offset, str-offset);
AS_UTF8(utf8Str);
@@ -128,26 +149,32 @@
if (TYPE(argv[0]) == T_FIXNUM) {
long wantPos = NUM2LONG(argv[0]), curPos = 0;
int8_t curCharLen = 0;
if (wantPos < 0) {
- long char_cnt = utf8CharCount(str, len);
+ int64_t char_cnt = utf8CharCount(str, len);
if ((wantPos * -1) > char_cnt) {
return Qnil;
}
wantPos = char_cnt + wantPos;
}
curCharLen = utf8CharLen(str, len);
+ if (curCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
while (curPos < wantPos) {
// if we're about to step out of bounds, return nil
if ((size_t)(str-start) >= len) {
return Qnil;
}
str += curCharLen;
curCharLen = utf8CharLen(str, len);
+ if (curCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
curPos++;
}
utf8Str = rb_str_new((char *)str, curCharLen);
AS_UTF8(utf8Str);
@@ -156,11 +183,12 @@
if (TYPE(argv[0]) == T_REGEXP) {
rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
}
// [Range] syntax
- long wantPos, curPos = 0, wantLen, char_cnt = 0;
+ long wantPos, curPos = 0, wantLen;
+ int64_t char_cnt = 0;
int8_t curCharLen = 0;
unsigned char *offset = str;
VALUE ret;
char_cnt = utf8CharCount(str, len);
@@ -178,35 +206,47 @@
return utf8Str;
}
// scan until starting position
curCharLen = utf8CharLen(str, len);
+ if (curCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
while (curPos < wantPos) {
// if we're about to step out of bounds, return ""
if ((size_t)(str-start) >= len) {
utf8Str = rb_str_new("", 0);
AS_UTF8(utf8Str);
return utf8Str;
}
str += curCharLen;
curCharLen = utf8CharLen(str, len);
+ if (curCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
curPos++;
}
// now scan until we have the number of chars asked for
curPos = 1;
offset = str;
str += curCharLen;
curCharLen = utf8CharLen(str, len);
+ if (curCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
while (curPos < wantLen) {
// if we're about to step out of bounds, stop
if ((size_t)(str-start) >= len) {
break;
}
str += curCharLen;
curCharLen = utf8CharLen(str, len);
+ if (curCharLen < 0) {
+ rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
+ }
curPos++;
}
utf8Str = rb_str_new((char *)offset, str-offset);
AS_UTF8(utf8Str);
\ No newline at end of file