ext/utf8/string_utf8.c in utf8-0.1.0 vs ext/utf8/string_utf8.c in utf8-0.1.1

- old
+ new

@@ -13,13 +13,16 @@ * Returns the number of UTF8 characters in this string */ static VALUE rb_cString_UTF8_length(VALUE self) { unsigned char *str = (unsigned char *)RSTRING_PTR(self); size_t len = RSTRING_LEN(self); - size_t utf8_len = 0; + int64_t utf8_len = 0; utf8_len = utf8CharCount(str, len); + if (utf8_len < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } return INT2FIX(utf8_len); } /* @@ -37,10 +40,13 @@ // when Enumerator#each is called RETURN_ENUMERATOR(self, 0, 0); for(; i<len; i+=lastCharLen) { lastCharLen = utf8CharLen(str, len); + if (lastCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } utf8Str = rb_str_new((char *)str+i, lastCharLen); AS_UTF8(utf8Str); rb_yield(utf8Str); } @@ -77,43 +83,58 @@ AS_UTF8(utf8Str); return utf8Str; } if (wantPos < 0) { - long char_cnt = utf8CharCount(str, len); + int64_t char_cnt = utf8CharCount(str, len); + if (char_cnt < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } if ((wantPos * -1) > char_cnt) { return Qnil; } wantPos = char_cnt + wantPos; } // scan until starting position curCharLen = utf8CharLen(str, len); + if (curCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } while (curPos < wantPos) { // if we're about to step out of bounds, return nil if ((size_t)(str-start) >= len) { return Qnil; } str += curCharLen; curCharLen = utf8CharLen(str, len); + if (curCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } curPos++; } // now scan until we have the number of chars asked for curPos = 1; offset = str; str += curCharLen; curCharLen = utf8CharLen(str, len); + if (curCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } while (curPos < wantLen) { // if we're about to step out of bounds, stop if ((size_t)(str-start) >= len) { break; } str += curCharLen; curCharLen = utf8CharLen(str, len); + if (curCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } curPos++; } utf8Str = rb_str_new((char *)offset, str-offset); AS_UTF8(utf8Str); @@ -128,26 +149,32 @@ if (TYPE(argv[0]) == T_FIXNUM) { long wantPos = NUM2LONG(argv[0]), curPos = 0; int8_t curCharLen = 0; if (wantPos < 0) { - long char_cnt = utf8CharCount(str, len); + int64_t char_cnt = utf8CharCount(str, len); if ((wantPos * -1) > char_cnt) { return Qnil; } wantPos = char_cnt + wantPos; } curCharLen = utf8CharLen(str, len); + if (curCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } while (curPos < wantPos) { // if we're about to step out of bounds, return nil if ((size_t)(str-start) >= len) { return Qnil; } str += curCharLen; curCharLen = utf8CharLen(str, len); + if (curCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } curPos++; } utf8Str = rb_str_new((char *)str, curCharLen); AS_UTF8(utf8Str); @@ -156,11 +183,12 @@ if (TYPE(argv[0]) == T_REGEXP) { rb_raise(rb_eArgError, "Regular Expressions aren't supported yet"); } // [Range] syntax - long wantPos, curPos = 0, wantLen, char_cnt = 0; + long wantPos, curPos = 0, wantLen; + int64_t char_cnt = 0; int8_t curCharLen = 0; unsigned char *offset = str; VALUE ret; char_cnt = utf8CharCount(str, len); @@ -178,35 +206,47 @@ return utf8Str; } // scan until starting position curCharLen = utf8CharLen(str, len); + if (curCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } while (curPos < wantPos) { // if we're about to step out of bounds, return "" if ((size_t)(str-start) >= len) { utf8Str = rb_str_new("", 0); AS_UTF8(utf8Str); return utf8Str; } str += curCharLen; curCharLen = utf8CharLen(str, len); + if (curCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } curPos++; } // now scan until we have the number of chars asked for curPos = 1; offset = str; str += curCharLen; curCharLen = utf8CharLen(str, len); + if (curCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } while (curPos < wantLen) { // if we're about to step out of bounds, stop if ((size_t)(str-start) >= len) { break; } str += curCharLen; curCharLen = utf8CharLen(str, len); + if (curCharLen < 0) { + rb_raise(rb_eArgError, "invalid utf-8 byte sequence"); + } curPos++; } utf8Str = rb_str_new((char *)offset, str-offset); AS_UTF8(utf8Str); \ No newline at end of file