#include "ext.h"
#include "utf8.h"

extern VALUE intern_as_utf8;

/*
 * Document-class: String::UTF8
 */

/*
 * Builds a new Ruby string from +byteLen+ bytes at +ptr+ and passes it
 * through AS_UTF8, exactly as every method in this file does for its results.
 */
static VALUE string_utf8_new(const char *ptr, long byteLen) {
  VALUE utf8Str = rb_str_new(ptr, byteLen);
  AS_UTF8(utf8Str);
  return utf8Str;
}

/*
 * Returns the byte length of the character starting at +pos+.
 * The caller must guarantee pos < end.
 *
 * Only the bytes that are actually left in the buffer are offered to
 * utf8CharLen, so a character truncated by the end of the string cannot be
 * read past the buffer.
 * NOTE(review): assumes utf8CharLen(in, in_len) inspects at most in_len bytes
 * and reports malformed input with a negative value - confirm against utf8.h.
 *
 * Raises ArgumentError for a malformed sequence. A length of zero or one
 * longer than the remaining bytes is treated as malformed too, so callers
 * always make progress and never step outside the buffer.
 */
static size_t string_utf8_char_len_at(unsigned char *pos, unsigned char *end) {
  size_t remaining = (size_t)(end - pos);
  int8_t charLen = utf8CharLen(pos, remaining);

  if (charLen <= 0 || (size_t)charLen > remaining) {
    rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
  }

  return (size_t)charLen;
}

/*
 * Advances *cursor over at most +count+ characters without passing +end+.
 * Returns how many characters were actually skipped, which is smaller than
 * +count+ only when the end of the buffer was reached first.
 */
static long string_utf8_skip_chars(unsigned char **cursor, unsigned char *end, long count) {
  long skipped = 0;

  while (skipped < count && *cursor < end) {
    *cursor += string_utf8_char_len_at(*cursor, end);
    skipped++;
  }

  return skipped;
}

/*
 * Turns a negative character index into the equivalent non-negative one
 * (counting from the end of the string). Non-negative indexes are left alone
 * and do not trigger a character count.
 *
 * Returns 1 on success, 0 when the index lies before the start of the string.
 * Raises ArgumentError when the string is not valid UTF-8.
 */
static int string_utf8_resolve_pos(unsigned char *str, size_t len, long *pos) {
  int64_t char_cnt;

  if (*pos >= 0) {
    return 1;
  }

  char_cnt = utf8CharCount(str, len);
  if (char_cnt < 0) {
    rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
  }

  // compare in 64 bits instead of negating *pos, which overflows for LONG_MIN
  if ((int64_t)*pos + char_cnt < 0) {
    return 0;
  }

  *pos = (long)(char_cnt + *pos);
  return 1;
}

/*
 * Returns the substring that starts +wantPos+ characters into the buffer and
 * spans up to +wantLen+ characters (fewer if the string ends first).
 * Both values must be non-negative.
 *
 * Returns nil when +wantPos+ lies beyond the end of the string; a position
 * exactly at the end yields an empty string.
 */
static VALUE string_utf8_substr(unsigned char *str, size_t len, long wantPos, long wantLen) {
  unsigned char *end = str + len;
  unsigned char *first = str;
  unsigned char *last;

  // scan until starting position
  if (string_utf8_skip_chars(&first, end, wantPos) < wantPos) {
    return Qnil;
  }

  // now scan until we have the number of chars asked for
  last = first;
  string_utf8_skip_chars(&last, end, wantLen);

  return string_utf8_new((char *)first, (long)(last - first));
}

/*
 * call-seq: length
 *
 * Returns the number of UTF8 characters in this string
 *
 * Raises ArgumentError if the string is not valid UTF8
 */
static VALUE rb_cString_UTF8_length(VALUE self) {
  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
  size_t len = RSTRING_LEN(self);
  int64_t utf8_len = utf8CharCount(str, len);

  if (utf8_len < 0) {
    rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
  }

  // LONG2NUM rather than INT2FIX: the latter silently corrupts values that
  // don't fit in a Fixnum
  return LONG2NUM((long)utf8_len);
}

/*
 * call-seq: each_char {|utf8_char| ...}
 *
 * Iterates over the string, yielding one UTF8 character at a time
 *
 * Raises ArgumentError if an invalid UTF8 sequence is encountered
 */
static VALUE rb_cString_UTF8_each_char(VALUE self) {
  size_t i = 0;

  // this will return an Enumerator wrapping this string, yielding this method
  // when Enumerator#each is called
  RETURN_ENUMERATOR(self, 0, 0);

  // the buffer pointer and length are re-read on every pass because the
  // yielded block is free to modify the receiver
  while (i < (size_t)RSTRING_LEN(self)) {
    unsigned char *str = (unsigned char *)RSTRING_PTR(self);
    unsigned char *end = str + RSTRING_LEN(self);
    // measure the character at the current offset, not at the string start
    size_t charLen = string_utf8_char_len_at(str + i, end);

    rb_yield(string_utf8_new((char *)str + i, (long)charLen));
    i += charLen;
  }

  return self;
}

/*
 * Works like String#[] but taking into account UTF8 character boundaries
 *
 * Supported forms are [Fixnum], [offset, length] and [Range].
 * An empty receiver always yields nil, and a zero length always yields "".
 *
 * This method doesn't currently (and may never) support Regexp parameters
 * It also doesn't support a String parameter (yet)
 *
 * Raises ArgumentError for an invalid UTF8 sequence, an unsupported argument
 * type or a wrong number of arguments
 */
static VALUE rb_cString_UTF8_slice(int argc, VALUE *argv, VALUE self) {
  unsigned char *str = (unsigned char *)RSTRING_PTR(self);
  size_t len = RSTRING_LEN(self);

  if (len == 0) {
    return Qnil;
  }

  if (argc == 2) {
    long wantPos, wantLen;

    if (TYPE(argv[0]) == T_REGEXP) {
      rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
    }

    // [offset, length] syntax
    wantPos = NUM2LONG(argv[0]);
    wantLen = NUM2LONG(argv[1]);

    if (wantLen < 0) {
      return Qnil;
    }
    if (wantLen == 0) {
      return string_utf8_new("", 0);
    }

    if (!string_utf8_resolve_pos(str, len, &wantPos)) {
      return Qnil;
    }

    return string_utf8_substr(str, len, wantPos, wantLen);
  }

  if (argc != 1) {
    rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
  }

  // [Fixnum] syntax
  if (TYPE(argv[0]) == T_FIXNUM) {
    long wantPos = NUM2LONG(argv[0]);
    unsigned char *cursor = str;
    unsigned char *end = str + len;

    if (!string_utf8_resolve_pos(str, len, &wantPos)) {
      return Qnil;
    }

    // an index at or beyond the end of the string has no character to return
    if (string_utf8_skip_chars(&cursor, end, wantPos) < wantPos || cursor == end) {
      return Qnil;
    }

    return string_utf8_new((char *)cursor, (long)string_utf8_char_len_at(cursor, end));
  } else {
    long wantPos, wantLen;
    int64_t char_cnt;
    VALUE ret, utf8Str;

    if (TYPE(argv[0]) == T_REGEXP) {
      rb_raise(rb_eArgError, "Regular Expressions aren't supported yet");
    }

    // [Range] syntax
    char_cnt = utf8CharCount(str, len);
    if (char_cnt < 0) {
      rb_raise(rb_eArgError, "invalid utf-8 byte sequence");
    }

    ret = rb_range_beg_len(argv[0], &wantPos, &wantLen, (long)char_cnt, 0);
    if (ret == Qnil) {
      return Qnil;
    }
    if (ret == Qfalse) {
      // not a Range: wantPos/wantLen were never filled in, so bail out
      // instead of scanning with uninitialised values
      rb_raise(rb_eArgError, "unsupported argument type (expected a Fixnum or Range)");
    }

    if (wantLen == 0) {
      return string_utf8_new("", 0);
    }

    // the range form answers "" rather than nil if the start can't be reached
    utf8Str = string_utf8_substr(str, len, wantPos, wantLen);
    if (NIL_P(utf8Str)) {
      return string_utf8_new("", 0);
    }
    return utf8Str;
  }
}

/*
 * Defines String::UTF8 as a subclass of String and registers its
 * length, each_char and [] methods.
 */
void init_String_UTF8() {
  VALUE rb_cString_UTF8 = rb_define_class_under(rb_cString, "UTF8", rb_cString);

  rb_define_method(rb_cString_UTF8, "length",    rb_cString_UTF8_length,    0);
  rb_define_method(rb_cString_UTF8, "each_char", rb_cString_UTF8_each_char, 0);
  rb_define_method(rb_cString_UTF8, "[]",        rb_cString_UTF8_slice,     -1);
}