ext/utf8/utf8.c in utf8-0.1.2 vs ext/utf8/utf8.c in utf8-0.1.3
- old
+ new
@@ -3,11 +3,11 @@
/*
 * Makes the enclosing function return -1 once the read cursor `in`
 * has advanced in_len or more bytes past `start`.
 * Expands to a bare `if` statement and relies on `in`, `start` and
 * `in_len` being in scope at the point of use.
 */
#define CHECK_LEN if ((size_t)(in-start) >= in_len) return -1;
/*
* Scans the current position of the buffer
- * returning the length of this UTF8 character
+ * returning the length of this UTF-8 character
*/
inline int8_t utf8CharLen(unsigned char *in, size_t in_len) {
if (in_len > 0) {
unsigned char curChar, *start;
@@ -58,11 +58,11 @@
return -1;
}
/*
* Scans the current position of the buffer
- * returning the total number of UTF8 characters found
+ * returning the total number of UTF-8 characters found
*/
int64_t utf8CharCount(unsigned char *in, size_t in_len) {
int64_t total = 0, leftOver = in_len;
int8_t len = 0;
unsigned char *start = in;
@@ -78,6 +78,28 @@
total++;
}
}
return total;
+}
+
+/*
+ * Decodes the UTF-8 character at the current position of the buffer
+ * and returns its codepoint.
+ *
+ * in     - pointer to the first byte of the character
+ * in_len - number of bytes available starting at `in`
+ *
+ * Returns the decoded codepoint, or -1 when utf8CharLen reports that no
+ * character is available at this position (for example when in_len is 0);
+ * the buffer is not read in that case.
+ */
+int32_t utf8CharToCodepoint(unsigned char *in, size_t in_len) {
+  int32_t cp, ncp, len;
+
+  len = utf8CharLen(in, in_len);
+  if (len < 1) {
+    /* nothing decodable here; bail out before dereferencing `in` */
+    return -1;
+  }
+
+  cp = *in++;
+  if (len > 1) {
+    /* from here on, len counts the continuation bytes after the lead byte */
+    len--;
+    /* the lead byte of an N-byte sequence carries (7 - N) payload bits */
+    ncp = cp & ((1 << (6 - len)) - 1);
+    while (len--) {
+      cp = *in++;
+      /* each continuation byte contributes its low 6 bits */
+      ncp = (ncp << 6) | (cp & ((1 << 6) - 1));
+    }
+    return ncp;
+  } else {
+    /* single-byte character: the byte value is the codepoint */
+    return cp;
+  }
}