utf-conv.c in rugged-0.21.0

- old
+ new
@@ -6,76 +6,133 @@
  */
 
 #include "common.h"
 #include "utf-conv.h"
 
-#define U16_LEAD(c) (wchar_t)(((c)>>10)+0xd7c0)
-#define U16_TRAIL(c) (wchar_t)(((c)&0x3ff)|0xdc00)
+#ifndef WC_ERR_INVALID_CHARS
+# define WC_ERR_INVALID_CHARS	0x80
+#endif
 
-#if 0
-void git__utf8_to_16(wchar_t *dest, size_t length, const char *src)
+GIT_INLINE(DWORD) get_wc_flags(void)
 {
-	wchar_t *pDest = dest;
-	uint32_t ch;
-	const uint8_t* pSrc = (uint8_t*) src;
+	static char inited = 0;
+	static DWORD flags;
 
-	assert(dest && src && length);
+	/* Invalid code point check supported on Vista+ only */
+	if (!inited) {
+		flags = git_has_win32_version(6, 0, 0) ? WC_ERR_INVALID_CHARS : 0;
+		inited = 1;
+	}
 
-	length--;
+	return flags;
+}
 
-	while(*pSrc && length > 0) {
-		ch = *pSrc++;
-		length--;
+/**
+ * Converts a UTF-8 string to wide characters.
+ *
+ * @param dest The buffer to receive the wide string.
+ * @param dest_size The size of the buffer, in characters.
+ * @param src The UTF-8 string to convert.
+ * @return The length of the wide string, in characters (not counting the NULL terminator), or < 0 for failure
+ */
+int git__utf8_to_16(wchar_t *dest, size_t dest_size, const char *src)
+{
+	/* Length of -1 indicates NULL termination of the input string. Subtract 1 from the result to
+	* turn 0 into -1 (an error code) and to not count the NULL terminator as part of the string's
+	* length. MultiByteToWideChar never returns int's minvalue, so underflow is not possible */
+	return MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src, -1, dest, (int)dest_size) - 1;
+}
 
-		if(ch < 0xc0) {
-			/*
-			 * ASCII, or a trail byte in lead position which is treated like
-			 * a single-byte sequence for better character boundary
-			 * resynchronization after illegal sequences.
-			 */
-			*pDest++ = (wchar_t)ch;
-			continue;
-		} else if(ch < 0xe0) { /* U+0080..U+07FF */
-			if (pSrc[0]) {
-				/* 0x3080 = (0xc0 << 6) + 0x80 */
-				*pDest++ = (wchar_t)((ch << 6) + *pSrc++ - 0x3080);
-				continue;
-			}
-		} else if(ch < 0xf0) { /* U+0800..U+FFFF */
-			if (pSrc[0] && pSrc[1]) {
-				/* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
-				/* 0x2080 = (0x80 << 6) + 0x80 */
-				ch = (ch << 12) + (*pSrc++ << 6);
-				*pDest++ = (wchar_t)(ch + *pSrc++ - 0x2080);
-				continue;
-			}
-		} else /* f0..f4 */ { /* U+10000..U+10FFFF */
-			if (length >= 1 && pSrc[0] && pSrc[1] && pSrc[2]) {
-				/* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
-				ch = (ch << 18) + (*pSrc++ << 12);
-				ch += *pSrc++ << 6;
-				ch += *pSrc++ - 0x3c82080;
-				*(pDest++) = U16_LEAD(ch);
-				*(pDest++) = U16_TRAIL(ch);
-				length--; /* two bytes for this character */
-				continue;
-			}
-		}
+/**
+ * Converts a wide string to UTF-8.
+ *
+ * @param dest The buffer to receive the UTF-8 string.
+ * @param dest_size The size of the buffer, in bytes.
+ * @param src The wide string to convert.
+ * @return The length of the UTF-8 string, in bytes (not counting the NULL terminator), or < 0 for failure
+ */
+int git__utf16_to_8(char *dest, size_t dest_size, const wchar_t *src)
+{
+	/* Length of -1 indicates NULL termination of the input string. Subtract 1 from the result to
+	 * turn 0 into -1 (an error code) and to not count the NULL terminator as part of the string's
+	 * length. WideCharToMultiByte never returns int's minvalue, so underflow is not possible */
+	return WideCharToMultiByte(CP_UTF8, get_wc_flags(), src, -1, dest, (int)dest_size, NULL, NULL) - 1;
+}
 
-		/* truncated character at the end */
-		*pDest++ = 0xfffd;
-		break;
+/**
+ * Converts a UTF-8 string to wide characters.
+ * Memory is allocated to hold the converted string.
+ * The caller is responsible for freeing the string with git__free.
+ *
+ * @param dest Receives a pointer to the wide string.
+ * @param src The UTF-8 string to convert.
+ * @return The length of the wide string, in characters (not counting the NULL terminator), or < 0 for failure
+ */
+int git__utf8_to_16_alloc(wchar_t **dest, const char *src)
+{
+	int utf16_size;
+
+	*dest = NULL;
+
+	/* Length of -1 indicates NULL termination of the input string */
+	utf16_size = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src, -1, NULL, 0);
+
+	if (!utf16_size)
+		return -1;
+
+	*dest = git__malloc(utf16_size * sizeof(wchar_t));
+
+	if (!*dest)
+		return -1;
+
+	utf16_size = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, src, -1, *dest, utf16_size);
+
+	if (!utf16_size) {
+		git__free(*dest);
+		*dest = NULL;
 	}
 
-	*pDest++ = 0x0;
+	/* Subtract 1 from the result to turn 0 into -1 (an error code) and to not count the NULL
+	 * terminator as part of the string's length. MultiByteToWideChar never returns int's minvalue,
+	 * so underflow is not possible */
+	return utf16_size - 1;
 }
-#endif
 
-int git__utf8_to_16(wchar_t *dest, size_t length, const char *src)
+/**
+ * Converts a wide string to UTF-8.
+ * Memory is allocated to hold the converted string.
+ * The caller is responsible for freeing the string with git__free.
+ *
+ * @param dest Receives a pointer to the UTF-8 string.
+ * @param src The wide string to convert.
+ * @return The length of the UTF-8 string, in bytes (not counting the NULL terminator), or < 0 for failure
+ */
+int git__utf16_to_8_alloc(char **dest, const wchar_t *src)
 {
-	return MultiByteToWideChar(CP_UTF8, 0, src, -1, dest, (int)length);
-}
+	int utf8_size;
+	DWORD dwFlags = get_wc_flags();
 
-int git__utf16_to_8(char *out, const wchar_t *input)
-{
-	return WideCharToMultiByte(CP_UTF8, 0, input, -1, out, GIT_WIN_PATH, NULL, NULL);
+	*dest = NULL;
+
+	/* Length of -1 indicates NULL termination of the input string */
+	utf8_size = WideCharToMultiByte(CP_UTF8, dwFlags, src, -1, NULL, 0, NULL, NULL);
+
+	if (!utf8_size)
+		return -1;
+
+	*dest = git__malloc(utf8_size);
+
+	if (!*dest)
+		return -1;
+
+	utf8_size = WideCharToMultiByte(CP_UTF8, dwFlags, src, -1, *dest, utf8_size, NULL, NULL);
+
+	if (!utf8_size) {
+		git__free(*dest);
+		*dest = NULL;
+	}
+
+	/* Subtract 1 from the result to turn 0 into -1 (an error code) and to not count the NULL
+	 * terminator as part of the string's length. MultiByteToWideChar never returns int's minvalue,
+	 * so underflow is not possible */
+	return utf8_size - 1;
 }