utf8proc.h in tree-sitter-0.1.0

- old
+ new

@@ -20,11 +20,11 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
 
 
-/** 
+/**
  * @mainpage
  *
  * utf8proc is a free/open-source (MIT/expat licensed) C library
  * providing Unicode normalization, case-folding, and other operations
  * for strings in the UTF-8 encoding, supporting Unicode version
@@ -52,34 +52,36 @@
 
 #ifndef UTF8PROC_H
 #define UTF8PROC_H
 
 /** @name API version
- *  
+ *
  * The utf8proc API version MAJOR.MINOR.PATCH, following
  * semantic-versioning rules (http://semver.org) based on API
  * compatibility.
  *
  * This is also returned at runtime by @ref utf8proc_version; however, the
  * runtime version may append a string like "-dev" to the version number
  * for prerelease versions.
  *
- * @note The shared-library version number in the Makefile may be different,
+ * @note The shared-library version number in the Makefile
+ *       (and CMakeLists.txt, and MANIFEST) may be different,
  *       being based on ABI compatibility rather than API compatibility.
  */
 /** @{ */
 /** The MAJOR version number (increased when backwards API compatibility is broken). */
-#define UTF8PROC_VERSION_MAJOR 1
+#define UTF8PROC_VERSION_MAJOR 2
 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
-#define UTF8PROC_VERSION_MINOR 3
+#define UTF8PROC_VERSION_MINOR 1
 /** The PATCH version (increased for fixes that do not change the API). */
 #define UTF8PROC_VERSION_PATCH 0
 /** @} */
 
 #include <stdlib.h>
 #include <sys/types.h>
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && _MSC_VER < 1800
+// MSVC prior to 2013 lacked stdbool.h and inttypes.h
 typedef signed char utf8proc_int8_t;
 typedef unsigned char utf8proc_uint8_t;
 typedef short utf8proc_int16_t;
 typedef unsigned short utf8proc_uint16_t;
 typedef int utf8proc_int32_t;
@@ -90,26 +92,32 @@
 #  else
 typedef int utf8proc_ssize_t;
 typedef unsigned int utf8proc_size_t;
 #  endif
 #  ifndef __cplusplus
+// emulate C99 bool
 typedef unsigned char utf8proc_bool;
-enum {false, true};
+#    ifndef __bool_true_false_are_defined
+#      define false 0
+#      define true 1
+#      define __bool_true_false_are_defined 1
+#    endif
 #  else
 typedef bool utf8proc_bool;
 #  endif
 #else
+#  include <stddef.h>
 #  include <stdbool.h>
 #  include <inttypes.h>
 typedef int8_t utf8proc_int8_t;
 typedef uint8_t utf8proc_uint8_t;
 typedef int16_t utf8proc_int16_t;
 typedef uint16_t utf8proc_uint16_t;
 typedef int32_t utf8proc_int32_t;
 typedef uint32_t utf8proc_uint32_t;
 typedef size_t utf8proc_size_t;
-typedef ssize_t utf8proc_ssize_t;
+typedef ptrdiff_t utf8proc_ssize_t;
 typedef bool utf8proc_bool;
 #endif
 #include <limits.h>
 
 #ifdef _WIN32
@@ -131,11 +139,11 @@
 #ifndef SSIZE_MAX
 #define SSIZE_MAX ((size_t)SIZE_MAX/2)
 #endif
 
 #ifndef UINT16_MAX
-#  define UINT16_MAX ~(utf8proc_uint16_t)0
+#  define UINT16_MAX 65535U
 #endif
 
 /**
  * Option flags used by several functions in the library.
  */
@@ -240,34 +248,34 @@
   /**
    * @anchor Decomposition type.
    * @see utf8proc_decomp_type_t.
    */
   utf8proc_propval_t decomp_type;
-  utf8proc_uint16_t decomp_mapping;
-  utf8proc_uint16_t casefold_mapping;
-  utf8proc_int32_t uppercase_mapping;
-  utf8proc_int32_t lowercase_mapping;
-  utf8proc_int32_t titlecase_mapping;
-  utf8proc_int32_t comb1st_index;
-  utf8proc_int32_t comb2nd_index;
+  utf8proc_uint16_t decomp_seqindex;
+  utf8proc_uint16_t casefold_seqindex;
+  utf8proc_uint16_t uppercase_seqindex;
+  utf8proc_uint16_t lowercase_seqindex;
+  utf8proc_uint16_t titlecase_seqindex;
+  utf8proc_uint16_t comb_index;
   unsigned bidi_mirrored:1;
   unsigned comp_exclusion:1;
   /**
    * Can this codepoint be ignored?
    *
    * Used by @ref utf8proc_decompose_char when @ref UTF8PROC_IGNORE is
    * passed as an option.
    */
   unsigned ignorable:1;
   unsigned control_boundary:1;
+  /** The width of the codepoint. */
+  unsigned charwidth:2;
+  unsigned pad:2;
   /**
    * Boundclass.
    * @see utf8proc_boundclass_t.
    */
-  unsigned boundclass:4;
-  /** The width of the codepoint. */
-  unsigned charwidth:2;
+  unsigned boundclass:8;
 } utf8proc_property_t;
 
 /** Unicode categories. */
 typedef enum {
   UTF8PROC_CATEGORY_CN  = 0, /**< Other, not assigned */
@@ -347,11 +355,11 @@
   UTF8PROC_DECOMP_TYPE_SQUARE   = 14, /**< Square */
   UTF8PROC_DECOMP_TYPE_FRACTION = 15, /**< Fraction */
   UTF8PROC_DECOMP_TYPE_COMPAT   = 16, /**< Compat */
 } utf8proc_decomp_type_t;
 
-/** Boundclass property. */
+/** Boundclass property. (TR29) */
 typedef enum {
   UTF8PROC_BOUNDCLASS_START              =  0, /**< Start */
   UTF8PROC_BOUNDCLASS_OTHER              =  1, /**< Other */
   UTF8PROC_BOUNDCLASS_CR                 =  2, /**< Cr */
   UTF8PROC_BOUNDCLASS_LF                 =  3, /**< Lf */
@@ -362,13 +370,26 @@
   UTF8PROC_BOUNDCLASS_T                  =  8, /**< T */
   UTF8PROC_BOUNDCLASS_LV                 =  9, /**< Lv */
   UTF8PROC_BOUNDCLASS_LVT                = 10, /**< Lvt */
   UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
   UTF8PROC_BOUNDCLASS_SPACINGMARK        = 12, /**< Spacingmark */
+  UTF8PROC_BOUNDCLASS_PREPEND            = 13, /**< Prepend */
+  UTF8PROC_BOUNDCLASS_ZWJ                = 14, /**< Zero Width Joiner */
+  UTF8PROC_BOUNDCLASS_E_BASE             = 15, /**< Emoji Base */
+  UTF8PROC_BOUNDCLASS_E_MODIFIER         = 16, /**< Emoji Modifier */
+  UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ     = 17, /**< Glue_After_ZWJ */
+  UTF8PROC_BOUNDCLASS_E_BASE_GAZ         = 18, /**< E_BASE + GLUE_AFTER_ZJW */
 } utf8proc_boundclass_t;
 
 /**
+ * Function pointer type passed to @ref utf8proc_map_custom and
+ * @ref utf8proc_decompose_custom, which is used to specify a user-defined
+ * mapping of codepoints to be applied in conjunction with other mappings.
+ */
+typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
+
+/**
  * Array containing the byte lengths of a UTF-8 encoded codepoint based
  * on the first byte.
  */
 UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256];
 
@@ -471,10 +492,11 @@
  * bytes are processed.  The result (in the form of 32-bit unicode
  * codepoints) is written into the buffer being pointed to by
  * `buffer` (which must contain at least `bufsize` entries).  In case of
  * success, the number of codepoints written is returned; in case of an
  * error, a negative error code is returned (@ref utf8proc_errmsg).
+ * See @ref utf8proc_decompose_custom to supply additional transformations.
  *
  * If the number of written codepoints would be bigger than `bufsize`, the
  * required buffer size is returned, while the buffer will be overwritten with
  * undefined data.
  */
@@ -482,12 +504,50 @@
   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
   utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
 );
 
 /**
+ * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
+ * that is called on each codepoint in `str` before any other transformations
+ * (along with a `custom_data` pointer that is passed through to `custom_func`).
+ * The `custom_func` argument is ignored if it is `NULL`.  See also @ref utf8proc_map_custom.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
+  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+);
+
+/**
+ * Normalizes the sequence of `length` codepoints pointed to by `buffer`
+ * in-place (i.e., the result is also stored in `buffer`).
+ *
+ * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
+ * @param length the length (in codepoints) of the buffer.
+ * @param options a bitwise or (`|`) of one or more of the following flags:
+ * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
+ * - @ref UTF8PROC_NLF2PS  - convert LF, CRLF, CR and NEL into PS
+ * - @ref UTF8PROC_NLF2LF  - convert LF, CRLF, CR and NEL into LF
+ * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
+ * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
+ *                           codepoints
+ * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
+ *                           the unicode versioning stability
+ *
+ * @return
+ * In case of success, the length (in codepoints) of the normalized UTF-32 string is
+ * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
+ *
+ * @warning The entries of the array pointed to by `str` have to be in the
+ *          range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
+
+/**
  * Reencodes the sequence of `length` codepoints pointed to by `buffer`
  * UTF-8 data in-place (i.e., the result is also stored in `buffer`).
+ * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
  *
  * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
  * @param length the length (in codepoints) of the buffer.
  * @param options a bitwise or (`|`) of one or more of the following flags:
  * - @ref UTF8PROC_NLF2LS  - convert LF, CRLF, CR and NEL into LS
@@ -496,14 +556,16 @@
  * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
  * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
  *                           codepoints
  * - @ref UTF8PROC_STABLE  - prohibit combining characters that would violate
  *                           the unicode versioning stability
+ * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
  *
  * @return
- * In case of success, the length (in bytes) of the resulting UTF-8 string is
- * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
+ * In case of success, the length (in bytes) of the resulting nul-terminated
+ * UTF-8 string is returned; otherwise, a negative error code is returned
+ * (@ref utf8proc_errmsg).
  *
  * @warning The amount of free space pointed to by `buffer` must
  *          exceed the amount of the input data by one byte, and the
  *          entries of the array pointed to by `str` have to be in the
  *          range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
@@ -511,14 +573,32 @@
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
 
 /**
  * Given a pair of consecutive codepoints, return whether a grapheme break is
  * permitted between them (as defined by the extended grapheme clusters in UAX#29).
+ *
+ * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
+ *              state to break graphemes. This state can be passed in as a pointer
+ *              in the `state` argument and should initially be set to 0. If the
+ *              state is not passed in (i.e. a null pointer is passed), UAX#29 rules
+ *              GB10/12/13 which require this state will not be applied, essentially
+ *              matching the rules in Unicode 8.0.0.
+ *
+ * @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
+ *          be called IN ORDER on ALL potential breaks in a string.
  */
-UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
+    utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
 
+/**
+ * Same as @ref utf8proc_grapheme_break_stateful, except without support for the
+ * Unicode 9 additions to the algorithm. Supported for legacy reasons.
+ */
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
+    utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
 
+
 /**
  * Given a codepoint `c`, return the codepoint of the corresponding
  * lower-case character, if any; otherwise (if there is no lower-case
  * variant, or if `c` is not a valid codepoint) return `c`.
  */
@@ -530,14 +610,21 @@
  * variant, or if `c` is not a valid codepoint) return `c`.
  */
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
 
 /**
+ * Given a codepoint `c`, return the codepoint of the corresponding
+ * title-case character, if any; otherwise (if there is no title-case
+ * variant, or if `c` is not a valid codepoint) return `c`.
+ */
+UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
+
+/**
  * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
  * except that a width of 0 is returned for non-printable codepoints
  * instead of -1 as in `wcwidth`.
- * 
+ *
  * @note
  * If you want to check for particular types of non-printable characters,
  * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
 UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);
 
@@ -561,11 +648,12 @@
  * the length is determined by a NULL terminator, otherwise the
  * parameter `strlen` is evaluated to determine the string length, but
  * in any case the result will be NULL terminated (though it might
  * contain NULL characters with the string if `str` contained NULL
  * characters). Other flags in the `options` field are passed to the
- * functions defined above, and regarded as described.
+ * functions defined above, and regarded as described.  See also
+ * @ref utfproc_map_custom to supply a custom codepoint transformation.
  *
  * In case of success the length of the new string is returned,
  * otherwise a negative error code is returned.
  *
  * @note The memory of the new UTF-8 string will have been allocated
@@ -573,10 +661,21 @@
  */
 UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
   const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
 );
 
+/**
+ * Like @ref utf8proc_map, but also takes a `custom_func` mapping function
+ * that is called on each codepoint in `str` before any other transformations
+ * (along with a `custom_data` pointer that is passed through to `custom_func`).
+ * The `custom_func` argument is ignored if it is `NULL`.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+);
+
 /** @name Unicode normalization
  *
  * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
  * normalized version of the null-terminated string `str`.  These
  * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
@@ -585,17 +684,16 @@
 /** @{ */
 /** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
 /** NFC normalization (@ref UTF8PROC_COMPOSE). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
-/** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
-/** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
+/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 /** @} */
 
 #ifdef __cplusplus
 }
 #endif
 
 #endif
-