ext/tree-sitter/tree-sitter/externals/utf8proc/utf8proc.h in tree-sitter-0.0.1 vs ext/tree-sitter/tree-sitter/externals/utf8proc/utf8proc.h in tree-sitter-0.1.0

- old
+ new

@@ -20,11 +20,11 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ -/** +/** * @mainpage * * utf8proc is a free/open-source (MIT/expat licensed) C library * providing Unicode normalization, case-folding, and other operations * for strings in the UTF-8 encoding, supporting Unicode version @@ -52,34 +52,36 @@ #ifndef UTF8PROC_H #define UTF8PROC_H /** @name API version - * + * * The utf8proc API version MAJOR.MINOR.PATCH, following * semantic-versioning rules (http://semver.org) based on API * compatibility. * * This is also returned at runtime by @ref utf8proc_version; however, the * runtime version may append a string like "-dev" to the version number * for prerelease versions. * - * @note The shared-library version number in the Makefile may be different, + * @note The shared-library version number in the Makefile + * (and CMakeLists.txt, and MANIFEST) may be different, * being based on ABI compatibility rather than API compatibility. */ /** @{ */ /** The MAJOR version number (increased when backwards API compatibility is broken). */ -#define UTF8PROC_VERSION_MAJOR 1 +#define UTF8PROC_VERSION_MAJOR 2 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ -#define UTF8PROC_VERSION_MINOR 3 +#define UTF8PROC_VERSION_MINOR 1 /** The PATCH version (increased for fixes that do not change the API). */ #define UTF8PROC_VERSION_PATCH 0 /** @} */ #include <stdlib.h> #include <sys/types.h> -#ifdef _MSC_VER +#if defined(_MSC_VER) && _MSC_VER < 1800 +// MSVC prior to 2013 lacked stdbool.h and inttypes.h typedef signed char utf8proc_int8_t; typedef unsigned char utf8proc_uint8_t; typedef short utf8proc_int16_t; typedef unsigned short utf8proc_uint16_t; typedef int utf8proc_int32_t; @@ -90,26 +92,32 @@ # else typedef int utf8proc_ssize_t; typedef unsigned int utf8proc_size_t; # endif # ifndef __cplusplus +// emulate C99 bool typedef unsigned char utf8proc_bool; -enum {false, true}; +# ifndef __bool_true_false_are_defined +# define false 0 +# define true 1 +# define __bool_true_false_are_defined 1 +# endif # else typedef bool utf8proc_bool; # endif #else +# include <stddef.h> # include <stdbool.h> # include <inttypes.h> typedef int8_t utf8proc_int8_t; typedef uint8_t utf8proc_uint8_t; typedef int16_t utf8proc_int16_t; typedef uint16_t utf8proc_uint16_t; typedef int32_t utf8proc_int32_t; typedef uint32_t utf8proc_uint32_t; typedef size_t utf8proc_size_t; -typedef ssize_t utf8proc_ssize_t; +typedef ptrdiff_t utf8proc_ssize_t; typedef bool utf8proc_bool; #endif #include <limits.h> #ifdef _WIN32 @@ -131,11 +139,11 @@ #ifndef SSIZE_MAX #define SSIZE_MAX ((size_t)SIZE_MAX/2) #endif #ifndef UINT16_MAX -# define UINT16_MAX ~(utf8proc_uint16_t)0 +# define UINT16_MAX 65535U #endif /** * Option flags used by several functions in the library. */ @@ -240,34 +248,34 @@ /** * @anchor Decomposition type. * @see utf8proc_decomp_type_t. */ utf8proc_propval_t decomp_type; - utf8proc_uint16_t decomp_mapping; - utf8proc_uint16_t casefold_mapping; - utf8proc_int32_t uppercase_mapping; - utf8proc_int32_t lowercase_mapping; - utf8proc_int32_t titlecase_mapping; - utf8proc_int32_t comb1st_index; - utf8proc_int32_t comb2nd_index; + utf8proc_uint16_t decomp_seqindex; + utf8proc_uint16_t casefold_seqindex; + utf8proc_uint16_t uppercase_seqindex; + utf8proc_uint16_t lowercase_seqindex; + utf8proc_uint16_t titlecase_seqindex; + utf8proc_uint16_t comb_index; unsigned bidi_mirrored:1; unsigned comp_exclusion:1; /** * Can this codepoint be ignored? * * Used by @ref utf8proc_decompose_char when @ref UTF8PROC_IGNORE is * passed as an option. */ unsigned ignorable:1; unsigned control_boundary:1; + /** The width of the codepoint. */ + unsigned charwidth:2; + unsigned pad:2; /** * Boundclass. * @see utf8proc_boundclass_t. */ - unsigned boundclass:4; - /** The width of the codepoint. */ - unsigned charwidth:2; + unsigned boundclass:8; } utf8proc_property_t; /** Unicode categories. */ typedef enum { UTF8PROC_CATEGORY_CN = 0, /**< Other, not assigned */ @@ -347,11 +355,11 @@ UTF8PROC_DECOMP_TYPE_SQUARE = 14, /**< Square */ UTF8PROC_DECOMP_TYPE_FRACTION = 15, /**< Fraction */ UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */ } utf8proc_decomp_type_t; -/** Boundclass property. */ +/** Boundclass property. (TR29) */ typedef enum { UTF8PROC_BOUNDCLASS_START = 0, /**< Start */ UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */ UTF8PROC_BOUNDCLASS_CR = 2, /**< Cr */ UTF8PROC_BOUNDCLASS_LF = 3, /**< Lf */ @@ -362,13 +370,26 @@ UTF8PROC_BOUNDCLASS_T = 8, /**< T */ UTF8PROC_BOUNDCLASS_LV = 9, /**< Lv */ UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */ UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */ UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */ + UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */ + UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */ + UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */ + UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */ + UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */ + UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZJW */ } utf8proc_boundclass_t; /** + * Function pointer type passed to @ref utf8proc_map_custom and + * @ref utf8proc_decompose_custom, which is used to specify a user-defined + * mapping of codepoints to be applied in conjunction with other mappings. + */ +typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data); + +/** * Array containing the byte lengths of a UTF-8 encoded codepoint based * on the first byte. */ UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256]; @@ -471,10 +492,11 @@ * bytes are processed. The result (in the form of 32-bit unicode * codepoints) is written into the buffer being pointed to by * `buffer` (which must contain at least `bufsize` entries). In case of * success, the number of codepoints written is returned; in case of an * error, a negative error code is returned (@ref utf8proc_errmsg). + * See @ref utf8proc_decompose_custom to supply additional transformations. * * If the number of written codepoints would be bigger than `bufsize`, the * required buffer size is returned, while the buffer will be overwritten with * undefined data. */ @@ -482,12 +504,50 @@ const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options ); /** + * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function + * that is called on each codepoint in `str` before any other transformations + * (along with a `custom_data` pointer that is passed through to `custom_func`). + * The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom. + */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, + utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data +); + +/** + * Normalizes the sequence of `length` codepoints pointed to by `buffer` + * in-place (i.e., the result is also stored in `buffer`). + * + * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode. + * @param length the length (in codepoints) of the buffer. + * @param options a bitwise or (`|`) of one or more of the following flags: + * - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS + * - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS + * - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF + * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters + * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite + * codepoints + * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate + * the unicode versioning stability + * + * @return + * In case of success, the length (in codepoints) of the normalized UTF-32 string is + * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg). + * + * @warning The entries of the array pointed to by `str` have to be in the + * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash! + */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options); + +/** * Reencodes the sequence of `length` codepoints pointed to by `buffer` * UTF-8 data in-place (i.e., the result is also stored in `buffer`). + * Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion. * * @param buffer the (native-endian UTF-32) unicode codepoints to re-encode. * @param length the length (in codepoints) of the buffer. * @param options a bitwise or (`|`) of one or more of the following flags: * - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS @@ -496,14 +556,16 @@ * - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters * - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite * codepoints * - @ref UTF8PROC_STABLE - prohibit combining characters that would violate * the unicode versioning stability + * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster * * @return - * In case of success, the length (in bytes) of the resulting UTF-8 string is - * returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg). + * In case of success, the length (in bytes) of the resulting nul-terminated + * UTF-8 string is returned; otherwise, a negative error code is returned + * (@ref utf8proc_errmsg). * * @warning The amount of free space pointed to by `buffer` must * exceed the amount of the input data by one byte, and the * entries of the array pointed to by `str` have to be in the * range `0x0000` to `0x10FFFF`. Otherwise, the program might crash! @@ -511,14 +573,32 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options); /** * Given a pair of consecutive codepoints, return whether a grapheme break is * permitted between them (as defined by the extended grapheme clusters in UAX#29). + * + * @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires + * state to break graphemes. This state can be passed in as a pointer + * in the `state` argument and should initially be set to 0. If the + * state is not passed in (i.e. a null pointer is passed), UAX#29 rules + * GB10/12/13 which require this state will not be applied, essentially + * matching the rules in Unicode 8.0.0. + * + * @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must + * be called IN ORDER on ALL potential breaks in a string. */ -UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2); +UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful( + utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state); +/** + * Same as @ref utf8proc_grapheme_break_stateful, except without support for the + * Unicode 9 additions to the algorithm. Supported for legacy reasons. + */ +UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break( + utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2); + /** * Given a codepoint `c`, return the codepoint of the corresponding * lower-case character, if any; otherwise (if there is no lower-case * variant, or if `c` is not a valid codepoint) return `c`. */ @@ -530,14 +610,21 @@ * variant, or if `c` is not a valid codepoint) return `c`. */ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c); /** + * Given a codepoint `c`, return the codepoint of the corresponding + * title-case character, if any; otherwise (if there is no title-case + * variant, or if `c` is not a valid codepoint) return `c`. + */ +UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c); + +/** * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`, * except that a width of 0 is returned for non-printable codepoints * instead of -1 as in `wcwidth`. - * + * * @note * If you want to check for particular types of non-printable characters, * (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint); @@ -561,11 +648,12 @@ * the length is determined by a NULL terminator, otherwise the * parameter `strlen` is evaluated to determine the string length, but * in any case the result will be NULL terminated (though it might * contain NULL characters with the string if `str` contained NULL * characters). Other flags in the `options` field are passed to the - * functions defined above, and regarded as described. + * functions defined above, and regarded as described. See also + * @ref utfproc_map_custom to supply a custom codepoint transformation. * * In case of success the length of the new string is returned, * otherwise a negative error code is returned. * * @note The memory of the new UTF-8 string will have been allocated @@ -573,10 +661,21 @@ */ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options ); +/** + * Like @ref utf8proc_map, but also takes a `custom_func` mapping function + * that is called on each codepoint in `str` before any other transformations + * (along with a `custom_data` pointer that is passed through to `custom_func`). + * The `custom_func` argument is ignored if it is `NULL`. + */ +UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( + const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options, + utf8proc_custom_func custom_func, void *custom_data +); + /** @name Unicode normalization * * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC * normalized version of the null-terminated string `str`. These * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM @@ -585,17 +684,16 @@ /** @{ */ /** NFD normalization (@ref UTF8PROC_DECOMPOSE). */ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str); /** NFC normalization (@ref UTF8PROC_COMPOSE). */ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str); -/** NFD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */ +/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str); -/** NFD normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ +/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); /** @} */ #ifdef __cplusplus } #endif #endif -