From 41c6b23aab330d019789bf1fbb870c7e74e703bf Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Tue, 28 Jun 2016 16:04:25 -0400 Subject: Unicode 9 updates (#70) * Updates for Unicode 9.0.0 TR29 Changes - New rules GB10/(12/13) are used to combine emoji-zwj sequences/ (force grapheme breaks every two RI codepoints). Unfortunately this breaks statelessness of grapheme-boundary determination. Deal with this by ignoring the problem in utf8proc_grapheme_break, and by hacking in a special case in decompose - ZWJ moved to its own boundclass, update what is now GB9 accordingly. - Add comments to indicate which rule a given case implements - The number of bound classes no longer fits in 4 bits; expand the field to 8 bits and reorganize the fields * Import Unicode 9 data * Update Grapheme break API to expose state override * Bump MAJOR version --- utf8proc.h | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'utf8proc.h') diff --git a/utf8proc.h b/utf8proc.h index a8d53fc..f9f0d92 100644 --- a/utf8proc.h +++ b/utf8proc.h @@ -68,9 +68,9 @@ */ /** @{ */ /** The MAJOR version number (increased when backwards API compatibility is broken). */ -#define UTF8PROC_VERSION_MAJOR 1 +#define UTF8PROC_VERSION_MAJOR 2 /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ -#define UTF8PROC_VERSION_MINOR 3 +#define UTF8PROC_VERSION_MINOR 0 /** The PATCH version (increased for fixes that do not change the API). */ #define UTF8PROC_VERSION_PATCH 0 /** @} */ @@ -259,13 +259,14 @@ typedef struct utf8proc_property_struct { */ unsigned ignorable:1; unsigned control_boundary:1; + /** The width of the codepoint. */ + unsigned charwidth:2; + unsigned pad:2; /** * Boundclass. * @see utf8proc_boundclass_t. */ - unsigned boundclass:4; - /** The width of the codepoint. */ - unsigned charwidth:2; + unsigned boundclass:8; } utf8proc_property_t; /** Unicode categories.
*/ @@ -349,7 +350,7 @@ typedef enum { UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */ } utf8proc_decomp_type_t; -/** Boundclass property. */ +/** Boundclass property. (TR29) */ typedef enum { UTF8PROC_BOUNDCLASS_START = 0, /**< Start */ UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */ @@ -364,6 +365,12 @@ typedef enum { UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */ UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */ UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */ + UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */ + UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */ + UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */ + UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */ + UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */ + UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */ } utf8proc_boundclass_t; /** @@ -513,8 +520,19 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, /** * Given a pair of consecutive codepoints, return whether a grapheme break is * permitted between them (as defined by the extended grapheme clusters in UAX#29). + * + * @param state Beginning with revision 29 of UAX#29 (Unicode 9.0.0), this algorithm requires + * state to break graphemes. This state can be passed in as a pointer + * in the `state` argument and should initially be set to 0. If the + * state is not passed in (i.e. a null pointer is passed), UAX#29 rules + * GB10/12/13 which require this state will not be applied, essentially + * matching the rules in Unicode 8.0.0. + * + * @warning If the state parameter is used, `utf8proc_grapheme_break` must be called + * IN ORDER on ALL potential breaks in a string. */ -UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2); +UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break( + utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state); /** -- cgit v1.2.3