From 41c6b23aab330d019789bf1fbb870c7e74e703bf Mon Sep 17 00:00:00 2001
From: Keno Fischer
Date: Tue, 28 Jun 2016 16:04:25 -0400
Subject: Unicode 9 updates (#70)

* Updates for Unicode 9.0.0 TR29 Changes

- New rules GB10/(12/13) are used to combine emoji-zwj sequences/
  (force grapheme breaks every two RI codepoints).
  Unfortunately this breaks statelessness of grapheme-boundary
  determination. Deal with this by ignoring the problem in
  utf8proc_grapheme_break, and by hacking in a special case in decompose
- ZWJ moved to its own boundclass, update what is now GB9 accordingly.
- Add comments to indicate which rule a given case implements
- The Number of bound classes Now exceeds 4 bits, expand to 8 and reorganize fields

* Import Unicode 9 data

* Update Grapheme break API to expose state override

* Bump MAJOR version
---
 utf8proc.c | 124 ++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 94 insertions(+), 30 deletions(-)

(limited to 'utf8proc.c')

diff --git a/utf8proc.c b/utf8proc.c
index dc1000a..08f8a92 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -233,36 +233,101 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
   return uc < 0 || uc >= 0x110000 ? utf8proc_properties : unsafe_get_property(uc);
 }
 
-/* return whether there is a grapheme break between boundclasses lbc and tbc */
-static utf8proc_bool grapheme_break(int lbc, int tbc) {
-  return
-    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :
-    (lbc == UTF8PROC_BOUNDCLASS_CR &&
-     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
-    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :
-    (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
-    (lbc == UTF8PROC_BOUNDCLASS_L &&
-     (tbc == UTF8PROC_BOUNDCLASS_L ||
-      tbc == UTF8PROC_BOUNDCLASS_V ||
-      tbc == UTF8PROC_BOUNDCLASS_LV ||
-      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
-    ((lbc == UTF8PROC_BOUNDCLASS_LV ||
-      lbc == UTF8PROC_BOUNDCLASS_V) &&
-     (tbc == UTF8PROC_BOUNDCLASS_V ||
-      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
-    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||
-      lbc == UTF8PROC_BOUNDCLASS_T) &&
-     tbc == UTF8PROC_BOUNDCLASS_T) ? false :
-    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&
-     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :
-    (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK);
+/* return whether there is a grapheme break between boundclasses lbc and tbc
+   (according to the definition of extended grapheme clusters)
+
+  Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
+    http://www.unicode.org/reports/tr29/tr29-29.html
+
+  CAVEATS:
+   Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
+   and GB 12/13 (regional indicator code points) require knowledge of previous characters
+   and are thus not handled by this function. This may result in an incorrect break before
+   an E_Modifier class codepoint and an incorrectly missing break between two
+   REGIONAL_INDICATOR class code points if such support does not exist in the caller.
+
+   See the special support in grapheme_break_extended, for required bookkeeping by the caller.
+*/
+static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
+  return
+    (lbc == UTF8PROC_BOUNDCLASS_START) ? true :                                     // GB1
+    (lbc == UTF8PROC_BOUNDCLASS_CR &&                                               // GB3
+     tbc == UTF8PROC_BOUNDCLASS_LF) ? false :                                       // ---
+    (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB4
+    (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true :  // GB5
+    (lbc == UTF8PROC_BOUNDCLASS_L &&                                                // GB6
+     (tbc == UTF8PROC_BOUNDCLASS_L ||                                               // ---
+      tbc == UTF8PROC_BOUNDCLASS_V ||                                               // ---
+      tbc == UTF8PROC_BOUNDCLASS_LV ||                                              // ---
+      tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :                                    // ---
+    ((lbc == UTF8PROC_BOUNDCLASS_LV ||                                              // GB7
+      lbc == UTF8PROC_BOUNDCLASS_V) &&                                              // ---
+     (tbc == UTF8PROC_BOUNDCLASS_V ||                                               // ---
+      tbc == UTF8PROC_BOUNDCLASS_T)) ? false :                                      // ---
+    ((lbc == UTF8PROC_BOUNDCLASS_LVT ||                                             // GB8
+      lbc == UTF8PROC_BOUNDCLASS_T) &&                                              // ---
+     tbc == UTF8PROC_BOUNDCLASS_T) ? false :                                        // ---
+    (tbc == UTF8PROC_BOUNDCLASS_EXTEND ||                                           // GB9
+     tbc == UTF8PROC_BOUNDCLASS_ZWJ ||                                              // ---
+     tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK ||                                      // GB9a
+     lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false :                                  // GB9b
+    ((lbc == UTF8PROC_BOUNDCLASS_E_BASE ||                                          // GB10 (requires additional handling below)
+      lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&                                     // ----
+     tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false :                               // ----
+    (lbc == UTF8PROC_BOUNDCLASS_ZWJ &&                                              // GB11
+     (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ ||                                  // ----
+      tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false :                             // ----
+    (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR &&                               // GB12/13 (requires additional handling below)
+     tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false :                       // ----
+    true;                                                                           // GB999
 }
 
-/* return whether there is a grapheme break between codepoints c1 and c2 */
-UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, utf8proc_int32_t c2) {
-  return grapheme_break(utf8proc_get_property(c1)->boundclass,
-                        utf8proc_get_property(c2)->boundclass);
+/* Stateful wrapper around grapheme_break_simple that adds the bookkeeping
+   needed for GB10 and GB12/13.
+
+   lbc, tbc: boundclasses of the codepoints left and right of the candidate break.
+   state:    may be NULL. Otherwise, when *state is not UTF8PROC_BOUNDCLASS_START
+             it replaces lbc for the break decision, and *state is then updated
+             to the boundclass the next call should treat as its left side.
+
+   Returns true if a break is permitted between the two codepoints.
+*/
+static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
+{
+  int lbc_override = lbc;
+  if (state && *state != UTF8PROC_BOUNDCLASS_START)
+    lbc_override = *state;
+  utf8proc_bool break_permitted = grapheme_break_simple(lbc_override, tbc);
+  if (state) {
+    // Special support for GB 12/13 made possible by GB999. After two RI
+    // class codepoints we want to force a break. Do this by resetting the
+    // second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
+    // after that character according to GB999 (unless of course such a break is
+    // forbidden by a different rule such as GB9).
+    if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
+      *state = UTF8PROC_BOUNDCLASS_OTHER;
+    // Special support for GB10. Fold any EXTEND codepoints into the previous
+    // boundclass if we're dealing with an emoji base boundclass.
+    else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
+              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
+             tbc == UTF8PROC_BOUNDCLASS_EXTEND)
+      *state = UTF8PROC_BOUNDCLASS_E_BASE;
+    else
+      *state = tbc;
+  }
+  return break_permitted;
+}
+
+/* return whether there is a grapheme break between codepoints c1 and c2;
+   state may be NULL (stateless rules only) or point to a value that is
+   passed unchanged between successive calls over a string, see
+   grapheme_break_extended */
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
+    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
+
+  return grapheme_break_extended(utf8proc_get_property(c1)->boundclass,
+                                 utf8proc_get_property(c2)->boundclass,
+                                 state);
 }
 
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
@@ -388,8 +453,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
   if (options & UTF8PROC_CHARBOUND) {
     utf8proc_bool boundary;
     int tbc = property->boundclass;
-    boundary = grapheme_break(*last_boundclass, tbc);
-    *last_boundclass = tbc;
+    boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
     if (boundary) {
       if (bufsize >= 1) dst[0] = 0xFFFF;
       if (bufsize >= 2) dst[1] = uc;
-- 
cgit v1.2.3