diff options
author | Steven G. Johnson <stevenj@mit.edu> | 2018-07-24 13:18:48 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-07-24 13:18:48 -0400 |
commit | d4a58cfec5345bbb2bb0db1e85172a8cff278da7 (patch) | |
tree | 01a7a3f741550c3a1dbec3b49a70f5d5f2061d6f /utf8proc.c | |
parent | 02f4e1890cf8135b609b404c58ac7e8b27136ad6 (diff) | |
download | libutf8proc-d4a58cfec5345bbb2bb0db1e85172a8cff278da7.tar.gz libutf8proc-d4a58cfec5345bbb2bb0db1e85172a8cff278da7.tar.bz2 |
update data and algorithms for Unicode 11 (#140)
Diffstat (limited to 'utf8proc.c')
-rw-r--r-- | utf8proc.c | 23 |
1 file changed, 11 insertions, 12 deletions
```diff
@@ -271,12 +271,8 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
      tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
      tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
      lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
-    ((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
-      lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
-     tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
-    (lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
-     (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
-      tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
+    (lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below)
+     tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
     (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
      tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
     true; // GB999
@@ -295,12 +291,15 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
     // forbidden by a different rule such as GB9).
     if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
       *state = UTF8PROC_BOUNDCLASS_OTHER;
-    // Special support for GB10. Fold any EXTEND codepoints into the previous
-    // boundclass if we're dealing with an emoji base boundclass.
-    else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
-              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
-             tbc == UTF8PROC_BOUNDCLASS_EXTEND)
-      *state = UTF8PROC_BOUNDCLASS_E_BASE;
+    // Special support for GB11 (emoji extend* zwj / emoji)
+    else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
+      if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
+        *state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
+      else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
+        *state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
+      else
+        *state = tbc;
+    }
     else
       *state = tbc;
   }
```