summary | refs | log | tree | commit | diff
path: root/utf8proc.c
diff options
context:
space:
mode:
author: Steven G. Johnson <stevenj@mit.edu> 2018-07-24 13:25:51 -0400
committer: Steven G. Johnson <stevenj@mit.edu> 2018-07-24 13:25:51 -0400
commit: e0295be467d15e7abec2af275bcca30dc816bc9e (patch)
tree: 370dcfc22d38a3ed302c1beeafa2ee5bf1b52db7 /utf8proc.c
parent: 98e5529a0a6cd4dd09a8885029253f26c677c85f (diff)
parent: d4a58cfec5345bbb2bb0db1e85172a8cff278da7 (diff)
download: libutf8proc-e0295be467d15e7abec2af275bcca30dc816bc9e.tar.gz
download: libutf8proc-e0295be467d15e7abec2af275bcca30dc816bc9e.tar.bz2
Merge branch 'master' of https://github.com/JuliaLang/utf8proc
Diffstat (limited to 'utf8proc.c')
-rw-r--r-- utf8proc.c 23
1 files changed, 11 insertions, 12 deletions
diff --git a/utf8proc.c b/utf8proc.c
index 5e1bdce..413d04d 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -271,12 +271,8 @@ static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
- ((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
- lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
- tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
- (lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
- (tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
- tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
+ (lbc == UTF8PROC_BOUNDCLASS_E_ZWG && // GB11 (requires additional handling below)
+ tbc == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) ? false : // ----
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
true; // GB999
@@ -295,12 +291,15 @@ static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t
// forbidden by a different rule such as GB9).
if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR)
*state = UTF8PROC_BOUNDCLASS_OTHER;
- // Special support for GB10. Fold any EXTEND codepoints into the previous
- // boundclass if we're dealing with an emoji base boundclass.
- else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
- *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
- tbc == UTF8PROC_BOUNDCLASS_EXTEND)
- *state = UTF8PROC_BOUNDCLASS_E_BASE;
+ // Special support for GB11 (emoji extend* zwj / emoji)
+ else if (*state == UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC) {
+ if (tbc == UTF8PROC_BOUNDCLASS_EXTEND) // fold EXTEND codepoints into emoji
+ *state = UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC;
+ else if (tbc == UTF8PROC_BOUNDCLASS_ZWJ)
+ *state = UTF8PROC_BOUNDCLASS_E_ZWG; // state to record emoji+zwg combo
+ else
+ *state = tbc;
+ }
else
*state = tbc;
}