From 4603e00cfc72c58056a18962ea43ebffaf89ec30 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Sat, 30 Mar 2019 15:22:25 -0400 Subject: fix CHARBOUND option for non-characters (#149) --- utf8proc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'utf8proc.c') diff --git a/utf8proc.c b/utf8proc.c index f03126c..98e754d 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -196,9 +196,13 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, ut } else return 0; } -/* internal "unsafe" version that does not check whether uc is in range */ -static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { +/* internal version used for inserting 0xff bytes between graphemes */ +static utf8proc_ssize_t charbound_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) { if (uc < 0x00) { + if (uc == -1) { /* internal value used for grapheme breaks */ + dst[0] = (utf8proc_uint8_t)0xFF; + return 1; + } return 0; } else if (uc < 0x80) { dst[0] = (utf8proc_uint8_t)uc; @@ -207,12 +211,6 @@ static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6)); dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F)); return 2; - } else if (uc == 0xFFFF) { - dst[0] = (utf8proc_uint8_t)0xFF; - return 1; - } else if (uc == 0xFFFE) { - dst[0] = (utf8proc_uint8_t)0xFE; - return 1; } else if (uc < 0x10000) { dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12)); dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F)); @@ -480,7 +478,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, int tbc = property->boundclass; boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass); if (boundary) { - if (bufsize >= 1) dst[0] = 0xFFFF; + if (bufsize >= 1) dst[0] = -1; /* sentinel value for grapheme break */ if (bufsize >= 2) dst[1] = uc; return 2; } @@ -686,7 +684,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, if (options & UTF8PROC_CHARBOUND) { for (rpos = 0; rpos < length; rpos++) { uc = buffer[rpos]; - wpos += unsafe_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); + wpos += charbound_encode_char(uc, ((utf8proc_uint8_t *)buffer) + wpos); } } else { for (rpos = 0; rpos < length; rpos++) { -- cgit v1.2.3