From bdc8b9e4b2063e4b4563938d5077ee3b826cf342 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Wed, 2 May 2018 08:15:02 -0400 Subject: Case folding fixes (#133) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fixes allowing for “Full” folding and NFKC_CaseFold compliance. * Only include C (Common) and F (Full) foldings from CaseFolding.txt. Removed S (Simple) since F & S are specified to be exclusive. * Extend UTF8PROC_IGNORE to also ignore unassigned codepoints (such as \u2065) which are specified as being discarded by NFKC_CF. * Document the changes to UTF8PROC_IGNORE in header. * Add NFKC_CF helper function with documentation. * restore old IGNORE behavior, add UTF8PROC_STRIPNA, rename to utf8proc_NFKC_Casefold, add a test * success message * test that IGNORE does not strip NA * data update * NFKC_Casefold shouldn't strip NA --- utf8proc.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'utf8proc.h') diff --git a/utf8proc.h b/utf8proc.h index 9129853..aa61658 100644 --- a/utf8proc.h +++ b/utf8proc.h @@ -213,6 +213,10 @@ typedef enum { * @ref UTF8PROC_DECOMPOSE */ UTF8PROC_STRIPMARK = (1<<13), + /** + * Strip unassigned codepoints. + */ + UTF8PROC_STRIPNA = (1<<14), } utf8proc_option_t; /** @name Error codes @@ -469,6 +473,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster * - @ref UTF8PROC_LUMP - lump certain different codepoints together * - @ref UTF8PROC_STRIPMARK - remove all character marks + * - @ref UTF8PROC_STRIPNA - remove unassigned codepoints * @param last_boundclass * Pointer to an integer variable containing * the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND @@ -680,8 +685,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( /** @name Unicode normalization * - * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC - * normalized version of the null-terminated string `str`. These + * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or + * NFKC_Casefold normalized version of the null-terminated string `str`. These * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM * combined with @ref UTF8PROC_STABLE and flags indicating the normalization. */ @@ -694,6 +699,11 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str); /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); +/** + * NFKC_Casefold normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT + * and @ref UTF8PROC_CASEFOLD and @ref UTF8PROC_IGNORE). + **/ +UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str); /** @} */ #ifdef __cplusplus -- cgit v1.2.3