summaryrefslogtreecommitdiff
path: root/utf8proc.c
diff options
context:
space:
mode:
author Steven G. Johnson <stevenj@mit.edu> 2018-05-02 08:15:02 -0400
committer GitHub <noreply@github.com> 2018-05-02 08:15:02 -0400
commit bdc8b9e4b2063e4b4563938d5077ee3b826cf342 (patch)
tree b82ecf4a68d8b8841f4cb5aa4f903841f729bb47 /utf8proc.c
parent 48949bd3ebd66bb94a40f4c3fcfb26dd4bf2be2b (diff)
download libutf8proc-bdc8b9e4b2063e4b4563938d5077ee3b826cf342.tar.gz
libutf8proc-bdc8b9e4b2063e4b4563938d5077ee3b826cf342.tar.bz2
Case folding fixes (#133)
* Fixes allowing for “Full” folding and NFKC_CaseFold compliance.
* Only include C (Common) and F (Full) foldings from CaseFolding.txt. Removed S (Simple) since F & S are specified to be exclusive.
* Extend UTF8PROC_IGNORE to also ignore unassigned codepoints (such as \u2065) which are specified as being discarded by NFKC_CF.
* Document the changes to UTF8PROC_IGNORE in header.
* Add NFKC_CF helper function with documentation.
* restore old IGNORE behavior, add UTF8PROC_STRIPNA, rename to utf8proc_NFKC_Casefold, add a test
* success message
* test that IGNORE does not strip NA
* data update
* NFKC_Casefold shouldn't strip NA
Diffstat (limited to 'utf8proc.c')
-rw-r--r-- utf8proc.c 10
1 files changed, 10 insertions, 0 deletions
diff --git a/utf8proc.c b/utf8proc.c
index c0f84d9..279d16f 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -423,6 +423,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
if (options & UTF8PROC_IGNORE) {
if (property->ignorable) return 0;
}
+ if (options & UTF8PROC_STRIPNA) {
+ if (!category) return 0;
+ }
if (options & UTF8PROC_LUMP) {
if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
@@ -752,3 +755,10 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str)
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
return retval;
}
+
+UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str) { /* NFKC_Casefold helper: maps str through utf8proc_map() with the flags below */
+ utf8proc_uint8_t *retval; /* written through &retval by utf8proc_map(); not initialized here */
+ utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | /* return value of utf8proc_map() is ignored */
+ UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE); /* UTF8PROC_STRIPNA deliberately absent: per the commit message, NFKC_Casefold shouldn't strip NA */
+ return retval; /* NOTE(review): value on failure depends on utf8proc_map(), which is not shown here; confirm */
+}