Case folding fixes (#133)

* Fixes allowing for “Full” folding and NFKC_CaseFold compliance.
* Only include C (Common) and F (Full) foldings from CaseFolding.txt. Removed S (Simple) since F & S are specified to be exclusive.
* Extend UTF8PROC_IGNORE to also ignore unassigned codepoints (such as \u2065) which are specified as being discarded by NFKC_CF.
* Document the changes to UTF8PROC_IGNORE in header.
* Add NFKC_CF helper function with documentation.
* restore old IGNORE behavior, add UTF8PROC_STRIPNA, rename to utf8proc_NFKC_Casefold, add a test
* success message
* test that IGNORE does not strip NA
* data update
* NFKC_Casefold shouldn't strip NA
author: Steven G. Johnson <stevenj@mit.edu> 2018-05-02 08:15:02 -0400
committer: GitHub <noreply@github.com> 2018-05-02 08:15:02 -0400
commit: bdc8b9e4b2063e4b4563938d5077ee3b826cf342 (patch)
tree: b82ecf4a68d8b8841f4cb5aa4f903841f729bb47 /utf8proc.h
parent: 48949bd3ebd66bb94a40f4c3fcfb26dd4bf2be2b (diff)
download: libutf8proc-bdc8b9e4b2063e4b4563938d5077ee3b826cf342.tar.gz
libutf8proc-bdc8b9e4b2063e4b4563938d5077ee3b826cf342.tar.bz2
1 files changed, 12 insertions, 2 deletions
diff --git a/utf8proc.h b/utf8proc.h
index 9129853..aa61658 100644
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -213,6 +213,10 @@ typedef enum {
    *       @ref UTF8PROC_DECOMPOSE
    */
   UTF8PROC_STRIPMARK = (1<<13),
+  /**
+   * Strip unassigned codepoints.
+   */
+  UTF8PROC_STRIPNA    = (1<<14),
 } utf8proc_option_t;
 
 /** @name Error codes
@@ -469,6 +473,7 @@ UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int
  * - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
  * - @ref UTF8PROC_LUMP      - lump certain different codepoints together
  * - @ref UTF8PROC_STRIPMARK - remove all character marks
+ * - @ref UTF8PROC_STRIPNA   - remove unassigned codepoints
  * @param last_boundclass
  * Pointer to an integer variable containing
  * the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
@@ -680,8 +685,8 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
 
 /** @name Unicode normalization
  *
- * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
- * normalized version of the null-terminated string `str`.  These
+ * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD, NFKC or
+ * NFKC_Casefold normalized version of the null-terminated string `str`.  These
  * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
  * combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
  */
@@ -694,6 +699,11 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
 /** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
+/**
+ * NFKC_Casefold normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT
+ * and @ref UTF8PROC_CASEFOLD and @ref UTF8PROC_IGNORE).
+ **/
+UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str);
 /** @} */
 
 #ifdef __cplusplus
author	Steven G. Johnson <stevenj@mit.edu>	2018-05-02 08:15:02 -0400
committer	GitHub <noreply@github.com>	2018-05-02 08:15:02 -0400
commit	bdc8b9e4b2063e4b4563938d5077ee3b826cf342 (patch)
tree	b82ecf4a68d8b8841f4cb5aa4f903841f729bb47 /utf8proc.h
parent	48949bd3ebd66bb94a40f4c3fcfb26dd4bf2be2b (diff)
download	libutf8proc-bdc8b9e4b2063e4b4563938d5077ee3b826cf342.tar.gz libutf8proc-bdc8b9e4b2063e4b4563938d5077ee3b826cf342.tar.bz2