summaryrefslogtreecommitdiff
path: root/utf8proc.h
diff options
context:
space:
mode:
authorKeno Fischer <kfischer+github@college.harvard.edu>2016-06-28 16:04:25 -0400
committerSteven G. Johnson <stevenj@mit.edu>2016-06-28 16:04:25 -0400
commit41c6b23aab330d019789bf1fbb870c7e74e703bf (patch)
tree15c109853d5d5dba78b0002897368501b94d2fc0 /utf8proc.h
parent3d0576a9b9669a6e9fd170ffba3d3838d46986df (diff)
downloadlibutf8proc-41c6b23aab330d019789bf1fbb870c7e74e703bf.tar.gz
libutf8proc-41c6b23aab330d019789bf1fbb870c7e74e703bf.tar.bz2
Unicode 9 updates (#70)
* Updates for Unicode 9.0.0 TR29 Changes - New rules GB10/(12/13) are used to combine emoji-zwj sequences/ (force grapheme breaks every two RI codepoints). Unfortunately this breaks statelessness of grapheme-boundary determination. Deal with this by ignoring the problem in utf8proc_grapheme_break, and by hacking in a special case in decompose - ZWJ moved to its own boundclass, update what is now GB9 accordingly. - Add comments to indicate which rule a given case implements - The Number of bound classes Now exceeds 4 bits, expand to 8 and reorganize fields * Import Unicode 9 data * Update Grapheme break API to expose state override * Bump MAJOR version
Diffstat (limited to 'utf8proc.h')
-rw-r--r--utf8proc.h32
1 files changed, 25 insertions, 7 deletions
diff --git a/utf8proc.h b/utf8proc.h
index a8d53fc..f9f0d92 100644
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -68,9 +68,9 @@
*/
/** @{ */
/** The MAJOR version number (increased when backwards API compatibility is broken). */
-#define UTF8PROC_VERSION_MAJOR 1
+#define UTF8PROC_VERSION_MAJOR 2
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
-#define UTF8PROC_VERSION_MINOR 3
+#define UTF8PROC_VERSION_MINOR 0
/** The PATCH version (increased for fixes that do not change the API). */
#define UTF8PROC_VERSION_PATCH 0
/** @} */
@@ -259,13 +259,14 @@ typedef struct utf8proc_property_struct {
*/
unsigned ignorable:1;
unsigned control_boundary:1;
+ /** The width of the codepoint. */
+ unsigned charwidth:2;
+ unsigned pad:2;
/**
* Boundclass.
* @see utf8proc_boundclass_t.
*/
- unsigned boundclass:4;
- /** The width of the codepoint. */
- unsigned charwidth:2;
+ unsigned boundclass:8;
} utf8proc_property_t;
/** Unicode categories. */
@@ -349,7 +350,7 @@ typedef enum {
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
} utf8proc_decomp_type_t;
-/** Boundclass property. */
+/** Boundclass property. (TR29) */
typedef enum {
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
@@ -364,6 +365,12 @@ typedef enum {
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
+ UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
+ UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
+ UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
+ UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
+ UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
+ UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
} utf8proc_boundclass_t;
/**
@@ -513,8 +520,19 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
/**
* Given a pair of consecutive codepoints, return whether a grapheme break is
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
+ *
+ * @param state Beginning with revision 29 of UAX#29 (Unicode 9.0.0), this algorithm
+ * requires state to break graphemes. This state can be passed in as a pointer
+ * in the `state` argument, and the pointed-to value should initially be set to 0.
+ * If the state is not passed in (i.e. a null pointer is passed), the UAX#29 rules
+ * GB10/12/13, which require this state, will not be applied, essentially
+ * matching the rules in Unicode 8.0.0.
+ *
+ * @warning If the state parameter is used, `utf8proc_grapheme_break` must be called
+ * IN ORDER on ALL potential breaks in a string.
*/
-UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
+UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
+ utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
/**