diff options
-rw-r--r-- | NEWS.md | 4 | ||||
-rw-r--r-- | test/printproperty.c | 4 | ||||
-rw-r--r-- | utf8proc.c | 63 | ||||
-rw-r--r-- | utf8proc.h | 77 |
4 files changed, 92 insertions, 56 deletions
@@ -28,6 +28,10 @@ - Most `#defined` constants are now `enums`. +- New preprocessor constants `UTF8PROC_VERSION_MAJOR`, + `UTF8PROC_VERSION_MINOR`, and `UTF8PROC_VERSION_PATCH` for compile-time + detection of the API version. + - Doxygen-formatted documentation. ## Version 1.1.6 ## diff --git a/test/printproperty.c b/test/printproperty.c index 6be8cb1..b876f0c 100644 --- a/test/printproperty.c +++ b/test/printproperty.c @@ -8,6 +8,10 @@ int main(int argc, char **argv) for (i = 1; i < argc; ++i) { int c; + if (!strcmp(argv[i], "-V")) { + printf("utf8proc version %s\n", utf8proc_version()); + continue; + } check(sscanf(argv[i],"%x",&c) == 1, "invalid hex input %s", argv[i]); const utf8proc_property_t *p = utf8proc_get_property(c); printf("U+%s:\n" @@ -1,3 +1,4 @@ +/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */ /* * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany * @@ -84,8 +85,10 @@ DLLEXPORT const int8_t utf8proc_utf8class[256] = { /* Should follow semantic-versioning rules (semver.org) based on API compatibility. (Note that the shared-library version number will be different, being based on ABI compatibility.): */ +#define STRINGIZEx(x) #x +#define STRINGIZE(x) STRINGIZEx(x) DLLEXPORT const char *utf8proc_version(void) { - return "1.2-dev"; + return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "-dev"; } DLLEXPORT const char *utf8proc_errmsg(ssize_t errcode) { @@ -193,54 +196,54 @@ static const utf8proc_property_t *get_property(int32_t uc) { } DLLEXPORT const utf8proc_property_t *utf8proc_get_property(int32_t uc) { - return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc); + return uc < 0 || uc >= 0x110000 ? utf8proc_properties : get_property(uc); } /* return whether there is a grapheme break between boundclasses lbc and tbc */ static bool grapheme_break(int lbc, int tbc) { - return - (lbc == UTF8PROC_BOUNDCLASS_START) ? 
true : - (lbc == UTF8PROC_BOUNDCLASS_CR && - tbc == UTF8PROC_BOUNDCLASS_LF) ? false : - (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : - (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : - (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : - (lbc == UTF8PROC_BOUNDCLASS_L && - (tbc == UTF8PROC_BOUNDCLASS_L || - tbc == UTF8PROC_BOUNDCLASS_V || - tbc == UTF8PROC_BOUNDCLASS_LV || - tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : - ((lbc == UTF8PROC_BOUNDCLASS_LV || - lbc == UTF8PROC_BOUNDCLASS_V) && - (tbc == UTF8PROC_BOUNDCLASS_V || - tbc == UTF8PROC_BOUNDCLASS_T)) ? false : - ((lbc == UTF8PROC_BOUNDCLASS_LVT || - lbc == UTF8PROC_BOUNDCLASS_T) && - tbc == UTF8PROC_BOUNDCLASS_T) ? false : - (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && - tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : - (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK); + return + (lbc == UTF8PROC_BOUNDCLASS_START) ? true : + (lbc == UTF8PROC_BOUNDCLASS_CR && + tbc == UTF8PROC_BOUNDCLASS_LF) ? false : + (lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : + (tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : + (tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false : + (lbc == UTF8PROC_BOUNDCLASS_L && + (tbc == UTF8PROC_BOUNDCLASS_L || + tbc == UTF8PROC_BOUNDCLASS_V || + tbc == UTF8PROC_BOUNDCLASS_LV || + tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : + ((lbc == UTF8PROC_BOUNDCLASS_LV || + lbc == UTF8PROC_BOUNDCLASS_V) && + (tbc == UTF8PROC_BOUNDCLASS_V || + tbc == UTF8PROC_BOUNDCLASS_T)) ? false : + ((lbc == UTF8PROC_BOUNDCLASS_LVT || + lbc == UTF8PROC_BOUNDCLASS_T) && + tbc == UTF8PROC_BOUNDCLASS_T) ? false : + (lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && + tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? 
false : + (tbc != UTF8PROC_BOUNDCLASS_SPACINGMARK); } /* return whether there is a grapheme break between codepoints c1 and c2 */ DLLEXPORT bool utf8proc_grapheme_break(int32_t c1, int32_t c2) { - return grapheme_break(utf8proc_get_property(c1)->boundclass, - utf8proc_get_property(c2)->boundclass); + return grapheme_break(utf8proc_get_property(c1)->boundclass, + utf8proc_get_property(c2)->boundclass); } /* return a character width analogous to wcwidth (except portable and hopefully less buggy than most system wcwidth functions). */ DLLEXPORT int utf8proc_charwidth(int32_t c) { - return utf8proc_get_property(c)->charwidth; + return utf8proc_get_property(c)->charwidth; } DLLEXPORT utf8proc_category_t utf8proc_category(int32_t c) { - return utf8proc_get_property(c)->category; + return utf8proc_get_property(c)->category; } DLLEXPORT const char *utf8proc_category_string(int32_t c) { - static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; - return s[utf8proc_category(c)]; + static const char s[][3] = {"Cn","Lu","Ll","Lt","Lm","Lo","Mn","Mc","Me","Nd","Nl","No","Pc","Pd","Ps","Pe","Pi","Pf","Po","Sm","Sc","Sk","So","Zs","Zl","Zp","Cc","Cf","Cs","Co"}; + return s[utf8proc_category(c)]; } #define utf8proc_decompose_lump(replacement_uc) \ @@ -24,31 +24,27 @@ /** * @mainpage * - * uf8proc is a tool for processing UTF-8 strings, with the following features: + * utf8proc is a free/open-source (MIT/expat licensed) C library + * providing Unicode normalization, case-folding, and other operations + * for strings in the UTF-8 encoding, supporting Unicode version + * 7.0.0. See the utf8proc home page (http://julialang.org/utf8proc/) + * for downloads and other information, or the source code on github + * (https://github.com/JuliaLang/utf8proc). 
* - * - decomposing and composing of strings - * - replacing compatibility characters with their equivalents - * - grapheme segmentation - * - stripping of "default ignorable characters" - * like SOFT-HYPHEN or ZERO-WIDTH-SPACE - * - folding of certain characters for string comparison - * (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-") - * (see "LUMP" option) - * - optional rejection of strings containing non-assigned code points - * - stripping of control characters - * - stripping of character marks (accents, etc.) - * - transformation of LF, CRLF, CR and NEL to line-feed (LF) - * or to the unicode chararacters for paragraph separation (PS) - * or line separation (LS). - * - unicode case folding (for case insensitive string comparisons) - * - rejection of invalid UTF-8 data - * (i.e. UTF-8 encoded UTF-16 surrogates) - * - support for korean hangul characters - * - character widths + * For the utf8proc API documentation, see: @ref utf8proc.h * - * Unicode Version 7.0.0 is supported. + * The features of utf8proc include: * - * See @ref utf8proc.h for the API. 
+ * - Transformation of strings (@ref utf8proc_map) to: + * - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character) + * - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT) + * - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK) + * - case-folding (@ref UTF8PROC_CASEFOLD) + * - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC + * - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND) + * - Character-width computation: @ref utf8proc_charwidth + * - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string + * - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8. */ /** @file */ @@ -56,6 +52,27 @@ #ifndef UTF8PROC_H #define UTF8PROC_H +/** @name API version + * + * The utf8proc API version MAJOR.MINOR.PATCH, following + * semantic-versioning rules (http://semver.org) based on API + * compatibility. + * + * This is also returned at runtime by @ref utf8proc_version; however, the + * runtime version may append a string like "-dev" to the version number + * for prerelease versions. + * + * @note The shared-library version number in the Makefile may be different, + * being based on ABI compatibility rather than API compatibility. + */ +/** @{ */ +/** The MAJOR version number (increased when backwards API compatibility is broken). */ +#define UTF8PROC_VERSION_MAJOR 1 +/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */ +#define UTF8PROC_VERSION_MINOR 2 +/** The PATCH version (increased for fixes that do not change the API). 
*/ +#define UTF8PROC_VERSION_PATCH 0 +/** @} */ #include <stdlib.h> #include <sys/types.h> @@ -114,7 +131,7 @@ typedef enum { UTF8PROC_COMPOSE = (1<<3), /** Return a result with decomposed characters. */ UTF8PROC_DECOMPOSE = (1<<4), - /** Strip "default ignorable characters". */ + /** Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. */ UTF8PROC_IGNORE = (1<<5), /** Return an error, if the input contains unassigned code points. */ UTF8PROC_REJECTNA = (1<<6), @@ -337,7 +354,9 @@ typedef enum { DLLEXPORT extern const int8_t utf8proc_utf8class[256]; /** - * Returns the version as a string. + * Returns the utf8proc API version as a string MAJOR.MINOR.PATCH + * (http://semver.org format), possibly with a "-dev" suffix for + * development versions. */ DLLEXPORT const char *utf8proc_version(void); @@ -524,15 +543,21 @@ DLLEXPORT ssize_t utf8proc_map( const uint8_t *str, ssize_t strlen, uint8_t **dstptr, utf8proc_option_t options ); -/** @name Normalized versions. +/** @name Unicode normalization * * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC - * normalized version of the null-terminated string 'str'. + * normalized version of the null-terminated string 'str'. These + * are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM + * combined with @ref UTF8PROC_STABLE and flags indicating the normalization. */ /** @{ */ +/** NFD normalization (@ref UTF8PROC_DECOMPOSE). */ DLLEXPORT uint8_t *utf8proc_NFD(const uint8_t *str); +/** NFC normalization (@ref UTF8PROC_COMPOSE). */ DLLEXPORT uint8_t *utf8proc_NFC(const uint8_t *str); +/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */ DLLEXPORT uint8_t *utf8proc_NFKD(const uint8_t *str); +/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */ DLLEXPORT uint8_t *utf8proc_NFKC(const uint8_t *str); /** @} */ |