diff options
Diffstat (limited to 'utils/utf8.c')
-rw-r--r-- | utils/utf8.c | 338 |
1 files changed, 246 insertions, 92 deletions
diff --git a/utils/utf8.c b/utils/utf8.c index 0482beb4b..6721305fb 100644 --- a/utils/utf8.c +++ b/utils/utf8.c @@ -28,10 +28,37 @@ #include <strings.h> #include <iconv.h> +/** \todo Once we can enable hubbub on all platforms, these ifdefs must go */ +#ifdef WITH_HUBBUB +#include <parserutils/charset/utf8.h> +#endif + #include "utils/config.h" #include "utils/log.h" #include "utils/utf8.h" +#ifndef WITH_HUBBUB +/** Number of continuation bytes for a given start byte */ +static const uint8_t numContinuations[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, +}; +#endif + static utf8_convert_ret utf8_convert(const char *string, size_t len, const char *from, const char *to, char **result); @@ -45,41 +72,79 @@ static utf8_convert_ret utf8_convert(const char *string, size_t len, * \param l Length of sequence * \return UCS4 character */ -size_t utf8_to_ucs4(const char *s_in, size_t l) +uint32_t utf8_to_ucs4(const char *s_in, size_t l) { - size_t c = 0; - const unsigned char *s = (const unsigned char *) s_in; - - if (!s) - assert(0); - else if (l > 0 && *s < 0x80) - c = *s; - else if (l > 1 && (*s & 0xE0) == 0xC0 && (*(s+1) & 0xC0) == 0x80) - c = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F); - else if (l > 2 && (*s & 0xF0) == 0xE0 && (*(s+1) & 0xC0) == 0x80 && - (*(s+2) & 0xC0) == 0x80) - c = ((*s & 0x0F) << 12) | ((*(s+1) & 0x3F) << 6) | - (*(s+2) & 0x3F); - else if (l > 3 && (*s & 0xF8) == 0xF0 && (*(s+1) & 0xC0) == 0x80 && - (*(s+2) & 0xC0) == 0x80 && (*(s+3) & 0xC0) == 0x80) - c = ((*s & 0x0F) << 18) | ((*(s+1) & 0x3F) << 12) | - ((*(s+2) & 0x3F) << 6) | (*(s+3) & 0x3F); - else if (l > 4 && (*s & 0xFC) == 0xF8 && (*(s+1) & 0xC0) == 0x80 && - (*(s+2) & 0xC0) == 0x80 && (*(s+3) & 0xC0) == 0x80 && - (*(s+4) & 0xC0) == 0x80) - c = ((*s & 0x0F) << 24) | ((*(s+1) & 0x3F) << 18) | - ((*(s+2) & 0x3F) << 12) | ((*(s+3) & 0x3F) << 6) | - (*(s+4) & 0x3F); - else if (l > 5 && (*s & 0xFE) == 0xFC && (*(s+1) & 0xC0) == 0x80 && - (*(s+2) & 0xC0) == 0x80 && (*(s+3) & 0xC0) == 0x80 && - (*(s+4) & 0xC0) == 0x80 && (*(s+5) & 0xC0) == 0x80) - c = ((*s & 0x0F) << 28) | ((*(s+1) & 0x3F) << 24) | - ((*(s+2) & 0x3F) << 18) | ((*(s+3) & 0x3F) << 12) | - ((*(s+4) & 0x3F) << 6) | (*(s+5) & 0x3F); - else +#ifdef WITH_HUBBUB + uint32_t ucs4; + size_t len; + parserutils_error perror; + + perror = parserutils_charset_utf8_to_ucs4((const uint8_t *) s_in, l, + &ucs4, &len); + if (perror != PARSERUTILS_OK) + ucs4 = 0xfffd; + + return ucs4; +#else + const uint8_t *s = (const uint8_t *) s_in; + uint32_t c, min; + uint8_t n; + uint8_t i; + + assert(s != NULL && l > 0); + + c = s[0]; + + if (c < 0x80) { + n = 1; + min = 0; + } else if ((c & 0xE0) == 0xC0) { + c &= 0x1F; + n = 2; + min = 0x80; + } else if ((c & 0xF0) == 0xE0) { + c &= 0x0F; + n = 3; + min = 0x800; + } else if ((c & 0xF8) == 0xF0) { + c &= 0x07; + n = 4; + min = 0x10000; + } else if ((c & 0xFC) == 0xF8) { + c &= 0x03; + n = 5; + min = 0x200000; + } else if ((c & 0xFE) == 0xFC) { + c &= 0x01; + n = 6; + min = 0x4000000; + } else { assert(0); + } + + if (l < n) { + return 0xfffd; + } + + for (i = 1; i < n; i++) { + uint32_t t = s[i]; + + if ((t & 0xC0) != 0x80) { + return 0xfffd; + } + + c <<= 6; + c |= t & 0x3F; + } + + /* Detect overlong sequences, surrogates and fffe/ffff */ + if (c < min || (c >= 0xD800 && c <= 0xDFFF) || + c == 0xFFFE || c == 0xFFFF) { + c = 0xfffd; + } return c; +#endif } /** @@ -92,53 +157,60 @@ size_t utf8_to_ucs4(const char *s_in, size_t l) * \param s Pointer to 6 byte long output buffer * \return Length of multibyte sequence */ -size_t utf8_from_ucs4(size_t c, char *s) +size_t utf8_from_ucs4(uint32_t c, char *s) { - size_t l = 0; +#ifdef WITH_HUBBUB + uint8_t *in = (uint8_t *) s; + size_t len = 6; + parserutils_error perror; + + perror = parserutils_charset_utf8_from_ucs4(c, &in, &len); + if (perror != PARSERUTILS_OK) { + s[0] = 0xef; + s[1] = 0xbf; + s[2] = 0xbd; + return 3; + } - if (c > 0x7FFFFFFF || s == NULL) - assert(0); - else if (c < 0x80) { - *s = (char)c; + return len; +#else + uint8_t *buf; + uint8_t l = 0; + + assert(s != NULL); + + if (c < 0x80) { l = 1; - } - else if (c < 0x800) { - *s = 0xC0 | ((c >> 6) & 0x1F); - *(s+1) = 0x80 | (c & 0x3F); + } else if (c < 0x800) { l = 2; - } - else if (c < 0x10000) { - *s = 0xE0 | ((c >> 12) & 0xF); - *(s+1) = 0x80 | ((c >> 6) & 0x3F); - *(s+2) = 0x80 | (c & 0x3F); + } else if (c < 0x10000) { l = 3; - } - else if (c < 0x200000) { - *s = 0xF0 | ((c >> 18) & 0x7); - *(s+1) = 0x80 | ((c >> 12) & 0x3F); - *(s+2) = 0x80 | ((c >> 6) & 0x3F); - *(s+3) = 0x80 | (c & 0x3F); + } else if (c < 0x200000) { l = 4; - } - else if (c < 0x4000000) { - *s = 0xF8 | ((c >> 24) & 0x3); - *(s+1) = 0x80 | ((c >> 18) & 0x3F); - *(s+2) = 0x80 | ((c >> 12) & 0x3F); - *(s+3) = 0x80 | ((c >> 6) & 0x3F); - *(s+4) = 0x80 | (c & 0x3F); + } else if (c < 0x4000000) { l = 5; - } - else if (c <= 0x7FFFFFFF) { - *s = 0xFC | ((c >> 30) & 0x1); - *(s+1) = 0x80 | ((c >> 24) & 0x3F); - *(s+2) = 0x80 | ((c >> 18) & 0x3F); - *(s+3) = 0x80 | ((c >> 12) & 0x3F); - *(s+4) = 0x80 | ((c >> 6) & 0x3F); - *(s+5) = 0x80 | (c & 0x3F); + } else if (c <= 0x7FFFFFFF) { l = 6; + } else { + assert(0); + } + + buf = (uint8_t *) s; + + if (l == 1) { + buf[0] = (uint8_t) c; + } else { + uint8_t i; + + for (i = l; i > 1; i--) { + buf[i - 1] = 0x80 | (c & 0x3F); + c >>= 6; + } + buf[0] = ~((1 << (8 - l)) - 1) | c; } return l; +#endif } /** @@ -149,30 +221,83 @@ size_t utf8_from_ucs4(size_t c, char *s) */ size_t utf8_length(const char *s) { - const char *__s = s; - int l = 0; - - assert(__s != NULL); - - while (*__s != '\0') { - if ((*__s & 0x80) == 0x00) - __s += 1; - else if ((*__s & 0xE0) == 0xC0) - __s += 2; - else if ((*__s & 0xF0) == 0xE0) - __s += 3; - else if ((*__s & 0xF8) == 0xF0) - __s += 4; - else if ((*__s & 0xFC) == 0xF8) - __s += 5; - else if ((*__s & 0xFE) == 0xFC) - __s += 6; - else + return utf8_bounded_length(s, strlen(s)); +} + +/** + * Calculated the length (in characters) of a bounded UTF-8 string + * + * \param s The string + * \param l Maximum length of input (in bytes) + * \return Length of string, in characters + */ +size_t utf8_bounded_length(const char *s, size_t l) +{ +#ifdef WITH_HUBBUB + size_t len; + parserutils_error perror; + + perror = parserutils_charset_utf8_length((const uint8_t *) s, l, &len); + if (perror != PARSERUTILS_OK) + return 0; + + return len; +#else + const uint8_t *p = (const uint8_t *) s; + const uint8_t *end = p + l; + size_t len = 0; + + assert(s != NULL); + + while (p < end) { + uint32_t c = p[0]; + + if ((c & 0x80) == 0x00) + p += 1; + else if ((c & 0xE0) == 0xC0) + p += 2; + else if ((c & 0xF0) == 0xE0) + p += 3; + else if ((c & 0xF8) == 0xF0) + p += 4; + else if ((c & 0xFC) == 0xF8) + p += 5; + else if ((c & 0xFE) == 0xFC) + p += 6; + else { assert(0); - l++; + } + + len++; } - return l; + return len; +#endif +} + +/** + * Calculate the length (in bytes) of a UTF-8 character + * + * \param s Pointer to start of character + * \return Length of character, in bytes + */ +size_t utf8_char_byte_length(const char *s) +{ +#ifdef WITH_HUBBUB + size_t len; + parserutils_error perror; + + perror = parserutils_charset_utf8_char_byte_length((const uint8_t *) s, + &len); + assert(perror == PARSERUTILS_OK); + + return len; +#else + const uint8_t *p = (const uint8_t *) s; + assert(s != NULL); + + return numContinuations[p[0]] + 1 /* Start byte */; +#endif } /** @@ -184,12 +309,24 @@ size_t utf8_length(const char *s) */ size_t utf8_prev(const char *s, size_t o) { +#ifdef WITH_HUBBUB + uint32_t prev; + parserutils_error perror; + + perror = parserutils_charset_utf8_prev((const uint8_t *) s, o, &prev); + assert(perror == PARSERUTILS_OK); + + return prev; +#else + const uint8_t *p = (const uint8_t *) s; + assert(s != NULL); - while (o != 0 && (s[--o] & 0xC0) == 0x80) + while (o != 0 && (p[--o] & 0xC0) == 0x80) /* do nothing */; return o; +#endif } /** @@ -202,12 +339,29 @@ size_t utf8_prev(const char *s, size_t o) */ size_t utf8_next(const char *s, size_t l, size_t o) { - assert(s != NULL); +#ifdef WITH_HUBBUB + uint32_t next; + parserutils_error perror; - while (o != l && (s[++o] & 0xC0) == 0x80) - /* do nothing */; + perror = parserutils_charset_utf8_next((const uint8_t *) s, l, o, + &next); + assert(perror == PARSERUTILS_OK); + + return next; +#else + const uint8_t *p = (const uint8_t *) s; + + assert(s != NULL && o < l); + + /* Skip current start byte (if present - may be mid-sequence) */ + if (p[o] < 0x80 || (p[o] & 0xC0) == 0xC0) + o++; + + while (o < l && (p[o] & 0xC0) == 0x80) + o++; return o; +#endif } /* Cache of previous iconv conversion descriptor used by utf8_convert */ |