summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author John Mark Bell <jmb@netsurf-browser.org> 2009-02-17 23:44:17 +0000
committer John Mark Bell <jmb@netsurf-browser.org> 2009-02-17 23:44:17 +0000
commit 41d2eca3b12de0c23f18d8f54918d4e71c1a8b56 (patch)
tree 156c1240ec0f3fd246947daf6d3421e508c6b4a8
parent d3c890941330d6d0ea5e24a661a2d03f9369b045 (diff)
download netsurf-41d2eca3b12de0c23f18d8f54918d4e71c1a8b56.tar.gz
netsurf-41d2eca3b12de0c23f18d8f54918d4e71c1a8b56.tar.bz2
Port our UTF-8 routines to parserutils (only enabled when building against Hubbub).
Sync our temporary internal copies of these functions with parserutils (which is rather better tested and fixes a number of known bugs in NetSurf's previous UTF-8 handling). Ideally, this will be the only place in NetSurf that has any dependency on parserutils, so port the amiga font code to our internal APIs.
svn path=/trunk/netsurf/; revision=6550
-rw-r--r-- amiga/font.c 10
-rw-r--r-- utils/utf8.c 338
-rw-r--r-- utils/utf8.h 9
3 files changed, 257 insertions, 100 deletions
diff --git a/amiga/font.c b/amiga/font.c
index 48a232ff7..7d1da082b 100644
--- a/amiga/font.c
+++ b/amiga/font.c
@@ -33,8 +33,6 @@
#include <proto/exec.h>
#include <graphics/blitattr.h>
#include "amiga/options.h"
-#include <parserutils/charset/utf8.h>
-#include <parserutils/charset/utf16.h>
#include <proto/utility.h>
static struct OutlineFont *of[CSS_FONT_FAMILY_NOT_SET];
@@ -123,7 +121,7 @@ bool nsfont_position_in_string(const struct css_style *style,
uint8 *utf8;
uint32 co = 0;
- parserutils_charset_utf8_length(string, length, &len);
+ len = utf8_bounded_length(string, length);
if(utf8_to_enc(string,"UTF-16",length,&utf16) != UTF8_CONVERT_OK) return;
outf16 = utf16;
@@ -143,7 +141,7 @@ bool nsfont_position_in_string(const struct css_style *style,
{
*actual_x = tx;
if(utf8_from_enc(utf16,"UTF-16",4,&utf8) != UTF8_CONVERT_OK) return;
- parserutils_charset_utf8_char_byte_length(utf8,&utf8len);
+ utf8len = utf8_char_byte_length(utf8);
free(utf8);
if(x<tx+glyph->glm_X1)
@@ -235,7 +233,7 @@ bool nsfont_split(const struct css_style *style,
uint32 tx=0,i=0;
size_t len;
- parserutils_charset_utf8_length(string, length, &len);
+ len = utf8_bounded_length(string, length);
if(utf8_to_enc(string,"UTF-16",length,&utf16) != UTF8_CONVERT_OK) return;
outf16 = utf16;
if(!(ofont = ami_open_outline_font(style))) return 0;
@@ -472,7 +470,7 @@ ULONG ami_unicode_text(struct RastPort *rp,char *string,ULONG length,struct css_
if(!string || string[0]=='\0') return 0;
if(!length) return 0;
- parserutils_charset_utf8_length(string, length, &len);
+ len = utf8_bounded_length(string, length);
if(utf8_to_enc(string,"UTF-16",length,&utf16) != UTF8_CONVERT_OK) return 0;
outf16 = utf16;
if(!(ofont = ami_open_outline_font(style))) return 0;
diff --git a/utils/utf8.c b/utils/utf8.c
index 0482beb4b..6721305fb 100644
--- a/utils/utf8.c
+++ b/utils/utf8.c
@@ -28,10 +28,37 @@
#include <strings.h>
#include <iconv.h>
+/** \todo Once we can enable hubbub on all platforms, these ifdefs must go */
+#ifdef WITH_HUBBUB
+#include <parserutils/charset/utf8.h>
+#endif
+
#include "utils/config.h"
#include "utils/log.h"
#include "utils/utf8.h"
+#ifndef WITH_HUBBUB
+/** Number of continuation bytes for a given start byte */
+static const uint8_t numContinuations[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
+};
+#endif
+
static utf8_convert_ret utf8_convert(const char *string, size_t len,
const char *from, const char *to, char **result);
@@ -45,41 +72,79 @@ static utf8_convert_ret utf8_convert(const char *string, size_t len,
* \param l Length of sequence
* \return UCS4 character
*/
-size_t utf8_to_ucs4(const char *s_in, size_t l)
+uint32_t utf8_to_ucs4(const char *s_in, size_t l)
{
- size_t c = 0;
- const unsigned char *s = (const unsigned char *) s_in;
-
- if (!s)
- assert(0);
- else if (l > 0 && *s < 0x80)
- c = *s;
- else if (l > 1 && (*s & 0xE0) == 0xC0 && (*(s+1) & 0xC0) == 0x80)
- c = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F);
- else if (l > 2 && (*s & 0xF0) == 0xE0 && (*(s+1) & 0xC0) == 0x80 &&
- (*(s+2) & 0xC0) == 0x80)
- c = ((*s & 0x0F) << 12) | ((*(s+1) & 0x3F) << 6) |
- (*(s+2) & 0x3F);
- else if (l > 3 && (*s & 0xF8) == 0xF0 && (*(s+1) & 0xC0) == 0x80 &&
- (*(s+2) & 0xC0) == 0x80 && (*(s+3) & 0xC0) == 0x80)
- c = ((*s & 0x0F) << 18) | ((*(s+1) & 0x3F) << 12) |
- ((*(s+2) & 0x3F) << 6) | (*(s+3) & 0x3F);
- else if (l > 4 && (*s & 0xFC) == 0xF8 && (*(s+1) & 0xC0) == 0x80 &&
- (*(s+2) & 0xC0) == 0x80 && (*(s+3) & 0xC0) == 0x80 &&
- (*(s+4) & 0xC0) == 0x80)
- c = ((*s & 0x0F) << 24) | ((*(s+1) & 0x3F) << 18) |
- ((*(s+2) & 0x3F) << 12) | ((*(s+3) & 0x3F) << 6) |
- (*(s+4) & 0x3F);
- else if (l > 5 && (*s & 0xFE) == 0xFC && (*(s+1) & 0xC0) == 0x80 &&
- (*(s+2) & 0xC0) == 0x80 && (*(s+3) & 0xC0) == 0x80 &&
- (*(s+4) & 0xC0) == 0x80 && (*(s+5) & 0xC0) == 0x80)
- c = ((*s & 0x0F) << 28) | ((*(s+1) & 0x3F) << 24) |
- ((*(s+2) & 0x3F) << 18) | ((*(s+3) & 0x3F) << 12) |
- ((*(s+4) & 0x3F) << 6) | (*(s+5) & 0x3F);
- else
+#ifdef WITH_HUBBUB
+ uint32_t ucs4;
+ size_t len;
+ parserutils_error perror;
+
+ perror = parserutils_charset_utf8_to_ucs4((const uint8_t *) s_in, l,
+ &ucs4, &len);
+ if (perror != PARSERUTILS_OK)
+ ucs4 = 0xfffd;
+
+ return ucs4;
+#else
+ const uint8_t *s = (const uint8_t *) s_in;
+ uint32_t c, min;
+ uint8_t n;
+ uint8_t i;
+
+ assert(s != NULL && l > 0);
+
+ c = s[0];
+
+ if (c < 0x80) {
+ n = 1;
+ min = 0;
+ } else if ((c & 0xE0) == 0xC0) {
+ c &= 0x1F;
+ n = 2;
+ min = 0x80;
+ } else if ((c & 0xF0) == 0xE0) {
+ c &= 0x0F;
+ n = 3;
+ min = 0x800;
+ } else if ((c & 0xF8) == 0xF0) {
+ c &= 0x07;
+ n = 4;
+ min = 0x10000;
+ } else if ((c & 0xFC) == 0xF8) {
+ c &= 0x03;
+ n = 5;
+ min = 0x200000;
+ } else if ((c & 0xFE) == 0xFC) {
+ c &= 0x01;
+ n = 6;
+ min = 0x4000000;
+ } else {
assert(0);
+ }
+
+ if (l < n) {
+ return 0xfffd;
+ }
+
+ for (i = 1; i < n; i++) {
+ uint32_t t = s[i];
+
+ if ((t & 0xC0) != 0x80) {
+ return 0xfffd;
+ }
+
+ c <<= 6;
+ c |= t & 0x3F;
+ }
+
+ /* Detect overlong sequences, surrogates and fffe/ffff */
+ if (c < min || (c >= 0xD800 && c <= 0xDFFF) ||
+ c == 0xFFFE || c == 0xFFFF) {
+ c = 0xfffd;
+ }
return c;
+#endif
}
/**
@@ -92,53 +157,60 @@ size_t utf8_to_ucs4(const char *s_in, size_t l)
* \param s Pointer to 6 byte long output buffer
* \return Length of multibyte sequence
*/
-size_t utf8_from_ucs4(size_t c, char *s)
+size_t utf8_from_ucs4(uint32_t c, char *s)
{
- size_t l = 0;
+#ifdef WITH_HUBBUB
+ uint8_t *in = (uint8_t *) s;
+ size_t len = 6;
+ parserutils_error perror;
+
+ perror = parserutils_charset_utf8_from_ucs4(c, &in, &len);
+ if (perror != PARSERUTILS_OK) {
+ s[0] = 0xef;
+ s[1] = 0xbf;
+ s[2] = 0xbd;
+ return 3;
+ }
- if (c > 0x7FFFFFFF || s == NULL)
- assert(0);
- else if (c < 0x80) {
- *s = (char)c;
+ return len;
+#else
+ uint8_t *buf;
+ uint8_t l = 0;
+
+ assert(s != NULL);
+
+ if (c < 0x80) {
l = 1;
- }
- else if (c < 0x800) {
- *s = 0xC0 | ((c >> 6) & 0x1F);
- *(s+1) = 0x80 | (c & 0x3F);
+ } else if (c < 0x800) {
l = 2;
- }
- else if (c < 0x10000) {
- *s = 0xE0 | ((c >> 12) & 0xF);
- *(s+1) = 0x80 | ((c >> 6) & 0x3F);
- *(s+2) = 0x80 | (c & 0x3F);
+ } else if (c < 0x10000) {
l = 3;
- }
- else if (c < 0x200000) {
- *s = 0xF0 | ((c >> 18) & 0x7);
- *(s+1) = 0x80 | ((c >> 12) & 0x3F);
- *(s+2) = 0x80 | ((c >> 6) & 0x3F);
- *(s+3) = 0x80 | (c & 0x3F);
+ } else if (c < 0x200000) {
l = 4;
- }
- else if (c < 0x4000000) {
- *s = 0xF8 | ((c >> 24) & 0x3);
- *(s+1) = 0x80 | ((c >> 18) & 0x3F);
- *(s+2) = 0x80 | ((c >> 12) & 0x3F);
- *(s+3) = 0x80 | ((c >> 6) & 0x3F);
- *(s+4) = 0x80 | (c & 0x3F);
+ } else if (c < 0x4000000) {
l = 5;
- }
- else if (c <= 0x7FFFFFFF) {
- *s = 0xFC | ((c >> 30) & 0x1);
- *(s+1) = 0x80 | ((c >> 24) & 0x3F);
- *(s+2) = 0x80 | ((c >> 18) & 0x3F);
- *(s+3) = 0x80 | ((c >> 12) & 0x3F);
- *(s+4) = 0x80 | ((c >> 6) & 0x3F);
- *(s+5) = 0x80 | (c & 0x3F);
+ } else if (c <= 0x7FFFFFFF) {
l = 6;
+ } else {
+ assert(0);
+ }
+
+ buf = (uint8_t *) s;
+
+ if (l == 1) {
+ buf[0] = (uint8_t) c;
+ } else {
+ uint8_t i;
+
+ for (i = l; i > 1; i--) {
+ buf[i - 1] = 0x80 | (c & 0x3F);
+ c >>= 6;
+ }
+ buf[0] = ~((1 << (8 - l)) - 1) | c;
}
return l;
+#endif
}
/**
@@ -149,30 +221,83 @@ size_t utf8_from_ucs4(size_t c, char *s)
*/
size_t utf8_length(const char *s)
{
- const char *__s = s;
- int l = 0;
-
- assert(__s != NULL);
-
- while (*__s != '\0') {
- if ((*__s & 0x80) == 0x00)
- __s += 1;
- else if ((*__s & 0xE0) == 0xC0)
- __s += 2;
- else if ((*__s & 0xF0) == 0xE0)
- __s += 3;
- else if ((*__s & 0xF8) == 0xF0)
- __s += 4;
- else if ((*__s & 0xFC) == 0xF8)
- __s += 5;
- else if ((*__s & 0xFE) == 0xFC)
- __s += 6;
- else
+ return utf8_bounded_length(s, strlen(s));
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s The string
+ * \param l Maximum length of input (in bytes)
+ * \return Length of string, in characters
+ */
+size_t utf8_bounded_length(const char *s, size_t l)
+{
+#ifdef WITH_HUBBUB
+ size_t len;
+ parserutils_error perror;
+
+ perror = parserutils_charset_utf8_length((const uint8_t *) s, l, &len);
+ if (perror != PARSERUTILS_OK)
+ return 0;
+
+ return len;
+#else
+ const uint8_t *p = (const uint8_t *) s;
+ const uint8_t *end = p + l;
+ size_t len = 0;
+
+ assert(s != NULL);
+
+ while (p < end) {
+ uint32_t c = p[0];
+
+ if ((c & 0x80) == 0x00)
+ p += 1;
+ else if ((c & 0xE0) == 0xC0)
+ p += 2;
+ else if ((c & 0xF0) == 0xE0)
+ p += 3;
+ else if ((c & 0xF8) == 0xF0)
+ p += 4;
+ else if ((c & 0xFC) == 0xF8)
+ p += 5;
+ else if ((c & 0xFE) == 0xFC)
+ p += 6;
+ else {
assert(0);
- l++;
+ }
+
+ len++;
}
- return l;
+ return len;
+#endif
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * \param s Pointer to start of character
+ * \return Length of character, in bytes
+ */
+size_t utf8_char_byte_length(const char *s)
+{
+#ifdef WITH_HUBBUB
+ size_t len;
+ parserutils_error perror;
+
+ perror = parserutils_charset_utf8_char_byte_length((const uint8_t *) s,
+ &len);
+ assert(perror == PARSERUTILS_OK);
+
+ return len;
+#else
+ const uint8_t *p = (const uint8_t *) s;
+ assert(s != NULL);
+
+ return numContinuations[p[0]] + 1 /* Start byte */;
+#endif
}
/**
@@ -184,12 +309,24 @@ size_t utf8_length(const char *s)
*/
size_t utf8_prev(const char *s, size_t o)
{
+#ifdef WITH_HUBBUB
+ uint32_t prev;
+ parserutils_error perror;
+
+ perror = parserutils_charset_utf8_prev((const uint8_t *) s, o, &prev);
+ assert(perror == PARSERUTILS_OK);
+
+ return prev;
+#else
+ const uint8_t *p = (const uint8_t *) s;
+
assert(s != NULL);
- while (o != 0 && (s[--o] & 0xC0) == 0x80)
+ while (o != 0 && (p[--o] & 0xC0) == 0x80)
/* do nothing */;
return o;
+#endif
}
/**
@@ -202,12 +339,29 @@ size_t utf8_prev(const char *s, size_t o)
*/
size_t utf8_next(const char *s, size_t l, size_t o)
{
- assert(s != NULL);
+#ifdef WITH_HUBBUB
+ uint32_t next;
+ parserutils_error perror;
- while (o != l && (s[++o] & 0xC0) == 0x80)
- /* do nothing */;
+ perror = parserutils_charset_utf8_next((const uint8_t *) s, l, o,
+ &next);
+ assert(perror == PARSERUTILS_OK);
+
+ return next;
+#else
+ const uint8_t *p = (const uint8_t *) s;
+
+ assert(s != NULL && o < l);
+
+ /* Skip current start byte (if present - may be mid-sequence) */
+ if (p[o] < 0x80 || (p[o] & 0xC0) == 0xC0)
+ o++;
+
+ while (o < l && (p[o] & 0xC0) == 0x80)
+ o++;
return o;
+#endif
}
/* Cache of previous iconv conversion descriptor used by utf8_convert */
diff --git a/utils/utf8.h b/utils/utf8.h
index e33b07d0d..9d8ec74fa 100644
--- a/utils/utf8.h
+++ b/utils/utf8.h
@@ -23,16 +23,21 @@
#ifndef _NETSURF_UTILS_UTF8_H_
#define _NETSURF_UTILS_UTF8_H_
+#include <stdint.h>
+
typedef enum {
UTF8_CONVERT_OK,
UTF8_CONVERT_NOMEM,
UTF8_CONVERT_BADENC
} utf8_convert_ret;
-size_t utf8_to_ucs4(const char *s, size_t l);
-size_t utf8_from_ucs4(size_t c, char *s);
+uint32_t utf8_to_ucs4(const char *s, size_t l);
+size_t utf8_from_ucs4(uint32_t c, char *s);
size_t utf8_length(const char *s);
+size_t utf8_bounded_length(const char *s, size_t l);
+
+size_t utf8_char_byte_length(const char *s);
size_t utf8_prev(const char *s, size_t o);
size_t utf8_next(const char *s, size_t l, size_t o);