diff options
author | John Tytgat <joty@netsurf-browser.org> | 2008-06-28 20:57:47 +0000 |
---|---|---|
committer | John Tytgat <joty@netsurf-browser.org> | 2008-06-28 20:57:47 +0000 |
commit | 5aa01bdb330f82e7bc3756ac18fd47d137059ce4 (patch) | |
tree | dfbb28d786259a30f5e8d5db6522f3278c8ba80b /src | |
parent | 5784a7659386a8681445d85837a70b45ed7d6968 (diff) | |
download | libparserutils-5aa01bdb330f82e7bc3756ac18fd47d137059ce4.tar.gz libparserutils-5aa01bdb330f82e7bc3756ac18fd47d137059ce4.tar.bz2 |
- parserutils_charset_utf16_to_ucs4(): fixed surrogate handling.
- cscodec-utf8.c(run_test): Added more asserts.
- Added UTF-16 tester (based on the UTF-8 one).
svn path=/trunk/libparserutils/; revision=4472
Diffstat (limited to 'src')
-rw-r--r-- | src/charset/encodings/utf16.c | 15 |
1 files changed, 10 insertions, 5 deletions
```diff
diff --git a/src/charset/encodings/utf16.c b/src/charset/encodings/utf16.c
index 59cb146..3611646 100644
--- a/src/charset/encodings/utf16.c
+++ b/src/charset/encodings/utf16.c
@@ -19,7 +19,7 @@
  * Convert a UTF-16 sequence into a single UCS-4 character
  *
  * \param s The sequence to process
- * \param len Length of sequence
+ * \param len Length of sequence in bytes
  * \param ucs4 Pointer to location to receive UCS-4 character (host endian)
  * \param clen Pointer to location to receive byte length of UTF-16 sequence
  * \return PARSERUTILS_OK on success, appropriate error otherwise
@@ -38,17 +38,22 @@ parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
 	if (*ss < 0xD800 || *ss > 0xDFFF) {
 		*ucs4 = *ss;
 		*clen = 2;
-	} else if (0xD800 <= *ss && *ss <= 0xBFFF) {
+	} else if (0xD800 <= *ss && *ss <= 0xDBFF) {
+		/* High-surrogate code unit. */
 		if (len < 4)
 			return PARSERUTILS_NEEDDATA;
 
-		if (0xDC00 <= ss[1] && ss[1] <= 0xE000) {
-			*ucs4 = (((s[0] >> 6) & 0x1f) + 1) |
-					((s[0] & 0x3f) | (s[1] & 0x3ff));
+		if (0xDC00 <= ss[1] && ss[1] <= 0xDFFF) {
+			/* We have a valid surrogate pair. */
+			*ucs4 = (((ss[0] & 0x3FF) << 10) | (ss[1] & 0x3FF))
+				+ (1<<16);
 			*clen = 4;
 		} else {
 			return PARSERUTILS_INVALID;
 		}
+	} else {
+		/* Low-surrogate code unit. */
+		return PARSERUTILS_INVALID;
 	}
 
 	return PARSERUTILS_OK;
```