From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 23 Jun 2007 22:40:25 +0000 Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359 --- src/charset/codec_iconv.c | 837 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 837 insertions(+) create mode 100644 src/charset/codec_iconv.c (limited to 'src/charset/codec_iconv.c') diff --git a/src/charset/codec_iconv.c b/src/charset/codec_iconv.c new file mode 100644 index 0000000..097e82a --- /dev/null +++ b/src/charset/codec_iconv.c @@ -0,0 +1,837 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/* This codec is hideously slow. Only use it as a last resort */ + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include <iconv.h> + +/* These two are for htonl / ntohl */ +#include <arpa/inet.h> +#include <netinet/in.h> + +#include "charset/aliases.h" +#include "utils/utils.h" + +#include "codec_impl.h" + +/** + * A note on endianness: + * + * UCS4 is big-endian by default. Therefore, this codec reads and writes + * big-endian values. This is fine, and causes no problems. However, to + * make life easier for client-supplied filter code, character values passed + * to a filter and those read back from a filter are in host-endian. + * Therefore, we need to convert from big-endian to host-endian when passing + * characters to a filter and perform the reverse translation when reading + * characters back. 
+ */ + +/** + * Iconv-based charset codec + */ +typedef struct hubbub_iconv_codec { + hubbub_charsetcodec base; /**< Base class */ + + iconv_t read_cd; /**< Iconv handle for reading */ +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /**< Number of bytes in inval_buf */ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + */ + size_t read_len; /**< Number of characters in + * read_buf */ + + iconv_t write_cd; /**< Iconv handle for writing */ +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + */ + size_t write_len; /**< Number of characters in + * write_buf */ +} hubbub_iconv_codec; + + +static bool hubbub_iconv_codec_handles_charset(const char *charset); +static hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset, + hubbub_alloc alloc, void *pw); +static void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec); +static hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec); +static hubbub_error hubbub_iconv_codec_filter_decoded_char( + hubbub_iconv_codec *c, uint32_t ucs4, uint8_t **dest, + size_t *destlen); +static bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c); +static hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this codec handles a 
specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool hubbub_iconv_codec_handles_charset(const char *charset) +{ + iconv_t cd; + bool ret; + + cd = iconv_open("UCS-4", charset); + + ret = (cd != (iconv_t) -1); + + if (ret) + iconv_close(cd); + + return ret; +} + +/** + * Create an iconv-based codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset, + hubbub_alloc alloc, void *pw) +{ + hubbub_iconv_codec *codec; + + codec = alloc(NULL, sizeof(hubbub_iconv_codec), pw); + if (codec == NULL) + return NULL; + + codec->read_cd = iconv_open("UCS-4", charset); + if (codec->read_cd == (iconv_t) -1) { + alloc(codec, 0, pw); + return NULL; + } + + codec->write_cd = iconv_open(charset, "UCS-4"); + if (codec->write_cd == (iconv_t) -1) { + iconv_close(codec->read_cd); + alloc(codec, 0, pw); + return NULL; + } + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = hubbub_iconv_codec_destroy; + codec->base.handler.encode = hubbub_iconv_codec_encode; + codec->base.handler.decode = hubbub_iconv_codec_decode; + codec->base.handler.reset = hubbub_iconv_codec_reset; + + return (hubbub_charsetcodec *) codec; +} + +/** + * Destroy an iconv-based codec + * + * \param codec The codec to destroy + */ +void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + + iconv_close(c->read_cd); + iconv_close(c->write_cd); + + return; +} + +/** + * Encode a chunk of UCS4 data into an iconv-based codec's charset + * + * \param codec The codec to use + * \param source 
Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * <any_other_error> as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. This buffered data is post-filtering, + * so will not be refiltered on the next call. + * + * In the case of the filter function failing, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the encoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). + * It is, however, possible to determine which source character caused it + * (this being the character immediately before the location pointed to by + * ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */ +hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + uint32_t ucs4; + const uint32_t *towrite; + size_t towritelen; + hubbub_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + + while (c->write_len > 0) { + error = hubbub_iconv_codec_write_char(c, pwrite[0], + dest, destlen); + if (error != HUBBUB_OK) { + /* Copy outstanding chars down, skipping + * invalid one, if present, so as to avoid + * reprocessing the invalid character */ + if (error == HUBBUB_INVALID) { + for (ucs4 = 1; ucs4 < c->write_len; + ucs4++) { + c->write_buf[ucs4] = + pwrite[ucs4]; + } + } + + return error; + } + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + towrite = (const uint32_t *) (const void *) *source; + towritelen = 1; + ucs4 = *towrite; + + /* Run character we're about to output through the + * registered filter, so it can replace it, if it sees + * fit to do so */ + if (c->base.filter != NULL) { + uint32_t *replacement; + + error = c->base.filter(ntohl(ucs4), + &replacement, &towritelen, + c->base.filter_pw); + if (error != HUBBUB_OK) { + /* Don't eat character -- filter failed, + * so nothing gets written or buffered. + * It's up to the client to ensure that + * the filter works in the case where it + * reprocesses this character after the + * fault is fixed up. 
*/ + + return error; + } + + /* Convert filter output to big endian UCS4 */ + for (ucs4 = 0; ucs4 < towritelen; ucs4++) { + replacement[ucs4] = htonl(replacement[ucs4]); + } + + towrite = (const uint32_t *) replacement; + } + + /* Output current character(s) */ + while (towritelen > 0) { + error = hubbub_iconv_codec_write_char(c, towrite[0], + dest, destlen); + + if (error != HUBBUB_OK) { + ucs4 = (error == HUBBUB_INVALID) ? 1 : 0; + + if (towritelen - ucs4 >= WRITE_BUFSIZE) + abort(); + + c->write_len = towritelen - ucs4; + + /* Copy pending chars to save area, for + * processing next call; skipping invalid + * character, if present, so it's not + * reprocessed. */ + for (; ucs4 < towritelen; ucs4++) { + c->write_buf[ucs4] = towrite[ucs4]; + } + + /* Claim character we've just buffered, + * so it's not repreocessed */ + *source += 4; + *sourcelen -= 4; + + return error; + } + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return HUBBUB_OK; +} + +/** + * Decode a chunk of data in an iconv-based codec's charset into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. 
+ * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). + * It is, however, possible to determine which source character caused it + * (this being the character immediately at or before the location pointed + * to by ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. 
+ */ +hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + hubbub_error error; + + if (c->read_len > 0) { + /* Output left over from last decode + * Attempt to finish this here */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = pread[0]; + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Run out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) { + c->read_buf[i] = pread[i]; + } + + return HUBBUB_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. */ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = hubbub_iconv_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + return error; + } + + + /* And now, fix everything up so the normal processing + * does the right thing. */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. No recovery strategy + * possible, so explode everywhere. 
*/ + if ((orig_l + ol) - l == 0) + abort(); + + /* Handle memry exhaustion case from above */ + if (error != HUBBUB_OK) + return error; + } + + while (*sourcelen > 0) { + error = hubbub_iconv_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != HUBBUB_OK) { + return error; + } + } + + return HUBBUB_OK; +} + +/** + * Clear an iconv-based codec's encoding state + * + * \param codec The codec to reset + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + + iconv(c->read_cd, NULL, NULL, NULL, NULL); + iconv(c->write_cd, NULL, NULL, NULL, NULL); + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return HUBBUB_OK; +} + +/** + * Feed a UCS4 character through the registered filter and output the result + * + * \param c Codec to use + * \param ucs4 UCS4 character (big endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * as a result of the failure of the + * client-provided filter function. 
+ */ +hubbub_error hubbub_iconv_codec_filter_decoded_char(hubbub_iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (c->base.filter != NULL) { + uint32_t *rep; + size_t replen; + hubbub_error error; + + error = c->base.filter(ntohl(ucs4), &rep, &replen, + c->base.filter_pw); + if (error != HUBBUB_OK) { + return error; + } + + while (replen > 0 && *destlen >= replen * 4) { + *((uint32_t *) (void *) *dest) = htonl(*rep); + + *dest += 4; + *destlen -= 4; + + rep++; + replen--; + } + + if (*destlen < replen * 4) { + /* Run out of output buffer */ + size_t i; + + /* Buffer remaining output */ + c->read_len = replen; + + for (i = 0; i < replen; i++) { + c->read_buf[i] = htonl(rep[i]); + } + + return HUBBUB_NOMEM; + } + + } else { + if (*destlen < 4) { + /* Run out of output buffer */ + + c->read_len = 1; + c->read_buf[0] = ucs4; + + return HUBBUB_NOMEM; + } + + *((uint32_t *) (void *) *dest) = ucs4; + *dest += 4; + *destlen -= 4; + } + + return HUBBUB_OK; +} + +/** + * Detect if a codec's charset is Unicode capable + * + * \param c Codec to consider + * \return true if a Unicode variant, false otherwise + */ +bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c) +{ + static uint16_t ucs4; + static uint16_t ucs2; + static uint16_t utf8; + static uint16_t utf16; + static uint16_t utf16be; + static uint16_t utf16le; + static uint16_t utf32; + static uint16_t utf32be; + static uint16_t utf32le; + + if (ucs4 == 0) { + ucs4 = hubbub_mibenum_from_name("UCS-4", SLEN("UCS-4")); + ucs2 = hubbub_mibenum_from_name("UCS-2", SLEN("UCS-2")); + utf8 = hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); + utf16 = hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16")); + utf16be = hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + utf16le = hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + utf32 = hubbub_mibenum_from_name("UTF-32", SLEN("UTF-32")); + utf32be = hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + utf32le = 
hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } + + return (c->base.mibenum == ucs4 || + c->base.mibenum == ucs2 || + c->base.mibenum == utf8 || + c->base.mibenum == utf16 || + c->base.mibenum == utf16be || + c->base.mibenum == utf16le || + c->base.mibenum == utf32 || + c->base.mibenum == utf32be || + c->base.mibenum == utf32le); +} + +/** + * Read a character from the codec's native charset to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. + * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */ +hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + const uint8_t *origsrc = *source; + size_t origsrclen = *sourcelen; + uint32_t ucs4; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + hubbub_error error; + + /* Use iconv to convert a single character + * Side effect: Updates *source to point at next input + * character and *sourcelen to reflect reduced input length + */ + iconv_ret = iconv(c->read_cd, (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + + if (iconv_ret != (size_t) -1 || + (*source != origsrc && sucs4 == 0)) { + /* Read a character */ + error = hubbub_iconv_codec_filter_decoded_char(c, + ucs4, dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + /* filter function failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (errno == E2BIG) { + /* Should never happen */ + abort(); + } else if (errno == EINVAL) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (const char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return HUBBUB_OK; + } else if (errno == EILSEQ) { + /* Illegal input sequence */ + bool found = false; + const uint8_t *oldsrc; + size_t oldsrclen; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) { + /* restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + + return HUBBUB_INVALID; + } + + /* Ok, this becomes problematic. 
The iconv API here + * is particularly unhelpful; *source will point at + * the _start_ of the illegal sequence. This means + * that we must find the end of the sequence */ + + /* Search for the start of the next valid input + * sequence (or the end of the input stream) */ + while (*sourcelen > 1) { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + (*source)++; + (*sourcelen)--; + + oldsrc = *source; + oldsrclen = *sourcelen; + + iconv_ret = iconv(c->read_cd, + (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + if (iconv_ret != (size_t) -1 || errno != EILSEQ) { + found = true; + break; + } + } + + if (found) { + /* Found start of next valid sequence */ + *source = oldsrc; + *sourcelen = oldsrclen; + } else { + /* Not found - skip last byte in buffer */ + (*source)++; + (*sourcelen)--; + + if (*sourcelen != 0) + abort(); + } + + /* output U+FFFD and continue processing. */ + error = hubbub_iconv_codec_filter_decoded_char(c, + htonl(0xFFFD), dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + /* filter function failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + return error; + } + + return HUBBUB_OK; +} + +/** + * Write a UCS4 character in a codec's native charset + * + * \param c The codec + * \param ucs4 The UCS4 character to write (big endian) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if character cannot be represented and the + * codec's error handling mode is set to STRICT. 
+ */ +hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + uint8_t *origdest = *dest; + + iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4, + &sucs4, (char **) dest, destlen); + + if (iconv_ret == (size_t) -1 && errno == E2BIG) { + /* Output buffer is too small */ + return HUBBUB_NOMEM; + } else if (iconv_ret == (size_t) -1 && errno == EILSEQ) { + /* Illegal multibyte sequence */ + /* This should never happen */ + abort(); + } else if (iconv_ret == (size_t) -1 && errno == EINVAL) { + /* Incomplete input character */ + /* This should never happen */ + abort(); + } else if (*dest == origdest) { + /* Nothing was output */ + switch (c->base.errormode) { + case HUBBUB_CHARSETCODEC_ERROR_STRICT: + return HUBBUB_INVALID; + + case HUBBUB_CHARSETCODEC_ERROR_TRANSLIT: + /** \todo transliteration */ + case HUBBUB_CHARSETCODEC_ERROR_LOOSE: + { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + ucs4 = hubbub_iconv_codec_is_unicode(c) + ? htonl(0xFFFD) : htonl(0x3F); + + iconv_ret = iconv(c->write_cd, + (char **) (void *) &pucs4, &sucs4, + (char **) dest, destlen); + + if (iconv_ret == (size_t) -1 && errno == E2BIG) { + return HUBBUB_NOMEM; + } else if (iconv_ret == (size_t) -1 && + errno == EILSEQ) { + /* Illegal multibyte sequence */ + /* This should never happen */ + abort(); + } else if (iconv_ret == (size_t) -1 && + errno == EINVAL) { + /* Incomplete input character */ + /* This should never happen */ + abort(); + } + } + break; + } + } + + return HUBBUB_OK; +} + +const hubbub_charsethandler hubbub_iconv_codec_handler = { + hubbub_iconv_codec_handles_charset, + hubbub_iconv_codec_create +}; -- cgit v1.2.3