diff options
-rw-r--r-- | src/charset/detect.c | 15 | ||||
-rw-r--r-- | src/parser.c | 20 |
2 files changed, 33 insertions, 2 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c
index 562c12d..fd3de13 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -48,10 +48,23 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
 	if (data == NULL || mibenum == NULL || source == NULL)
 		return PARSERUTILS_BADPARM;
 
+	/**
+	 * Meaning of *source on entry:
+	 *
+	 * CONFIDENT - Do not pass Go, do not attempt auto-detection.
+	 * TENTATIVE - We've tried to autodetect already, but subsequently
+	 *             discovered that we don't actually support the detected
+	 *             charset. Thus, we've defaulted to Windows-1252. Don't
+	 *             perform auto-detection again, as it would be futile.
+	 *             (This bit diverges from the spec)
+	 * UNKNOWN   - No autodetection performed yet. Get on with it.
+	 */
+
 	/* 1. */
 
 	/* If the source is dictated, there's nothing for us to do */
-	if (*source == HUBBUB_CHARSET_CONFIDENT) {
+	if (*source == HUBBUB_CHARSET_CONFIDENT ||
+			*source == HUBBUB_CHARSET_TENTATIVE) {
 		return PARSERUTILS_OK;
 	}
 
diff --git a/src/parser.c b/src/parser.c
index 575eb73..1ebbaab 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -217,9 +217,27 @@ hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
 
 	perror = parserutils_inputstream_append(parser->stream, data, len);
 	if (perror != PARSERUTILS_OK)
-		return !HUBBUB_OK;
+		return hubbub_error_from_parserutils_error(perror);
 
 	error = hubbub_tokeniser_run(parser->tok);
+	if (error == HUBBUB_BADENCODING) {
+		/* Ok, we autodetected an encoding that we don't actually
+		 * support. We've not actually processed any data at this
+		 * point so fall back to Windows-1252 and hope for the best
+		 */
+		perror = parserutils_inputstream_change_charset(parser->stream,
+				"Windows-1252", HUBBUB_CHARSET_TENTATIVE);
+		/* Under no circumstances should we get here if we've managed
+		 * to process data. If there is a way, I want to know about it
+		 */
+		assert(perror != PARSERUTILS_INVALID);
+		if (perror != PARSERUTILS_OK)
+			return hubbub_error_from_parserutils_error(perror);
+
+		/* Retry the tokenisation */
+		error = hubbub_tokeniser_run(parser->tok);
+	}
+
 	if (error != HUBBUB_OK)
 		return error;
 