summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- src/charset/detect.c 15
-rw-r--r-- src/parser.c 20
2 files changed, 33 insertions, 2 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c
index 562c12d..fd3de13 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -48,10 +48,23 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
if (data == NULL || mibenum == NULL || source == NULL)
return PARSERUTILS_BADPARM;
+ /**
+ * Meaning of *source on entry:
+ *
+ * CONFIDENT - Do not pass Go, do not attempt auto-detection.
+ * TENTATIVE - We've tried to autodetect already, but subsequently
+ * discovered that we don't actually support the detected
+ * charset. Thus, we've defaulted to Windows-1252. Don't
+ * perform auto-detection again, as it would be futile.
+ * (This bit diverges from the spec)
+ * UNKNOWN - No autodetection performed yet. Get on with it.
+ */
+
/* 1. */
/* If the source is dictated, there's nothing for us to do */
- if (*source == HUBBUB_CHARSET_CONFIDENT) {
+ if (*source == HUBBUB_CHARSET_CONFIDENT ||
+ *source == HUBBUB_CHARSET_TENTATIVE) {
return PARSERUTILS_OK;
}
diff --git a/src/parser.c b/src/parser.c
index 575eb73..1ebbaab 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -217,9 +217,27 @@ hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
perror = parserutils_inputstream_append(parser->stream, data, len);
if (perror != PARSERUTILS_OK)
- return !HUBBUB_OK;
+ return hubbub_error_from_parserutils_error(perror);
error = hubbub_tokeniser_run(parser->tok);
+ if (error == HUBBUB_BADENCODING) {
+ /* We autodetected an encoding that we don't actually
+ * support. No data has been processed at this point, so
+ * fall back to Windows-1252 and hope for the best.
+ */
+ perror = parserutils_inputstream_change_charset(parser->stream,
+ "Windows-1252", HUBBUB_CHARSET_TENTATIVE);
+ /* Under no circumstances should we get here if we've managed
+ * to process data. If there is a way, I want to know about it
+ */
+ assert(perror != PARSERUTILS_INVALID);
+ if (perror != PARSERUTILS_OK)
+ return hubbub_error_from_parserutils_error(perror);
+
+ /* Retry the tokenisation */
+ error = hubbub_tokeniser_run(parser->tok);
+ }
+
if (error != HUBBUB_OK)
return error;