summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: John Mark Bell <jmb@netsurf-browser.org> 2009-01-06 17:16:09 +0000
committer: John Mark Bell <jmb@netsurf-browser.org> 2009-01-06 17:16:09 +0000
commit: 9b5ab3bdedb0431168d676775df12780cbd17e51 (patch)
tree: 39fc94b73ab07781816c42b746ce5bdf47bb33d7
parent: c00d6e3b878e120e6e2b1b131127262b3c0d4eeb (diff)
download: libhubbub-9b5ab3bdedb0431168d676775df12780cbd17e51.tar.gz
libhubbub-9b5ab3bdedb0431168d676775df12780cbd17e51.tar.bz2
Some kind of recovery from not supporting an auto-detected charset. We fall back to Windows-1252 and hope for the best.
Note that this fallback only occurs when autodetecting. If the client has specified a charset that we do not support, they get told about it immediately and get to decide what to do about it. If a meta charset is encountered after a successful autodetection, the client is informed in the usual way. That requires the client to throw out the parser and start afresh, specifying the meta charset as the one to use, so there is no problem in that case either: the client will, again, be informed immediately. All of this ensures that charset autodetection is pretty well transparent as far as the client application is concerned.
svn path=/trunk/hubbub/; revision=5970
-rw-r--r-- src/charset/detect.c 15
-rw-r--r-- src/parser.c 20
2 files changed, 33 insertions, 2 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c
index 562c12d..fd3de13 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -48,10 +48,23 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
if (data == NULL || mibenum == NULL || source == NULL)
return PARSERUTILS_BADPARM;
+ /**
+ * Meaning of *source on entry:
+ *
+ * CONFIDENT - Do not pass Go, do not attempt auto-detection.
+ * TENTATIVE - We've tried to autodetect already, but subsequently
+ * discovered that we don't actually support the detected
+ * charset. Thus, we've defaulted to Windows-1252. Don't
+ * perform auto-detection again, as it would be futile.
+ * (This bit diverges from the spec)
+ * UNKNOWN - No autodetection performed yet. Get on with it.
+ */
+
/* 1. */
/* If the source is dictated, there's nothing for us to do */
- if (*source == HUBBUB_CHARSET_CONFIDENT) {
+ if (*source == HUBBUB_CHARSET_CONFIDENT ||
+ *source == HUBBUB_CHARSET_TENTATIVE) {
return PARSERUTILS_OK;
}
diff --git a/src/parser.c b/src/parser.c
index 575eb73..1ebbaab 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -217,9 +217,27 @@ hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
perror = parserutils_inputstream_append(parser->stream, data, len);
if (perror != PARSERUTILS_OK)
- return !HUBBUB_OK;
+ return hubbub_error_from_parserutils_error(perror);
error = hubbub_tokeniser_run(parser->tok);
+ if (error == HUBBUB_BADENCODING) {
+ /* Ok, we autodetected an encoding that we don't actually
+ * support. We've not actually processed any data at this
+ * point so fall back to Windows-1252 and hope for the best
+ */
+ perror = parserutils_inputstream_change_charset(parser->stream,
+ "Windows-1252", HUBBUB_CHARSET_TENTATIVE);
+ /* Under no circumstances should we get here if we've managed
+ * to process data. If there is a way, I want to know about it
+ */
+ assert(perror != PARSERUTILS_INVALID);
+ if (perror != PARSERUTILS_OK)
+ return hubbub_error_from_parserutils_error(perror);
+
+ /* Retry the tokenisation */
+ error = hubbub_tokeniser_run(parser->tok);
+ }
+
if (error != HUBBUB_OK)
return error;