From 0eb6188c3a931063f78b017c621b79709746706e Mon Sep 17 00:00:00 2001 From: Daniel Silverstone Date: Sat, 9 Sep 2017 09:59:58 +0100 Subject: Support falling back to space separated charset In some cases, for example with Apple Mail, programs generate HTML with appallingly bad meta tags such as (illustrative example): <meta content="text/html charset=utf-8"> This is bad because (a) there is no http-equiv="Content-Type" and (b) the content type and charset do not have a separating semicolon. Sadly, Chrome et al. support this, so we need to support it in Hubbub too. This change adjusts the content="" parser to retry if it cannot find a semicolon, working forwards to the first whitespace instead. Fixes: #2549 --- src/charset/detect.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'src') diff --git a/src/charset/detect.c b/src/charset/detect.c index 93cbe63..d2d6816 100644 --- a/src/charset/detect.c +++ b/src/charset/detect.c @@ -369,6 +369,7 @@ uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, uint16_t hubbub_charset_parse_content(const uint8_t *value, uint32_t valuelen) { + const uint8_t *restart = value; const uint8_t *end; const uint8_t *tentative = NULL; uint32_t tentative_len = 0; @@ -388,8 +389,22 @@ uint16_t hubbub_charset_parse_content(const uint8_t *value, value++; } - if (value >= end) - return 0; + if (value >= end) { + /* Fallback, no semicolon, try for first whitespace */ + value = restart; + while (value < end) { + /* This condition is odd, because ISSPACE() includes + * forward slash, which we need to skip so that content + * types work properly. + */ + if (ISSPACE(*value) && (*value != '/')) { + value++; + break; + } + + value++; + } + } /* 2 */ while (value < end && ISSPACE(*value)) { -- cgit v1.2.3