diff options
author | Daniel Silverstone <dsilvers@digital-scurf.org> | 2017-09-09 09:59:58 +0100 |
---|---|---|
committer | Daniel Silverstone <dsilvers@digital-scurf.org> | 2017-09-09 09:59:58 +0100 |
commit | 0eb6188c3a931063f78b017c621b79709746706e (patch) | |
tree | 9947fbb140fbc0e7013b2b8246dcc66a21f9202c | |
parent | 73071c0dea1e4bcfd094810d051aebc74e6c648c (diff) | |
download | libhubbub-0eb6188c3a931063f78b017c621b79709746706e.tar.gz libhubbub-0eb6188c3a931063f78b017c621b79709746706e.tar.bz2 |
Support falling back to space separated charset
In some cases, programs (for example, Apple Mail) generate HTML
with appallingly bad meta tags such as:
<meta content="text/html charset=utf-8">
This is bad because *a* there is no http-equiv="Content-Type" and *b*
the content type and charset do not have a separating semicolon.
Sadly, Chrome et al. support this, so Hubbub needs to as well. This
change adjusts the content="" parser to retry if it cannot find
a semicolon, and work forwards to the first whitespace instead.
Fixes: #2549
-rw-r--r-- | src/charset/detect.c | 19 |
1 files changed, 17 insertions, 2 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c index 93cbe63..d2d6816 100644 --- a/src/charset/detect.c +++ b/src/charset/detect.c @@ -369,6 +369,7 @@ uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, uint16_t hubbub_charset_parse_content(const uint8_t *value, uint32_t valuelen) { + const uint8_t *restart = value; const uint8_t *end; const uint8_t *tentative = NULL; uint32_t tentative_len = 0; @@ -388,8 +389,22 @@ uint16_t hubbub_charset_parse_content(const uint8_t *value, value++; } - if (value >= end) - return 0; + if (value >= end) { + /* Fallback, no semicolon, try for first whitespace */ + value = restart; + while (value < end) { + /* This condition is odd, because ISSPACE() includes + * forward slash, which we need to skip so that content + * types work properly. + */ + if (ISSPACE(*value) && (*value != '/')) { + value++; + break; + } + + value++; + } + } /* 2 */ while (value < end && ISSPACE(*value)) { |