diff options
author | Rupinder Singh Khokhar <rsk1coder99@gmail.com> | 2014-07-16 08:59:09 +0530 |
---|---|---|
committer | Rupinder Singh Khokhar <rsk1coder99@gmail.com> | 2014-08-01 21:44:33 +0530 |
commit | a0a0787a34e0ad510c58dccf17a67c951aac6c3a (patch) | |
tree | bbb072bfb1a326ff59e52c3bdd603ced88ad5d65 /src | |
parent | 9e698caff63d13a78923ea5eb574cc6bd4acf766 (diff) | |
download | libhubbub-a0a0787a34e0ad510c58dccf17a67c951aac6c3a.tar.gz libhubbub-a0a0787a34e0ad510c58dccf17a67c951aac6c3a.tar.bz2 |
Minor fixes to charset detection. Currently pre-scanning up to 1024 bytes. Removed larger cases in which the encoding declaration is beyond 512 bytes, for the time being. Also removed some outdated tests.
Diffstat (limited to 'src')
-rw-r--r-- | src/charset/detect.c | 36 |
1 file changed, 22 insertions, 14 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c index fd3de13..ebd6b32 100644 --- a/src/charset/detect.c +++ b/src/charset/detect.c @@ -215,13 +215,13 @@ uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len) if (data == NULL) return 0; - end = pos + min(512, len); + end = pos + min(1024, len); /* 1. */ while (pos < end) { /* a */ if (PEEK("<!--")) { - pos += SLEN("<!--"); + pos += SLEN("<!"); ADVANCE("-->"); /* b */ } else if (PEEK("<meta")) { @@ -303,6 +303,8 @@ uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, const uint8_t *value; uint32_t namelen, valuelen; uint16_t mibenum = 0; + bool got_pragma = false; + bool need_pragma = true; if (pos == NULL || *pos == NULL || end == NULL) return 0; @@ -327,12 +329,23 @@ uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, mibenum = parserutils_charset_mibenum_from_name( (const char *) value, valuelen); + need_pragma = false; /* 5 */ + } else if (namelen == SLEN("content") && valuelen > 0 && strncasecmp((const char *) name, "content", - SLEN("content")) == 0) { + SLEN("content")) == 0 && + mibenum == 0) { mibenum = hubbub_charset_parse_content(value, valuelen); + need_pragma = true; + } else if (namelen == SLEN("http-equiv") && valuelen == + SLEN("content-type") && strncasecmp((const char *) + value, "content-type", + SLEN("content-type")) == 0 && + strncasecmp((const char *) name, "http-equiv", + SLEN("http-equiv")) == 0) { + got_pragma = true; } /* 6 */ @@ -349,12 +362,14 @@ uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, } /* 7 */ - if (mibenum != 0) { + + } + if (mibenum != 0) { + if(got_pragma != false || need_pragma != true) { /* confidence = tentative; */ return mibenum; } } - return 0; } @@ -505,13 +520,6 @@ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, return false; } - /* 2. Invalid element open character */ - if (*pos == '<') { - pos--; - *data = pos; - return false; - } - /* 3. 
End of element */ if (*pos == '>') { *data = pos; @@ -537,7 +545,7 @@ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, } /* c */ - if (*pos == '/' || *pos == '<' || *pos == '>') { + if (*pos == '/' || *pos == '>') { *data = pos; return true; } @@ -631,7 +639,7 @@ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, while (pos < end) { /* 12. Extract unquoted value */ /* a */ - if (ISSPACE(*pos) || *pos == '<' || *pos == '>') { + if (ISSPACE(*pos) || *pos == '>') { *data = pos; return true; } |