/* * This file is part of Hubbub. * Licensed under the MIT License, * http://www.opensource.org/licenses/mit-license.php * Copyright 2007 John-Mark Bell */ #include #include #include "charset/aliases.h" #include "utils/utils.h" #include "detect.h" static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len); static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len); static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, const uint8_t *end); static uint16_t hubbub_charset_parse_content(const uint8_t *value, uint32_t valuelen); static bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, const uint8_t **name, uint32_t *namelen, const uint8_t **value, uint32_t *valuelen); /** * Extract a charset from a chunk of data * * \param data Pointer to pointer to buffer containing data * \param len Pointer to buffer length * \param mibenum Pointer to location to store MIB enum representing charset * \param source Pointer to location to receive charset source * \return HUBBUB_OK on success, appropriate error otherwise * * The data pointer and length will be modified by this function if * a byte order mark is encountered at the start of the buffer. The updated * data pointer will point to the first byte in the buffer after the BOM. * The length will be modified appropriately. * * The larger a chunk of data fed to this routine, the better, as it allows * charset autodetection access to a larger dataset for analysis. */ hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len, uint16_t *mibenum, hubbub_charset_source *source) { uint16_t charset = 0; if (data == NULL || *data == NULL || len == NULL || mibenum == NULL || source == NULL) return HUBBUB_BADPARM; /* We need at least 4 bytes of data */ if (*len < 4) goto default_encoding; /* First, look for a BOM */ charset = hubbub_charset_read_bom(data, len); if (charset != 0) { *mibenum = charset; *source = HUBBUB_CHARSET_DOCUMENT; return HUBBUB_OK; } /* No BOM was found, so we must look for a meta charset within * the document itself. */ charset = hubbub_charset_scan_meta(*data, *len); if (charset != 0) { /* ISO-8859-1 becomes Windows-1252 */ if (charset == hubbub_mibenum_from_name("ISO-8859-1", SLEN("ISO-8859-1"))) { charset = hubbub_mibenum_from_name("Windows-1252", SLEN("Windows-1252")); /* Fallback to 8859-1 if that failed */ if (charset == 0) charset = hubbub_mibenum_from_name( "ISO-8859-1", SLEN("ISO-8859-1")); } /* If we've encountered a meta charset for a non-ASCII- * compatible encoding, don't trust it. * * Firstly, it should have been sent with a BOM (and thus * detected above). * * Secondly, we've just used an ASCII-only parser to * extract the encoding from the document. Therefore, * the document plainly isn't what the meta charset * claims it is. * * What we do in this case is to ignore the meta charset's * claims and leave the charset determination to the * autodetection routines (or the fallback case if they * fail). */ if (charset != hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16")) && charset != hubbub_mibenum_from_name("UTF-16LE", SLEN("UTF-16LE")) && charset != hubbub_mibenum_from_name("UTF-16BE", SLEN("UTF-16BE")) && charset != hubbub_mibenum_from_name("UTF-32", SLEN("UTF-32")) && charset != hubbub_mibenum_from_name("UTF-32LE", SLEN("UTF-32LE")) && charset != hubbub_mibenum_from_name("UTF-32BE", SLEN("UTF-32BE"))) { *mibenum = charset; *source = HUBBUB_CHARSET_DOCUMENT; return HUBBUB_OK; } } /* No charset was specified within the document, attempt to * autodetect the encoding from the data that we have available. */ /** \todo Charset autodetection */ /* We failed to autodetect a charset, so use the default fallback */ default_encoding: charset = hubbub_mibenum_from_name("Windows-1252", SLEN("Windows-1252")); if (charset == 0) charset = hubbub_mibenum_from_name("ISO-8859-1", SLEN("ISO-8859-1")); *mibenum = charset; *source = HUBBUB_CHARSET_DEFAULT; return HUBBUB_OK; } /** * Inspect the beginning of a buffer of data for the presence of a * UTF Byte Order Mark. * * \param data Pointer to pointer to buffer containing data * \param len Pointer to buffer length * \return MIB enum representing encoding described by BOM, or 0 if not found * * If a BOM is found, the data pointer will be modified to point to the first * byte in the buffer after the BOM. The length will also be modified * appropriately. */ uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len) { if (data == NULL || *data == NULL || len == NULL) return 0; /* We require at least 4 bytes of data */ if (*len < 4) return 0; #define UTF32BOM_LEN (4) #define UTF16BOM_LEN (2) #define UTF8BOM_LEN (3) if ((*data)[0] == 0x00 && (*data)[1] == 0x00 && (*data)[2] == 0xFE && (*data)[3] == 0xFF) { *data += UTF32BOM_LEN; *len -= UTF32BOM_LEN; return hubbub_mibenum_from_name("UTF-32BE", SLEN("UTF-32BE")); } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE && (*data)[2] == 0x00 && (*data)[3] == 0x00) { *data += UTF32BOM_LEN; *len -= UTF32BOM_LEN; return hubbub_mibenum_from_name("UTF-32LE", SLEN("UTF-32LE")); } else if ((*data)[0] == 0xFE && (*data)[1] == 0xFF) { *data += UTF16BOM_LEN; *len -= UTF16BOM_LEN; return hubbub_mibenum_from_name("UTF-16BE", SLEN("UTF-16BE")); } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE) { *data += UTF16BOM_LEN; *len -= UTF16BOM_LEN; return hubbub_mibenum_from_name("UTF-16LE", SLEN("UTF-16LE")); } else if ((*data)[0] == 0xEF && (*data)[1] == 0xBB && (*data)[2] == 0xBF) { *data += UTF8BOM_LEN; *len -= UTF8BOM_LEN; return hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); } #undef UTF32BOM_LEN #undef UTF16BOM_LEN #undef UTF8BOM_LEN return 0; } #define PEEK(a) \ (pos < end - SLEN(a) && \ strncasecmp((const char *) pos, a, SLEN(a)) == 0) #define ADVANCE(a) \ while (pos < end - SLEN(a)) { \ if (PEEK(a)) \ break; \ pos++; \ } \ \ if (pos == end - SLEN(a)) \ return 0; #define ISSPACE(a) \ (a == 0x09 || a == 0x0a || a == 0x0b || \ a == 0x0c || a == 0x0d || a == 0x20) /** * Search for a meta charset within a buffer of data * * \param data Pointer to buffer containing data * \param len Length of buffer in data * \return MIB enum representing encoding, or 0 if none found */ uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len) { const uint8_t *pos = data; const uint8_t *end; uint16_t mibenum; if (data == NULL) return 0; end = pos + min(512, len); /* 1. */ while (pos < end) { /* a */ if (PEEK(""); /* b */ } else if (PEEK("= end - 1) return 0; if (ISSPACE(*(pos + SLEN("= end) return 0; } /* c */ } else if ((PEEK("' || *pos == '<') break; pos++; } if (pos >= end) return 0; /* 3 */ if (*pos != '<') { const uint8_t *n; const uint8_t *v; uint32_t nl, vl; while (hubbub_charset_get_attribute(&pos, end, &n, &nl, &v, &vl)) ; /* do nothing */ /* 2 */ } else continue; /* d */ } else if (PEEK(""); } /* e - do nothing */ /* 2 */ pos++; } return 0; } /** * Parse attributes on a meta tag * * \param pos Pointer to pointer to current location (updated on exit) * \param end Pointer to end of data stream * \return MIB enum of detected encoding, or 0 if none found */ uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, const uint8_t *end) { const uint8_t *name; const uint8_t *value; uint32_t namelen, valuelen; uint16_t mibenum; if (pos == NULL || *pos == NULL || end == NULL) return 0; /* 2 */ while (hubbub_charset_get_attribute(pos, end, &name, &namelen, &value, &valuelen)) { /* 3 */ /* a */ if (namelen == SLEN("charset") && valuelen > 0 && strncasecmp((const char *) name, "charset", SLEN("charset")) == 0) { /* strip value */ while (ISSPACE(*value)) { value++; valuelen--; } while (valuelen > 0 && ISSPACE(value[valuelen - 1])) valuelen--; mibenum = hubbub_mibenum_from_name( (const char *) value, valuelen); if (mibenum != 0) return mibenum; /* b */ } else if (namelen == SLEN("content") && valuelen > 0 && strncasecmp((const char *) name, "content", SLEN("content")) == 0) { mibenum = hubbub_charset_parse_content(value, valuelen); if (mibenum != 0) return mibenum; } /* c - do nothing */ /* 1 */ while (*pos < end) { if (ISSPACE(**pos)) break; (*pos)++; } if (*pos >= end) { return 0; } } return 0; } /** * Parse a content= attribute's value * * \param value Attribute's value * \param valuelen Length of value * \return MIB enum of detected encoding, or 0 if none found */ uint16_t hubbub_charset_parse_content(const uint8_t *value, uint32_t valuelen) { const uint8_t *end; const uint8_t *tentative = NULL; uint32_t tentative_len = 0; if (value == NULL) return 0; end = value + valuelen; /* 1 */ while (value < end) { if (*value == ';') { value++; break; } value++; } if (value >= end) return 0; /* 2 */ while (value < end && ISSPACE(*value)) { value++; } if (value >= end) return 0; /* 3 */ if (value < end - SLEN("charset") && strncasecmp((const char *) value, "charset", SLEN("charset")) != 0) return 0; value += SLEN("charset"); /* 4 */ while (value < end && ISSPACE(*value)) { value++; } if (value >= end) return 0; /* 5 */ if (*value != '=') return 0; /* skip '=' */ value++; /* 6 */ while (value < end && ISSPACE(*value)) { value++; } if (value >= end) return 0; /* 7 */ tentative = value; /* a */ if (*value == '"') { while (++value < end && *value != '"') { tentative_len++; } if (value < end) tentative++; else tentative = NULL; /* b */ } else if (*value == '\'') { while (++value < end && *value != '\'') { tentative_len++; } if (value < end) tentative++; else tentative = NULL; /* c */ } else { while (value < end && !ISSPACE(*value)) { value++; tentative_len++; } } /* 8 */ if (tentative != NULL) { return hubbub_mibenum_from_name((const char *) tentative, tentative_len); } /* 9 */ return 0; } /** * Extract an attribute from the data stream * * \param data Pointer to pointer to current location (updated on exit) * \param end Pointer to end of data stream * \param name Pointer to location to receive attribute name * \param namelen Pointer to location to receive attribute name length * \param value Pointer to location to receive attribute value * \param valuelen Pointer to location to receive attribute value langth * \return true if attribute extracted, false otherwise. * * Note: The caller should heed the returned lengths; these are the only * indicator that useful content resides in name or value. */ bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, const uint8_t **name, uint32_t *namelen, const uint8_t **value, uint32_t *valuelen) { const uint8_t *pos; if (data == NULL || *data == NULL || end == NULL || name == NULL || namelen == NULL || value == NULL || valuelen == NULL) return false; pos = *data; /* 1. Skip leading spaces or '/' characters */ while (pos < end && (ISSPACE(*pos) || *pos == '/')) { pos++; } if (pos >= end) { *data = pos; return false; } /* 2. Invalid element open character */ if (*pos == '<') { pos--; *data = pos; return false; } /* 3. End of element */ if (*pos == '>') { *data = pos; return false; } /* 4. Initialise name & value to empty string */ *name = pos; *namelen = 0; *value = (const uint8_t *) ""; *valuelen = 0; /* 5. Extract name */ while (pos < end) { /* a */ if (*pos == '=') { break; } /* b */ if (ISSPACE(*pos)) { break; } /* c */ if (*pos == '/' || *pos == '<' || *pos == '>') { return true; } /* d is handled by strncasecmp in _parse_attributes */ /* e */ (*namelen)++; /* 6 */ pos++; } if (pos >= end) { *data = pos; return false; } if (ISSPACE(*pos)) { /* 7. Skip trailing spaces */ while (pos < end && ISSPACE(*pos)) { pos++; } if (pos >= end) { *data = pos; return false; } /* 8. Must be '=' */ if (*pos != '=') { pos--; *data = pos; return true; } } /* 9. Skip '=' */ pos++; /* 10. Skip any spaces after '=' */ while (pos < end && ISSPACE(*pos)) { pos++; } if (pos >= end) { *data = pos; return false; } /* 11. Extract value, if quoted */ /* a */ if (*pos == '\'' || *pos == '"') { /* 1 */ const uint8_t *quote = pos; /* 2 */ while (++pos < end) { /* 3 */ if (*pos == *quote) { *value = (quote + 1); *data = ++pos; return true; } /* 4 is handled by strncasecmp */ /* 5 */ (*valuelen)++; /* 6 */ } if (pos >= end) { *data = pos; return false; } } /* b */ if (*pos == '<' || *pos == '>') { *data = pos; return true; } /* c is handled by strncasecmp */ /* d */ *value = pos; while (pos < end) { /* 12. Extract unquoted value */ /* a */ if (ISSPACE(*pos) || *pos == '<' || *pos == '>') { *data = pos; return true; } /* b is handled by strncasecmp */ /* c */ (*valuelen)++; /* 13. Advance */ pos++; } if (pos >= end) { *data = pos; return false; } /* should never be reached */ abort(); return false; }