From 6807fa854da64166e84efd0074b1e4dfeb5d8b17 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sun, 4 Sep 2011 06:28:09 +0000 Subject: Sniff content types where appropriate. We never sniff for CSS, nor for non-page artefacts (e.g. treeview icons) svn path=/trunk/netsurf/; revision=12707 --- content/content_factory.c | 41 +-- content/content_factory.h | 3 +- content/hlcache.c | 215 +++++++++------ content/hlcache.h | 4 +- content/llcache.c | 17 +- content/llcache.h | 6 +- content/mimesniff.c | 687 ++++++++++++++++++++++++++++++++++++++++++++++ content/mimesniff.h | 54 ++++ 8 files changed, 918 insertions(+), 109 deletions(-) create mode 100644 content/mimesniff.c create mode 100644 content/mimesniff.h (limited to 'content') diff --git a/content/content_factory.c b/content/content_factory.c index d3a69c34d..8baa20f7a 100644 --- a/content/content_factory.c +++ b/content/content_factory.c @@ -147,42 +147,43 @@ content_type content_factory_type_from_mime_type(lwc_string *mime_type) * \param llcache Underlying source data handle * \param fallback_charset Character set to fall back to if none specified * \param quirks Quirkiness of containing document + * \param effective_type Effective MIME type of content * \return Pointer to content object, or NULL on failure */ struct content *content_factory_create_content(llcache_handle *llcache, - const char *fallback_charset, bool quirks) + const char *fallback_charset, bool quirks, + lwc_string *effective_type) { struct content *c; const char *content_type_header; const content_handler *handler; - http_content_type *ct; + http_content_type *ct = NULL; nserror error; - content_type_header = - llcache_handle_get_header(llcache, "Content-Type"); - if (content_type_header == NULL) - content_type_header = "text/plain"; - - error = http_parse_content_type(content_type_header, &ct); - if (error != NSERROR_OK) + handler = content_lookup(effective_type); + if (handler == NULL) return NULL; - handler = content_lookup(ct->media_type); - 
if (handler == NULL) { - http_content_type_destroy(ct); - return NULL; + assert(handler->create != NULL); + + /* Use the parameters from the declared Content-Type header */ + content_type_header = + llcache_handle_get_header(llcache, "Content-Type"); + if (content_type_header != NULL) { + /* We don't care if this fails */ + http_parse_content_type(content_type_header, &ct); } - assert(handler->create != NULL); + error = handler->create(handler, effective_type, + ct != NULL ? ct->parameters : NULL, + llcache, fallback_charset, quirks, + &c); - error = handler->create(handler, ct->media_type, ct->parameters, - llcache, fallback_charset, quirks, &c); - if (error != NSERROR_OK) { + if (ct != NULL) http_content_type_destroy(ct); - return NULL; - } - http_content_type_destroy(ct); + if (error != NSERROR_OK) + return NULL; return c; } diff --git a/content/content_factory.h b/content/content_factory.h index 777e314f0..b383f461b 100644 --- a/content/content_factory.h +++ b/content/content_factory.h @@ -82,7 +82,8 @@ nserror content_factory_register_handler(lwc_string *mime_type, const content_handler *handler); struct content *content_factory_create_content(struct llcache_handle *llcache, - const char *fallback_charset, bool quirks); + const char *fallback_charset, bool quirks, + lwc_string *effective_type); content_type content_factory_type_from_mime_type(lwc_string *mime_type); diff --git a/content/hlcache.c b/content/hlcache.c index 40b6486c9..38ec18eb6 100644 --- a/content/hlcache.c +++ b/content/hlcache.c @@ -26,6 +26,7 @@ #include "content/content.h" #include "content/hlcache.h" +#include "content/mimesniff.h" #include "utils/http.h" #include "utils/log.h" #include "utils/messages.h" @@ -50,7 +51,7 @@ struct hlcache_retrieval_ctx { content_type accepted_types; /**< Accepted types */ - hlcache_child_context child; /**< Child context */ + hlcache_child_context child; /**< Child context */ }; /** High-level cache handle */ @@ -79,9 +80,12 @@ static void 
hlcache_clean(void *ignored); static nserror hlcache_llcache_callback(llcache_handle *handle, const llcache_event *event, void *pw); -static bool hlcache_type_is_acceptable(llcache_handle *llcache, +static nserror hlcache_migrate_ctx(hlcache_retrieval_ctx *ctx, + lwc_string *effective_type); +static bool hlcache_type_is_acceptable(lwc_string *mime_type, content_type accepted_types, content_type *computed_type); -static nserror hlcache_find_content(hlcache_retrieval_ctx *ctx); +static nserror hlcache_find_content(hlcache_retrieval_ctx *ctx, + lwc_string *effective_type); static void hlcache_content_callback(struct content *c, content_msg msg, union content_msg_data data, void *pw); @@ -441,77 +445,55 @@ nserror hlcache_llcache_callback(llcache_handle *handle, const llcache_event *event, void *pw) { hlcache_retrieval_ctx *ctx = pw; + lwc_string *effective_type = NULL; nserror error; assert(ctx->llcache == handle); switch (event->type) { case LLCACHE_EVENT_HAD_HEADERS: - { - content_type type = 0; - - /* Unlink the context to prevent recursion */ - RING_REMOVE(hlcache_retrieval_ctx_ring, ctx); - - if (hlcache_type_is_acceptable(handle, - ctx->accepted_types, &type)) { - error = hlcache_find_content(ctx); - if (error != NSERROR_OK) { - hlcache_event hlevent; - - hlevent.type = CONTENT_MSG_ERROR; - hlevent.data.error = messages_get("MiscError"); - - ctx->handle->cb(ctx->handle, &hlevent, - ctx->handle->pw); - - llcache_handle_abort(handle); - llcache_handle_release(handle); - free((char *) ctx->child.charset); - free(ctx); - return error; - } - } else if (type == CONTENT_NONE && - (ctx->flags & HLCACHE_RETRIEVE_MAY_DOWNLOAD)) { - /* Unknown type, and we can download, so convert */ - llcache_handle_force_stream(handle); + error = mimesniff_compute_effective_type(handle, NULL, 0, + ctx->flags & HLCACHE_RETRIEVE_SNIFF_TYPE, + &effective_type); + if (error == NSERROR_OK || error == NSERROR_NOT_FOUND) { + /* If the sniffer was successful or failed to find + * a Content-Type 
header when sniffing was + * prohibited, we must migrate the retrieval context. */ + error = hlcache_migrate_ctx(ctx, effective_type); + + if (effective_type != NULL) + lwc_string_unref(effective_type); + } - if (ctx->handle->cb != NULL) { - hlcache_event hlevent; + /* No need to report that we need data: + * we'll get some anyway if there is any */ + if (error == NSERROR_NEED_DATA) + error = NSERROR_OK; - hlevent.type = CONTENT_MSG_DOWNLOAD; - hlevent.data.download = handle; + return error; - ctx->handle->cb(ctx->handle, &hlevent, - ctx->handle->pw); - } - } else { - /* Unacceptable type: abort fetch and report error */ - llcache_handle_abort(handle); - llcache_handle_release(handle); + break; + case LLCACHE_EVENT_HAD_DATA: + error = mimesniff_compute_effective_type(handle, + event->data.data.buf, event->data.data.len, + ctx->flags & HLCACHE_RETRIEVE_SNIFF_TYPE, + &effective_type); + if (error != NSERROR_OK) { + assert(0 && "MIME sniff failed with data"); + } - if (ctx->handle->cb != NULL) { - hlcache_event hlevent; + error = hlcache_migrate_ctx(ctx, effective_type); - hlevent.type = CONTENT_MSG_ERROR; - hlevent.data.error = messages_get("BadType"); + lwc_string_unref(effective_type); - ctx->handle->cb(ctx->handle, &hlevent, - ctx->handle->pw); - } - } + return error; - /* No longer require retrieval context */ - free((char *) ctx->child.charset); - free(ctx); - } break; - case LLCACHE_EVENT_HAD_DATA: - /* fall through */ case LLCACHE_EVENT_DONE: - /* should never happen: the handler must be changed */ - assert(0 && "Unexpected llcache event"); - break; + /* DONE event before we could determine the effective MIME type. + * Treat this as an error. 
+ */ + /* Fall through */ case LLCACHE_EVENT_ERROR: if (ctx->handle->cb != NULL) { hlcache_event hlevent; @@ -530,33 +512,93 @@ nserror hlcache_llcache_callback(llcache_handle *handle, } /** - * Determine if the type of a low-level cache object is acceptable + * Migrate a retrieval context into its final destination content * - * \param llcache Low-level cache object to consider + * \param ctx Context to migrate + * \param effective_type The effective MIME type of the content, or NULL + * \return NSERROR_OK on success, + * NSERROR_NEED_DATA on success where data is needed, + * appropriate error otherwise + */ +nserror hlcache_migrate_ctx(hlcache_retrieval_ctx *ctx, + lwc_string *effective_type) +{ + content_type type = CONTENT_NONE; + nserror error = NSERROR_OK; + + /* Unlink the context to prevent recursion */ + RING_REMOVE(hlcache_retrieval_ctx_ring, ctx); + + if (effective_type != NULL && + hlcache_type_is_acceptable(effective_type, + ctx->accepted_types, &type)) { + error = hlcache_find_content(ctx, effective_type); + if (error != NSERROR_OK && error != NSERROR_NEED_DATA) { + hlcache_event hlevent; + + hlevent.type = CONTENT_MSG_ERROR; + hlevent.data.error = messages_get("MiscError"); + + ctx->handle->cb(ctx->handle, &hlevent, + ctx->handle->pw); + + llcache_handle_abort(ctx->llcache); + llcache_handle_release(ctx->llcache); + } + } else if (type == CONTENT_NONE && + (ctx->flags & HLCACHE_RETRIEVE_MAY_DOWNLOAD)) { + /* Unknown type, and we can download, so convert */ + llcache_handle_force_stream(ctx->llcache); + + if (ctx->handle->cb != NULL) { + hlcache_event hlevent; + + hlevent.type = CONTENT_MSG_DOWNLOAD; + hlevent.data.download = ctx->llcache; + + ctx->handle->cb(ctx->handle, &hlevent, + ctx->handle->pw); + } + + /* Ensure caller knows we need data */ + error = NSERROR_NEED_DATA; + } else { + /* Unacceptable type: abort fetch and report error */ + llcache_handle_abort(ctx->llcache); + llcache_handle_release(ctx->llcache); + + if (ctx->handle->cb != NULL) 
{ + hlcache_event hlevent; + + hlevent.type = CONTENT_MSG_ERROR; + hlevent.data.error = messages_get("BadType"); + + ctx->handle->cb(ctx->handle, &hlevent, + ctx->handle->pw); + } + } + + /* No longer require retrieval context */ + free((char *) ctx->child.charset); + free(ctx); + + return error; +} + +/** + * Determine if the specified MIME type is acceptable + * + * \param mime_type MIME type to consider * \param accepted_types Array of acceptable types, or NULL for any * \param computed_type Pointer to location to receive computed type of object * \return True if the type is acceptable, false otherwise */ -bool hlcache_type_is_acceptable(llcache_handle *llcache, +bool hlcache_type_is_acceptable(lwc_string *mime_type, content_type accepted_types, content_type *computed_type) { - const char *content_type_header; - http_content_type *ct; content_type type; - nserror error; - content_type_header = - llcache_handle_get_header(llcache, "Content-Type"); - if (content_type_header == NULL) - content_type_header = "text/plain"; - - error = http_parse_content_type(content_type_header, &ct); - if (error != NSERROR_OK) - return false; - - type = content_factory_type_from_mime_type(ct->media_type); - - http_content_type_destroy(ct); + type = content_factory_type_from_mime_type(mime_type); *computed_type = type; @@ -566,18 +608,23 @@ bool hlcache_type_is_acceptable(llcache_handle *llcache, /** * Find a content for the high-level cache handle * - * \param ctx High-level cache retrieval context - * \return NSERROR_OK on success, appropriate error otherwise + * \param ctx High-level cache retrieval context + * \param effective_type Effective MIME type of content + * \return NSERROR_OK on success, + * NSERROR_NEED_DATA on success where data is needed, + * appropriate error otherwise * * \pre handle::state == HLCACHE_HANDLE_NEW * \pre Headers must have been received for associated low-level handle * \post Low-level handle is either released, or associated with new content * \post 
High-level handle is registered with content */ -nserror hlcache_find_content(hlcache_retrieval_ctx *ctx) +nserror hlcache_find_content(hlcache_retrieval_ctx *ctx, + lwc_string *effective_type) { hlcache_entry *entry; hlcache_event event; + nserror error = NSERROR_OK; /* Search list of cached contents for a suitable one */ for (entry = hlcache_content_list; entry != NULL; entry = entry->next) { @@ -617,7 +664,8 @@ nserror hlcache_find_content(hlcache_retrieval_ctx *ctx) /* Create content using llhandle */ entry->content = content_factory_create_content(ctx->llcache, - ctx->child.charset, ctx->child.quirks); + ctx->child.charset, ctx->child.quirks, + effective_type); if (entry->content == NULL) { free(entry); return NSERROR_NOMEM; @@ -629,6 +677,9 @@ nserror hlcache_find_content(hlcache_retrieval_ctx *ctx) if (hlcache_content_list != NULL) hlcache_content_list->prev = entry; hlcache_content_list = entry; + + /* Signal to caller that we created a content */ + error = NSERROR_NEED_DATA; } else { /* Found a suitable content: no longer need low-level handle */ llcache_handle_release(ctx->llcache); @@ -676,7 +727,7 @@ nserror hlcache_find_content(hlcache_retrieval_ctx *ctx) } } - return NSERROR_OK; + return error; } /** diff --git a/content/hlcache.h b/content/hlcache.h index 2372158e1..714fcce45 100644 --- a/content/hlcache.h +++ b/content/hlcache.h @@ -60,7 +60,9 @@ enum hlcache_retrieve_flag { * To avoid confusion, high-level flags are allocated from bit 31 down. 
*/ /** It's permitted to convert this request into a download */ - HLCACHE_RETRIEVE_MAY_DOWNLOAD = (1 << 31) + HLCACHE_RETRIEVE_MAY_DOWNLOAD = (1 << 31), + /* Permit content-type sniffing */ + HLCACHE_RETRIEVE_SNIFF_TYPE = (1 << 30) }; /** diff --git a/content/llcache.c b/content/llcache.c index 3141ad05f..b02d0135f 100644 --- a/content/llcache.c +++ b/content/llcache.c @@ -1455,6 +1455,8 @@ nserror llcache_object_notify_users(llcache_object *object) if (handle->state == LLCACHE_FETCH_DATA && objstate >= LLCACHE_FETCH_DATA && object->source_len > handle->bytes) { + size_t orig_handle_read; + /* Construct HAD_DATA event */ event.type = LLCACHE_EVENT_HAD_DATA; event.data.data.buf = @@ -1466,9 +1468,13 @@ nserror llcache_object_notify_users(llcache_object *object) if (object->fetch.flags & LLCACHE_RETRIEVE_STREAM_DATA) { /* Streaming, so reset to zero to - * minimise amount of cached source data */ + * minimise amount of cached source data. + * Additionally, we don't support replay + * when streaming. 
*/ + orig_handle_read = 0; handle->bytes = object->source_len = 0; } else { + orig_handle_read = handle->bytes; handle->bytes = object->source_len; } @@ -1482,6 +1488,15 @@ nserror llcache_object_notify_users(llcache_object *object) if (error != NSERROR_OK) return error; + continue; + } else if (error == NSERROR_NEED_DATA) { + /* User requested replay */ + handle->bytes = orig_handle_read; + + /* Continue with the next user -- we'll + * reemit the data next time round */ + user->iterator_target = false; + next_user = user->next; continue; } else if (error != NSERROR_OK) { user->iterator_target = false; diff --git a/content/llcache.h b/content/llcache.h index 215e6cc1a..e6584e165 100644 --- a/content/llcache.h +++ b/content/llcache.h @@ -89,12 +89,10 @@ enum llcache_retrieve_flag { LLCACHE_RETRIEVE_FORCE_FETCH = (1 << 0), /** Requested URL was verified */ LLCACHE_RETRIEVE_VERIFIABLE = (1 << 1), - /** Permit content-type sniffing */ - LLCACHE_RETRIEVE_SNIFF_TYPE = (1 << 2), /**< No error pages */ - LLCACHE_RETRIEVE_NO_ERROR_PAGES = (1 << 3), + LLCACHE_RETRIEVE_NO_ERROR_PAGES = (1 << 2), /**< Stream data (implies that object is not cacheable) */ - LLCACHE_RETRIEVE_STREAM_DATA = (1 << 4) + LLCACHE_RETRIEVE_STREAM_DATA = (1 << 3) }; /** Low-level cache query types */ diff --git a/content/mimesniff.c b/content/mimesniff.c new file mode 100644 index 000000000..a911318f9 --- /dev/null +++ b/content/mimesniff.c @@ -0,0 +1,687 @@ +/* + * Copyright 2011 John-Mark Bell + * + * This file is part of NetSurf, http://www.netsurf-browser.org/ + * + * NetSurf is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * NetSurf is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** \file + * MIME type sniffer (implementation) + */ + +#include + +#include "content/content_factory.h" +#include "content/llcache.h" +#include "content/mimesniff.h" +#include "utils/http.h" +#include "utils/utils.h" + +static lwc_string *unknown_unknown; +static lwc_string *application_unknown; +static lwc_string *any; +static lwc_string *text_xml; +static lwc_string *application_xml; +static lwc_string *text_html; +static lwc_string *text_plain; +static lwc_string *application_octet_stream; +static lwc_string *image_gif; +static lwc_string *image_png; +static lwc_string *image_jpeg; +static lwc_string *image_bmp; +static lwc_string *image_vnd_microsoft_icon; +static lwc_string *image_webp; +static lwc_string *application_rss_xml; +static lwc_string *application_atom_xml; +static lwc_string *audio_x_wave; +static lwc_string *application_ogg; +static lwc_string *video_webm; +static lwc_string *application_x_rar_compressed; +static lwc_string *application_zip; +static lwc_string *application_x_gzip; +static lwc_string *application_postscript; +static lwc_string *application_pdf; + +nserror mimesniff_init(void) +{ + lwc_error lerror; + +#define SINIT(v, s) \ + lerror = lwc_intern_string(s, SLEN(s), &v); \ + if (lerror != lwc_error_ok) \ + return NSERROR_NOMEM + + SINIT(unknown_unknown, "unknown/unknown"); + SINIT(application_unknown, "application/unknown"); + SINIT(any, "*/*"); + SINIT(text_xml, "text/xml"); + SINIT(application_xml, "application/xml"); + SINIT(text_html, "text/html"); + SINIT(text_plain, "text/plain"); + SINIT(application_octet_stream, "application/octet-stream"); + SINIT(image_gif, "image/gif"); + SINIT(image_png, "image/png"); + SINIT(image_jpeg, "image/jpeg"); + SINIT(image_bmp, "image/bmp"); + SINIT(image_vnd_microsoft_icon, "image/vnd.microsoft.icon"); + 
SINIT(image_webp, "image/webp"); + SINIT(application_rss_xml, "application/rss+xml"); + SINIT(application_atom_xml, "application/atom+xml"); + SINIT(audio_x_wave, "audio/x-wave"); + SINIT(application_ogg, "application/ogg"); + SINIT(video_webm, "video/webm"); + SINIT(application_x_rar_compressed, "application/x-rar-compressed"); + SINIT(application_zip, "application/zip"); + SINIT(application_x_gzip, "application/x-gzip"); + SINIT(application_postscript, "application/postscript"); + SINIT(application_pdf, "application/pdf"); +#undef SINIT + + return NSERROR_OK; +} + +void mimesniff_fini(void) +{ + lwc_string_unref(application_pdf); + lwc_string_unref(application_postscript); + lwc_string_unref(application_x_gzip); + lwc_string_unref(application_zip); + lwc_string_unref(application_x_rar_compressed); + lwc_string_unref(video_webm); + lwc_string_unref(application_ogg); + lwc_string_unref(audio_x_wave); + lwc_string_unref(application_atom_xml); + lwc_string_unref(application_rss_xml); + lwc_string_unref(image_webp); + lwc_string_unref(image_vnd_microsoft_icon); + lwc_string_unref(image_bmp); + lwc_string_unref(image_jpeg); + lwc_string_unref(image_png); + lwc_string_unref(image_gif); + lwc_string_unref(application_octet_stream); + lwc_string_unref(text_plain); + lwc_string_unref(text_html); + lwc_string_unref(application_xml); + lwc_string_unref(text_xml); + lwc_string_unref(any); + lwc_string_unref(application_unknown); + lwc_string_unref(unknown_unknown); +} + +static bool mimesniff__has_binary_octets(const uint8_t *data, size_t len) +{ + const uint8_t *end = data + len; + + while (data != end) { + const uint8_t c = *data; + + /* Binary iff in C0 and not ESC, CR, FF, LF, HT */ + if (c <= 0x1f && c != 0x1b && c != '\r' && c != '\f' && + c != '\n' && c != '\t') + break; + + data++; + } + + return data != end; +} + +struct map_s { + const uint8_t *sig; + size_t len; + bool safe; + lwc_string **type; +}; + +static nserror mimesniff__match_unknown_ws(const uint8_t *data, 
size_t len, + lwc_string **effective_type) +{ +#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t } + static const struct map_s ws_exact_match_types[] = { + SIG(&text_xml, "sig != NULL; it++) { + if (it->len <= len && memcmp(data, it->sig, it->len) == 0) { + *effective_type = lwc_string_ref(*it->type); + return NSERROR_OK; + } + } + + for (it = ws_inexact_match_types; it->sig != NULL; it++) { + /* +1 for trailing space or > */ + if (len < it->len + 1) + continue; + + if (strncasecmp((const char *) data, + (const char *) it->sig, it->len) == 0 && + (data[it->len] == ' ' || + data[it->len] == '>')) { + *effective_type = lwc_string_ref(*it->type); + return NSERROR_OK; + } + } + + return NSERROR_NOT_FOUND; +} + +static nserror mimesniff__match_unknown_bom(const uint8_t *data, size_t len, + lwc_string **effective_type) +{ +#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t } + static const struct map_s bom_match_types[] = { + SIG(&text_plain, "\xfe\xff", false), + SIG(&text_plain, "\xff\xfe", false), + SIG(&text_plain, "\xef\xbb\xbf", false), + { NULL, 0, false, NULL } + }; +#undef SIG + const struct map_s *it; + + for (it = bom_match_types; it->sig != NULL; it++) { + if (it->len <= len && memcmp(data, it->sig, it->len) == 0) { + *effective_type = lwc_string_ref(*it->type); + return NSERROR_OK; + } + } + + return NSERROR_NOT_FOUND; +} + +static nserror mimesniff__match_unknown_riff(const uint8_t *data, size_t len, + lwc_string **effective_type) +{ +#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t } + static const struct map_s riff_match_types[] = { + SIG(&image_webp, "WEBPVP", true), + SIG(&audio_x_wave, "WAVE", true), + { NULL, 0, false, NULL } + }; +#undef SIG + const struct map_s *it; + + for (it = riff_match_types; it->sig != NULL; it++) { + if (it->len + SLEN("RIFF????") <= len && + memcmp(data, "RIFF", SLEN("RIFF")) == 0 && + memcmp(data + SLEN("RIFF????"), + it->sig, it->len) == 0) { + *effective_type = lwc_string_ref(*it->type); + return 
NSERROR_OK; + } + } + + return NSERROR_NOT_FOUND; +} + +static nserror mimesniff__match_unknown_exact(const uint8_t *data, size_t len, + bool allow_unsafe, lwc_string **effective_type) +{ +#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t } + static const struct map_s exact_match_types[] = { + SIG(&image_gif, "GIF87a", true), + SIG(&image_gif, "GIF89a", true), + SIG(&image_png, "\x89PNG\r\n\x1a\n", true), + SIG(&image_jpeg, "\xff\xd8\xff", true), + SIG(&image_bmp, "BM", true), + SIG(&image_vnd_microsoft_icon, "\x00\x00\x01\x00", true), + SIG(&application_ogg, "OggS\x00", true), + SIG(&video_webm, "\x1a\x45\xdf\xa3", true), + SIG(&application_x_rar_compressed, "Rar \x1a\x07\x00", true), + SIG(&application_zip, "PK\x03\x04", true), + SIG(&application_x_gzip, "\x1f\x8b\x08", true), + SIG(&application_postscript, "%!PS-Adobe-", true), + SIG(&application_pdf, "%PDF-", false), + { NULL, 0, false, NULL } + }; +#undef SIG + const struct map_s *it; + + for (it = exact_match_types; it->sig != NULL; it++) { + if (it->len <= len && memcmp(data, it->sig, it->len) == 0 && + (allow_unsafe || it->safe)) { + *effective_type = lwc_string_ref(*it->type); + return NSERROR_OK; + } + } + + return NSERROR_NOT_FOUND; +} + +static nserror mimesniff__match_unknown(const uint8_t *data, size_t len, + bool allow_unsafe, lwc_string **effective_type) +{ + if (mimesniff__match_unknown_exact(data, len, allow_unsafe, + effective_type) == NSERROR_OK) + return NSERROR_OK; + + if (mimesniff__match_unknown_riff(data, len, + effective_type) == NSERROR_OK) + return NSERROR_OK; + + if (allow_unsafe == false) + return NSERROR_NOT_FOUND; + + if (mimesniff__match_unknown_bom(data, len, + effective_type) == NSERROR_OK) + return NSERROR_OK; + + if (mimesniff__match_unknown_ws(data, len, + effective_type) == NSERROR_OK) + return NSERROR_OK; + + return NSERROR_NOT_FOUND; +} + +static nserror mimesniff__compute_unknown(const uint8_t *data, size_t len, + lwc_string **effective_type) +{ + if (data == NULL) 
+ return NSERROR_NEED_DATA; + + len = min(len, 512); + + if (mimesniff__match_unknown(data, len, true, + effective_type) == NSERROR_OK) + return NSERROR_OK; + + if (mimesniff__has_binary_octets(data, len) == false) { + /* No binary octets => text/plain */ + *effective_type = lwc_string_ref(text_plain); + return NSERROR_OK; + } + + *effective_type = lwc_string_ref(application_octet_stream); + + return NSERROR_OK; +} + +static nserror mimesniff__compute_text_or_binary(const uint8_t *data, + size_t len, lwc_string **effective_type) +{ + if (data == NULL) + return NSERROR_NEED_DATA; + + len = min(len, 512); + + if (len >= 3 && ((data[0] == 0xfe && data[1] == 0xff) || + (data[0] == 0xff && data[1] == 0xfe) || + (data[0] == 0xef && data[1] == 0xbb && + data[2] == 0xbf))) { + /* Found a BOM => text/plain */ + *effective_type = lwc_string_ref(text_plain); + return NSERROR_OK; + } + + if (mimesniff__has_binary_octets(data, len) == false) { + /* No binary octets => text/plain */ + *effective_type = lwc_string_ref(text_plain); + return NSERROR_OK; + } + + if (mimesniff__match_unknown(data, len, false, + effective_type) == NSERROR_OK) + return NSERROR_OK; + + *effective_type = lwc_string_ref(application_octet_stream); + + return NSERROR_OK; +} + +static nserror mimesniff__compute_image(lwc_string *official_type, + const uint8_t *data, size_t len, lwc_string **effective_type) +{ +#define SIG(t, s) { (const uint8_t *) s, SLEN(s), t } + static const struct it_s { + const uint8_t *sig; + size_t len; + lwc_string **type; + } image_types[] = { + SIG(&image_gif, "GIF87a"), + SIG(&image_gif, "GIF89a"), + SIG(&image_png, "\x89PNG\r\n\x1a\n"), + SIG(&image_jpeg, "\xff\xd8\xff"), + SIG(&image_bmp, "BM"), + SIG(&image_vnd_microsoft_icon, "\x00\x00\x01\x00"), + { NULL, 0, NULL } + }; +#undef SIG + + const struct it_s *it; + + if (data == NULL) + return NSERROR_NEED_DATA; + + for (it = image_types; it->sig != NULL; it++) { + if (it->len <= len && memcmp(data, it->sig, it->len) == 0) { + 
lwc_string_unref(official_type); + *effective_type = lwc_string_ref(*it->type); + return NSERROR_OK; + } + } + + /* WebP has a signature that doesn't fit into the above table */ + if (SLEN("RIFF????WEBPVP") <= len && + memcmp(data, "RIFF", SLEN("RIFF")) == 0 && + memcmp(data + SLEN("RIFF????"), + "WEBPVP", SLEN("WEBPVP")) == 0 ) { + lwc_string_unref(official_type); + *effective_type = lwc_string_ref(image_webp); + return NSERROR_OK; + } + + *effective_type = official_type; + + return NSERROR_OK; +} + +static nserror mimesniff__compute_feed_or_html(const uint8_t *data, + size_t len, lwc_string **effective_type) +{ +#define RDF_NS "http://www.w3.org/1999/02/22-rdf-syntax-ns#" +#define RSS_NS "http://purl.org/rss/1.0" + + enum state_e { + BEFORE_BOM, + BEFORE_MARKUP, + MARKUP_START, + COMMENT_OR_DOCTYPE, + IN_COMMENT, + IN_DOCTYPE, + IN_PI, + IN_TAG, + IN_RDF + } state = BEFORE_BOM; + + bool rdf = false, rss = false; + const uint8_t *end; + + if (data == NULL) + return NSERROR_NEED_DATA; + + end = data + min(len, 512); + + while (data < end) { + const uint8_t c = *data; + +#define MATCH(s) SLEN(s) <= (size_t) (end - data) && \ + memcmp(data, s, SLEN(s)) == 0 + + switch (state) { + case BEFORE_BOM: + if (3 <= end - data && c == 0xef && data[1] == 0xbb && + data[2] == 0xbf) { + data += 3; + } + + state = BEFORE_MARKUP; + break; + case BEFORE_MARKUP: + if (c == '\t' || c == '\n' || c == '\r' || c == ' ') + data++; + else if (c != '<') + data = end; + else { + state = MARKUP_START; + data++; + } + break; + case MARKUP_START: + if (c == '!') { + state = COMMENT_OR_DOCTYPE; + data++; + } else if (c == '?') { + state = IN_PI; + data++; + } else { + /* Reconsume input */ + state = IN_TAG; + } + break; + case COMMENT_OR_DOCTYPE: + if (2 <= end - data && c == '-' && data[1] == '-') { + state = IN_COMMENT; + data += 2; + } else { + /* Reconsume input */ + state = IN_DOCTYPE; + } + break; + case IN_COMMENT: + if (3 <= end - data && c == '-' && data[1] == '-' && + data[2] == '>') 
{ + state = BEFORE_MARKUP; + data += 3; + } else + data++; + break; + case IN_DOCTYPE: + if (c == '>') + state = BEFORE_MARKUP; + data++; + break; + case IN_PI: + if (2 <= end - data && c == '?' && data[1] == '>') { + state = BEFORE_MARKUP; + data += 2; + } else + data++; + break; + case IN_TAG: + if (MATCH("rss")) { + *effective_type = + lwc_string_ref(application_rss_xml); + return NSERROR_OK; + } else if (MATCH("feed")) { + *effective_type = + lwc_string_ref(application_atom_xml); + return NSERROR_OK; + } else if (MATCH("rdf:RDF")) { + state = IN_RDF; + data += SLEN("rdf:RDF"); + } else + data = end; + break; + case IN_RDF: + if (MATCH(RSS_NS)) { + rss = true; + data += SLEN(RSS_NS); + } else if (MATCH(RDF_NS)) { + rdf = true; + data += SLEN(RDF_NS); + } else + data++; + + if (rdf && rss) { + *effective_type = + lwc_string_ref(application_rss_xml); + return NSERROR_OK; + } + + break; + } +#undef MATCH + } + + *effective_type = lwc_string_ref(text_html); + + return NSERROR_OK; + +#undef RSS_NS +#undef RDF_NS +} + +/* See mimesniff.h for documentation */ +nserror mimesniff_compute_effective_type(llcache_handle *handle, + const uint8_t *data, size_t len, bool sniff_allowed, + lwc_string **effective_type) +{ +#define S(s) { s, SLEN(s) } + static const struct tt_s { + const char *data; + size_t len; + } text_types[] = { + S("text/plain"), + S("text/plain; charset=ISO-8859-1"), + S("text/plain; charset=iso-8859-1"), + S("text/plain; charset=UTF-8"), + { NULL, 0 } + }; +#undef S + + const char *content_type_header; + size_t content_type_header_len; + http_content_type *ct; + const struct tt_s *tt; + bool match; + nserror error; + + content_type_header = + llcache_handle_get_header(handle, "Content-Type"); + if (content_type_header == NULL) { + if (sniff_allowed == false) + return NSERROR_NOT_FOUND; + + /* No official type => unknown */ + return mimesniff__compute_unknown(data, len, effective_type); + } + + error = http_parse_content_type(content_type_header, &ct); + if 
(error != NSERROR_OK) { + if (sniff_allowed == false) + return NSERROR_NOT_FOUND; + + /* Unparseable => unknown */ + return mimesniff__compute_unknown(data, len, effective_type); + } + + if (sniff_allowed == false) { + *effective_type = lwc_string_ref(ct->media_type); + http_content_type_destroy(ct); + return NSERROR_OK; + } + + content_type_header_len = strlen(content_type_header); + + /* Look for text types */ + for (tt = text_types; tt->data != NULL; tt++) { + if (tt->len == content_type_header_len && + memcmp(tt->data, content_type_header, + content_type_header_len) == 0) { + http_content_type_destroy(ct); + return mimesniff__compute_text_or_binary(data, len, + effective_type); + } + } + + /* unknown/unknown, application/unknown, * / * */ + if ((lwc_string_caseless_isequal(ct->media_type, unknown_unknown, + &match) == lwc_error_ok && match) || + (lwc_string_caseless_isequal(ct->media_type, + application_unknown, &match) == lwc_error_ok && + match) || + (lwc_string_caseless_isequal(ct->media_type, any, + &match) == lwc_error_ok && match)) { + http_content_type_destroy(ct); + return mimesniff__compute_unknown(data, len, effective_type); + } + + /* +xml */ + if (lwc_string_length(ct->media_type) > SLEN("+xml") && + strncasecmp(lwc_string_data(ct->media_type) + + lwc_string_length(ct->media_type) - + SLEN("+xml"), + "+xml", SLEN("+xml")) == 0) { + /* Use official type */ + *effective_type = lwc_string_ref(ct->media_type); + http_content_type_destroy(ct); + return NSERROR_OK; + } + + /* text/xml, application/xml */ + if ((lwc_string_caseless_isequal(ct->media_type, text_xml, + &match) == lwc_error_ok && match) || + (lwc_string_caseless_isequal(ct->media_type, + application_xml, &match) == lwc_error_ok && + match)) { + /* Use official type */ + *effective_type = lwc_string_ref(ct->media_type); + http_content_type_destroy(ct); + return NSERROR_OK; + } + + /* Image types */ + if (content_factory_type_from_mime_type(ct->media_type) == + CONTENT_IMAGE) { + lwc_string 
*official_type = lwc_string_ref(ct->media_type); + http_content_type_destroy(ct); + return mimesniff__compute_image(official_type, + data, len, effective_type); + } + + /* text/html */ + if ((lwc_string_caseless_isequal(ct->media_type, text_html, + &match) == lwc_error_ok && match)) { + http_content_type_destroy(ct); + return mimesniff__compute_feed_or_html(data, len, + effective_type); + } + + /* Use official type */ + *effective_type = lwc_string_ref(ct->media_type); + + http_content_type_destroy(ct); + + return NSERROR_OK; +} + diff --git a/content/mimesniff.h b/content/mimesniff.h new file mode 100644 index 000000000..8ddabd2e7 --- /dev/null +++ b/content/mimesniff.h @@ -0,0 +1,54 @@ +/* + * Copyright 2011 John-Mark Bell + * + * This file is part of NetSurf, http://www.netsurf-browser.org/ + * + * NetSurf is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * NetSurf is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** \file + * MIME type sniffer (interface) + */ + +#ifndef NETSURF_CONTENT_MIMESNIFF_H_ +#define NETSURF_CONTENT_MIMESNIFF_H_ + +#include + +#include +#include "utils/errors.h" + +struct llcache_handle; + +/** + * Compute the effective MIME type for an object using the sniffing + * algorithm described in draft-abarth-mime-sniff-06. 
+ * + * \param handle Source data handle to sniff + * \param data First data chunk, or NULL + * \param len Length of \a data, in bytes + * \param sniff_allowed Whether MIME type sniffing is allowed + * \param effective_type Location to receive computed type + * \return NSERROR_OK on success, + * NSERROR_NEED_DATA iff \a data is NULL and data is needed + * NSERROR_NOT_FOUND if sniffing is prohibited and no + * Content-Type header was found + */ +nserror mimesniff_compute_effective_type(struct llcache_handle *handle, + const uint8_t *data, size_t len, bool sniff_allowed, + lwc_string **effective_type); + +nserror mimesniff_init(void); +void mimesniff_fini(void); + +#endif -- cgit v1.2.3