From 6807fa854da64166e84efd0074b1e4dfeb5d8b17 Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Sun, 4 Sep 2011 06:28:09 +0000
Subject: Sniff content types where appropriate. We never sniff for CSS, nor
 for non-page artefacts (e.g. treeview icons)

svn path=/trunk/netsurf/; revision=12707
---
 content/content_factory.c |  41 +--
 content/content_factory.h |   3 +-
 content/hlcache.c         | 215 +++++++++------
 content/hlcache.h         |   4 +-
 content/llcache.c         |  17 +-
 content/llcache.h         |   6 +-
 content/mimesniff.c       | 687 ++++++++++++++++++++++++++++++++++++++++++++++
 content/mimesniff.h       |  54 ++++
 8 files changed, 918 insertions(+), 109 deletions(-)
 create mode 100644 content/mimesniff.c
 create mode 100644 content/mimesniff.h

(limited to 'content')

diff --git a/content/content_factory.c b/content/content_factory.c
index d3a69c34d..8baa20f7a 100644
--- a/content/content_factory.c
+++ b/content/content_factory.c
@@ -147,42 +147,43 @@ content_type content_factory_type_from_mime_type(lwc_string *mime_type)
  * \param llcache           Underlying source data handle
  * \param fallback_charset  Character set to fall back to if none specified
  * \param quirks            Quirkiness of containing document
+ * \param effective_type    Effective MIME type of content
  * \return Pointer to content object, or NULL on failure
  */
 struct content *content_factory_create_content(llcache_handle *llcache,
-		const char *fallback_charset, bool quirks)
+		const char *fallback_charset, bool quirks,
+		lwc_string *effective_type)
 {
 	struct content *c;
 	const char *content_type_header;
 	const content_handler *handler;
-	http_content_type *ct;
+	http_content_type *ct = NULL;
 	nserror error;
 
-	content_type_header = 
-			llcache_handle_get_header(llcache, "Content-Type");
-	if (content_type_header == NULL)
-		content_type_header = "text/plain";
-
-	error = http_parse_content_type(content_type_header, &ct);
-	if (error != NSERROR_OK)
+	handler = content_lookup(effective_type);
+	if (handler == NULL)
 		return NULL;
 
-	handler = content_lookup(ct->media_type);
-	if (handler == NULL) {
-		http_content_type_destroy(ct);
-		return NULL;
+	assert(handler->create != NULL);
+
+	/* Use the parameters from the declared Content-Type header */
+	content_type_header = 
+			llcache_handle_get_header(llcache, "Content-Type");
+	if (content_type_header != NULL) {
+		/* We don't care if this fails */
+		http_parse_content_type(content_type_header, &ct);
 	}
 
-	assert(handler->create != NULL);
+	error = handler->create(handler, effective_type, 
+			ct != NULL ? ct->parameters : NULL, 
+			llcache, fallback_charset, quirks, 
+			&c);
 
-	error = handler->create(handler, ct->media_type, ct->parameters, 
-			llcache, fallback_charset, quirks, &c);
-	if (error != NSERROR_OK) {
+	if (ct != NULL)
 		http_content_type_destroy(ct);
-		return NULL;
-	}
 
-	http_content_type_destroy(ct);
+	if (error != NSERROR_OK)
+		return NULL;
 
 	return c;
 }
diff --git a/content/content_factory.h b/content/content_factory.h
index 777e314f0..b383f461b 100644
--- a/content/content_factory.h
+++ b/content/content_factory.h
@@ -82,7 +82,8 @@ nserror content_factory_register_handler(lwc_string *mime_type,
 		const content_handler *handler);
 
 struct content *content_factory_create_content(struct llcache_handle *llcache, 
-		const char *fallback_charset, bool quirks);
+		const char *fallback_charset, bool quirks,
+		lwc_string *effective_type);
 
 content_type content_factory_type_from_mime_type(lwc_string *mime_type);
 
diff --git a/content/hlcache.c b/content/hlcache.c
index 40b6486c9..38ec18eb6 100644
--- a/content/hlcache.c
+++ b/content/hlcache.c
@@ -26,6 +26,7 @@
 
 #include "content/content.h"
 #include "content/hlcache.h"
+#include "content/mimesniff.h"
 #include "utils/http.h"
 #include "utils/log.h"
 #include "utils/messages.h"
@@ -50,7 +51,7 @@ struct hlcache_retrieval_ctx {
 
 	content_type accepted_types;	/**< Accepted types */
 
-	hlcache_child_context child;	/**< Child context */	
+	hlcache_child_context child;	/**< Child context */
 };
 
 /** High-level cache handle */
@@ -79,9 +80,12 @@ static void hlcache_clean(void *ignored);
 
 static nserror hlcache_llcache_callback(llcache_handle *handle,
 		const llcache_event *event, void *pw);
-static bool hlcache_type_is_acceptable(llcache_handle *llcache, 
+static nserror hlcache_migrate_ctx(hlcache_retrieval_ctx *ctx,
+		lwc_string *effective_type);
+static bool hlcache_type_is_acceptable(lwc_string *mime_type, 
 		content_type accepted_types, content_type *computed_type);
-static nserror hlcache_find_content(hlcache_retrieval_ctx *ctx);
+static nserror hlcache_find_content(hlcache_retrieval_ctx *ctx,
+		lwc_string *effective_type);
 static void hlcache_content_callback(struct content *c,
 		content_msg msg, union content_msg_data data, void *pw);
 
@@ -441,77 +445,55 @@ nserror hlcache_llcache_callback(llcache_handle *handle,
 		const llcache_event *event, void *pw)
 {
 	hlcache_retrieval_ctx *ctx = pw;
+	lwc_string *effective_type = NULL;
 	nserror error;
 
 	assert(ctx->llcache == handle);
 
 	switch (event->type) {
 	case LLCACHE_EVENT_HAD_HEADERS:
-	{
-		content_type type = 0;
-		
-		/* Unlink the context to prevent recursion */
-		RING_REMOVE(hlcache_retrieval_ctx_ring, ctx);
-		
-		if (hlcache_type_is_acceptable(handle, 
-				ctx->accepted_types, &type)) {
-			error = hlcache_find_content(ctx);
-			if (error != NSERROR_OK) {
-				hlcache_event hlevent;
-
-				hlevent.type = CONTENT_MSG_ERROR;
-				hlevent.data.error = messages_get("MiscError");
-
-				ctx->handle->cb(ctx->handle, &hlevent, 
-						ctx->handle->pw);
-				
-				llcache_handle_abort(handle);
-				llcache_handle_release(handle);
-				free((char *) ctx->child.charset);
-				free(ctx);
-				return error;
-			}
-		} else if (type == CONTENT_NONE && 
-				(ctx->flags & HLCACHE_RETRIEVE_MAY_DOWNLOAD)) {
-			/* Unknown type, and we can download, so convert */
-			llcache_handle_force_stream(handle);
+		error = mimesniff_compute_effective_type(handle, NULL, 0,
+				ctx->flags & HLCACHE_RETRIEVE_SNIFF_TYPE, 
+				&effective_type);
+		if (error == NSERROR_OK || error == NSERROR_NOT_FOUND) {
+			/* If the sniffer was successful or failed to find 
+			 * a Content-Type header when sniffing was 
+			 * prohibited, we must migrate the retrieval context. */
+			error = hlcache_migrate_ctx(ctx, effective_type);
+
+			if (effective_type != NULL)
+				lwc_string_unref(effective_type);
+		}
 
-			if (ctx->handle->cb != NULL) {
-				hlcache_event hlevent;
+		/* No need to report that we need data: 
+		 * we'll get some anyway if there is any */
+		if (error == NSERROR_NEED_DATA)
+			error = NSERROR_OK;
 
-				hlevent.type = CONTENT_MSG_DOWNLOAD;
-				hlevent.data.download = handle;
+		return error;
 
-				ctx->handle->cb(ctx->handle, &hlevent,
-						ctx->handle->pw);
-			}
-		} else {
-			/* Unacceptable type: abort fetch and report error */
-			llcache_handle_abort(handle);
-			llcache_handle_release(handle);
+		break;
+	case LLCACHE_EVENT_HAD_DATA:
+		error = mimesniff_compute_effective_type(handle, 
+				event->data.data.buf, event->data.data.len,
+				ctx->flags & HLCACHE_RETRIEVE_SNIFF_TYPE,
+				&effective_type);
+		if (error != NSERROR_OK) {
+			assert(0 && "MIME sniff failed with data");
+		}
 
-			if (ctx->handle->cb != NULL) {
-				hlcache_event hlevent;
+		error = hlcache_migrate_ctx(ctx, effective_type);
 
-				hlevent.type = CONTENT_MSG_ERROR;
-				hlevent.data.error = messages_get("BadType");
+		lwc_string_unref(effective_type);
 
-				ctx->handle->cb(ctx->handle, &hlevent, 
-						ctx->handle->pw);
-			}
-		}
+		return error;
 
-		/* No longer require retrieval context */
-		free((char *) ctx->child.charset);
-		free(ctx);
-	}
 		break;
-	case LLCACHE_EVENT_HAD_DATA:
-		/* fall through */
 	case LLCACHE_EVENT_DONE:
-		/* should never happen: the handler must be changed */
-		assert(0 && "Unexpected llcache event");
-		break;
+		/* DONE event before we could determine the effective MIME type.
+		 * Treat this as an error.
+		 */
+		/* Fall through */
 	case LLCACHE_EVENT_ERROR:
 		if (ctx->handle->cb != NULL) {
 			hlcache_event hlevent;
@@ -530,33 +512,93 @@ nserror hlcache_llcache_callback(llcache_handle *handle,
 }
 
 /**
- * Determine if the type of a low-level cache object is acceptable
+ * Migrate a retrieval context into its final destination content
  *
- * \param llcache	  Low-level cache object to consider
+ * \param ctx             Context to migrate
+ * \param effective_type  The effective MIME type of the content, or NULL
+ * \return NSERROR_OK on success, 
+ *         NSERROR_NEED_DATA on success where data is needed,
+ *         appropriate error otherwise
+ */
+nserror hlcache_migrate_ctx(hlcache_retrieval_ctx *ctx, 
+		lwc_string *effective_type)
+{
+	content_type type = CONTENT_NONE;
+	nserror error = NSERROR_OK;
+
+	/* Unlink the context to prevent recursion */
+	RING_REMOVE(hlcache_retrieval_ctx_ring, ctx);
+		
+	if (effective_type != NULL && 
+			hlcache_type_is_acceptable(effective_type,
+			ctx->accepted_types, &type)) {
+		error = hlcache_find_content(ctx, effective_type);
+		if (error != NSERROR_OK && error != NSERROR_NEED_DATA) {
+			hlcache_event hlevent;
+
+			hlevent.type = CONTENT_MSG_ERROR;
+			hlevent.data.error = messages_get("MiscError");
+
+			ctx->handle->cb(ctx->handle, &hlevent, 
+					ctx->handle->pw);
+			
+			llcache_handle_abort(ctx->llcache);
+			llcache_handle_release(ctx->llcache);
+		}
+	} else if (type == CONTENT_NONE && 
+			(ctx->flags & HLCACHE_RETRIEVE_MAY_DOWNLOAD)) {
+		/* Unknown type, and we can download, so convert */
+		llcache_handle_force_stream(ctx->llcache);
+
+		if (ctx->handle->cb != NULL) {
+			hlcache_event hlevent;
+
+			hlevent.type = CONTENT_MSG_DOWNLOAD;
+			hlevent.data.download = ctx->llcache;
+
+			ctx->handle->cb(ctx->handle, &hlevent,
+					ctx->handle->pw);
+		}
+
+		/* Ensure caller knows we need data */
+		error = NSERROR_NEED_DATA;
+	} else {
+		/* Unacceptable type: abort fetch and report error */
+		llcache_handle_abort(ctx->llcache);
+		llcache_handle_release(ctx->llcache);
+
+		if (ctx->handle->cb != NULL) {
+			hlcache_event hlevent;
+
+			hlevent.type = CONTENT_MSG_ERROR;
+			hlevent.data.error = messages_get("BadType");
+
+			ctx->handle->cb(ctx->handle, &hlevent, 
+					ctx->handle->pw);
+		}
+	}
+
+	/* No longer require retrieval context */
+	free((char *) ctx->child.charset);
+	free(ctx);
+
+	return error;
+}
+
+/**
+ * Determine if the specified MIME type is acceptable
+ *
+ * \param mime_type       MIME type to consider
  * \param accepted_types  Array of acceptable types, or NULL for any
  * \param computed_type	  Pointer to location to receive computed type of object
  * \return True if the type is acceptable, false otherwise
  */
-bool hlcache_type_is_acceptable(llcache_handle *llcache, 
+bool hlcache_type_is_acceptable(lwc_string *mime_type,
 		content_type accepted_types, content_type *computed_type)
 {
-	const char *content_type_header;
-	http_content_type *ct;
 	content_type type;
-	nserror error;
 
-	content_type_header = 
-			llcache_handle_get_header(llcache, "Content-Type");
-	if (content_type_header == NULL)
-		content_type_header = "text/plain";
-
-	error = http_parse_content_type(content_type_header, &ct);
-	if (error != NSERROR_OK)
-		return false;
-
-	type = content_factory_type_from_mime_type(ct->media_type);
-
-	http_content_type_destroy(ct);
+	type = content_factory_type_from_mime_type(mime_type);
 
 	*computed_type = type;
 
@@ -566,18 +608,23 @@ bool hlcache_type_is_acceptable(llcache_handle *llcache,
 /**
  * Find a content for the high-level cache handle
  *
- * \param ctx	High-level cache retrieval context
- * \return NSERROR_OK on success, appropriate error otherwise
+ * \param ctx             High-level cache retrieval context
+ * \param effective_type  Effective MIME type of content
+ * \return NSERROR_OK on success, 
+ *         NSERROR_NEED_DATA on success where data is needed,
+ *         appropriate error otherwise
  *
  * \pre handle::state == HLCACHE_HANDLE_NEW
  * \pre Headers must have been received for associated low-level handle
  * \post Low-level handle is either released, or associated with new content
  * \post High-level handle is registered with content
  */
-nserror hlcache_find_content(hlcache_retrieval_ctx *ctx)
+nserror hlcache_find_content(hlcache_retrieval_ctx *ctx,
+		lwc_string *effective_type)
 {
 	hlcache_entry *entry;
 	hlcache_event event;
+	nserror error = NSERROR_OK;
 
 	/* Search list of cached contents for a suitable one */
 	for (entry = hlcache_content_list; entry != NULL; entry = entry->next) {
@@ -617,7 +664,8 @@ nserror hlcache_find_content(hlcache_retrieval_ctx *ctx)
 
 		/* Create content using llhandle */
 		entry->content = content_factory_create_content(ctx->llcache, 
-				ctx->child.charset, ctx->child.quirks);
+				ctx->child.charset, ctx->child.quirks,
+				effective_type);
 		if (entry->content == NULL) {
 			free(entry);
 			return NSERROR_NOMEM;
@@ -629,6 +677,9 @@ nserror hlcache_find_content(hlcache_retrieval_ctx *ctx)
 		if (hlcache_content_list != NULL)
 			hlcache_content_list->prev = entry;
 		hlcache_content_list = entry;
+
+		/* Signal to caller that we created a content */
+		error = NSERROR_NEED_DATA;
 	} else {
 		/* Found a suitable content: no longer need low-level handle */
 		llcache_handle_release(ctx->llcache);	
@@ -676,7 +727,7 @@ nserror hlcache_find_content(hlcache_retrieval_ctx *ctx)
 		}
 	}
 
-	return NSERROR_OK;
+	return error;
 }
 
 /**
diff --git a/content/hlcache.h b/content/hlcache.h
index 2372158e1..714fcce45 100644
--- a/content/hlcache.h
+++ b/content/hlcache.h
@@ -60,7 +60,9 @@ enum hlcache_retrieve_flag {
 	 * To avoid confusion, high-level flags are allocated from bit 31 down. 
 	 */
 	/** It's permitted to convert this request into a download */
-	HLCACHE_RETRIEVE_MAY_DOWNLOAD = (1 << 31)
+	HLCACHE_RETRIEVE_MAY_DOWNLOAD = (1 << 31),
+	/** Permit content-type sniffing */
+	HLCACHE_RETRIEVE_SNIFF_TYPE   = (1 << 30)
 };
 
 /**
diff --git a/content/llcache.c b/content/llcache.c
index 3141ad05f..b02d0135f 100644
--- a/content/llcache.c
+++ b/content/llcache.c
@@ -1455,6 +1455,8 @@ nserror llcache_object_notify_users(llcache_object *object)
 		if (handle->state == LLCACHE_FETCH_DATA &&
 				objstate >= LLCACHE_FETCH_DATA &&
 				object->source_len > handle->bytes) {
+			size_t orig_handle_read;
+
 			/* Construct HAD_DATA event */
 			event.type = LLCACHE_EVENT_HAD_DATA;
 			event.data.data.buf = 
@@ -1466,9 +1468,13 @@ nserror llcache_object_notify_users(llcache_object *object)
 			if (object->fetch.flags & 
 					LLCACHE_RETRIEVE_STREAM_DATA) {
 				/* Streaming, so reset to zero to 
-				 * minimise amount of cached source data */
+				 * minimise amount of cached source data.
+				 * Additionally, we don't support replay
+				 * when streaming. */
+				orig_handle_read = 0;
 				handle->bytes = object->source_len = 0;
 			} else {
+				orig_handle_read = handle->bytes;
 				handle->bytes = object->source_len;
 			}
 
@@ -1482,6 +1488,15 @@ nserror llcache_object_notify_users(llcache_object *object)
 				if (error != NSERROR_OK)
 					return error;
 
+				continue;
+			} else if (error == NSERROR_NEED_DATA) {
+				/* User requested replay */
+				handle->bytes = orig_handle_read;
+
+				/* Continue with the next user -- we'll 
+				 * re-emit the data next time round */
+				user->iterator_target = false;
+				next_user = user->next;
 				continue;
 			} else if (error != NSERROR_OK) {
 				user->iterator_target = false;
diff --git a/content/llcache.h b/content/llcache.h
index 215e6cc1a..e6584e165 100644
--- a/content/llcache.h
+++ b/content/llcache.h
@@ -89,12 +89,10 @@ enum llcache_retrieve_flag {
 	LLCACHE_RETRIEVE_FORCE_FETCH    = (1 << 0), 
 	/** Requested URL was verified */
 	LLCACHE_RETRIEVE_VERIFIABLE     = (1 << 1), 
-	/** Permit content-type sniffing */
-	LLCACHE_RETRIEVE_SNIFF_TYPE     = (1 << 2), 
 	/**< No error pages */
-	LLCACHE_RETRIEVE_NO_ERROR_PAGES = (1 << 3),
+	LLCACHE_RETRIEVE_NO_ERROR_PAGES = (1 << 2),
 	/**< Stream data (implies that object is not cacheable) */
-	LLCACHE_RETRIEVE_STREAM_DATA    = (1 << 4)
+	LLCACHE_RETRIEVE_STREAM_DATA    = (1 << 3)
 };
 
 /** Low-level cache query types */
diff --git a/content/mimesniff.c b/content/mimesniff.c
new file mode 100644
index 000000000..a911318f9
--- /dev/null
+++ b/content/mimesniff.c
@@ -0,0 +1,687 @@
+/*
+ * Copyright 2011 John-Mark Bell <jmb@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * MIME type sniffer (implementation)
+ */
+
+#include<string.h>
+
+#include "content/content_factory.h"
+#include "content/llcache.h"
+#include "content/mimesniff.h"
+#include "utils/http.h"
+#include "utils/utils.h"
+
+static lwc_string *unknown_unknown;
+static lwc_string *application_unknown;
+static lwc_string *any;
+static lwc_string *text_xml;
+static lwc_string *application_xml;
+static lwc_string *text_html;
+static lwc_string *text_plain;
+static lwc_string *application_octet_stream;
+static lwc_string *image_gif;
+static lwc_string *image_png;
+static lwc_string *image_jpeg;
+static lwc_string *image_bmp;
+static lwc_string *image_vnd_microsoft_icon;
+static lwc_string *image_webp;
+static lwc_string *application_rss_xml;
+static lwc_string *application_atom_xml;
+static lwc_string *audio_x_wave;
+static lwc_string *application_ogg;
+static lwc_string *video_webm;
+static lwc_string *application_x_rar_compressed;
+static lwc_string *application_zip;
+static lwc_string *application_x_gzip;
+static lwc_string *application_postscript;
+static lwc_string *application_pdf;
+
+nserror mimesniff_init(void)
+{
+	lwc_error lerror;
+
+#define SINIT(v, s) \
+	lerror = lwc_intern_string(s, SLEN(s), &v); \
+	if (lerror != lwc_error_ok) \
+		return NSERROR_NOMEM
+
+	SINIT(unknown_unknown,              "unknown/unknown");
+	SINIT(application_unknown,          "application/unknown");
+	SINIT(any,                          "*/*");
+	SINIT(text_xml,                     "text/xml");
+	SINIT(application_xml,              "application/xml");
+	SINIT(text_html,                    "text/html");
+	SINIT(text_plain,                   "text/plain");
+	SINIT(application_octet_stream,     "application/octet-stream");
+	SINIT(image_gif,                    "image/gif");
+	SINIT(image_png,                    "image/png");
+	SINIT(image_jpeg,                   "image/jpeg");
+	SINIT(image_bmp,                    "image/bmp");
+	SINIT(image_vnd_microsoft_icon,     "image/vnd.microsoft.icon");
+	SINIT(image_webp,                   "image/webp");
+	SINIT(application_rss_xml,          "application/rss+xml");
+	SINIT(application_atom_xml,         "application/atom+xml");
+	SINIT(audio_x_wave,                 "audio/x-wave");
+	SINIT(application_ogg,              "application/ogg");
+	SINIT(video_webm,                   "video/webm");
+	SINIT(application_x_rar_compressed, "application/x-rar-compressed");
+	SINIT(application_zip,              "application/zip");
+	SINIT(application_x_gzip,           "application/x-gzip");
+	SINIT(application_postscript,       "application/postscript");
+	SINIT(application_pdf,              "application/pdf");
+#undef SINIT
+
+	return NSERROR_OK;
+}
+
+void mimesniff_fini(void)
+{
+	lwc_string_unref(application_pdf);
+	lwc_string_unref(application_postscript);
+	lwc_string_unref(application_x_gzip);
+	lwc_string_unref(application_zip);
+	lwc_string_unref(application_x_rar_compressed);
+	lwc_string_unref(video_webm);
+	lwc_string_unref(application_ogg);
+	lwc_string_unref(audio_x_wave);
+	lwc_string_unref(application_atom_xml);
+	lwc_string_unref(application_rss_xml);
+	lwc_string_unref(image_webp);
+	lwc_string_unref(image_vnd_microsoft_icon);
+	lwc_string_unref(image_bmp);
+	lwc_string_unref(image_jpeg);
+	lwc_string_unref(image_png);
+	lwc_string_unref(image_gif);
+	lwc_string_unref(application_octet_stream);
+	lwc_string_unref(text_plain);
+	lwc_string_unref(text_html);
+	lwc_string_unref(application_xml);
+	lwc_string_unref(text_xml);
+	lwc_string_unref(any);
+	lwc_string_unref(application_unknown);
+	lwc_string_unref(unknown_unknown);
+}
+
+static bool mimesniff__has_binary_octets(const uint8_t *data, size_t len)
+{
+	const uint8_t *end = data + len;
+
+	while (data != end) {
+		const uint8_t c = *data;
+
+		/* Binary iff in C0 and not ESC, CR, FF, LF, HT */
+		if (c <= 0x1f && c != 0x1b && c != '\r' && c != '\f' && 
+				c != '\n' && c != '\t')
+			break;
+
+		data++;
+	}
+
+	return data != end;
+}
+
+struct map_s {
+	const uint8_t *sig;
+	size_t len;
+	bool safe;
+	lwc_string **type;
+};
+
+static nserror mimesniff__match_unknown_ws(const uint8_t *data, size_t len,
+		lwc_string **effective_type)
+{
+#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t }
+	static const struct map_s ws_exact_match_types[] = {
+		SIG(&text_xml, "<?xml", false),
+		{ NULL, 0, false, NULL }
+	};
+
+	static const struct map_s ws_inexact_match_types[] = {
+		SIG(&text_html, "<!DOCTYPE HTML", false),
+		SIG(&text_html, "<HTML",          false),
+		SIG(&text_html, "<HEAD",          false),
+		SIG(&text_html, "<SCRIPT",        false),
+		SIG(&text_html, "<IFRAME",        false),
+		SIG(&text_html, "<H1",            false),
+		SIG(&text_html, "<DIV",           false),
+		SIG(&text_html, "<FONT",          false),
+		SIG(&text_html, "<TABLE",         false),
+		SIG(&text_html, "<A",             false),
+		SIG(&text_html, "<STYLE",         false),
+		SIG(&text_html, "<TITLE",         false),
+		SIG(&text_html, "<B",             false),
+		SIG(&text_html, "<BODY",          false),
+		SIG(&text_html, "<BR",            false),
+		SIG(&text_html, "<P",             false),
+		SIG(&text_html, "<!--",           false),
+		{ NULL, 0, false, NULL }
+	};
+#undef SIG
+	const uint8_t *end = data + len;
+	const struct map_s *it;
+
+	/* Skip leading whitespace */
+	while (data != end) {
+		const uint8_t c = *data;
+
+		if (c != '\t' && c != '\n' && c != '\f' && 
+				c != '\r' && c != ' ')
+			break;
+
+		data++;
+		len--;
+	}
+
+	if (data == end)
+		return NSERROR_NOT_FOUND;
+
+	for (it = ws_exact_match_types; it->sig != NULL; it++) {
+		if (it->len <= len && memcmp(data, it->sig, it->len) == 0) {
+			*effective_type = lwc_string_ref(*it->type);
+			return NSERROR_OK;
+		}
+	}
+
+	for (it = ws_inexact_match_types; it->sig != NULL; it++) {
+		/* +1 for trailing space or > */
+		if (len < it->len + 1)
+			continue;
+
+		if (strncasecmp((const char *) data, 
+				(const char *) it->sig, it->len) == 0 && 
+				(data[it->len] == ' ' || 
+				data[it->len] == '>')) {
+			*effective_type = lwc_string_ref(*it->type);
+			return NSERROR_OK;
+		}
+	}
+
+	return NSERROR_NOT_FOUND;
+}
+
+static nserror mimesniff__match_unknown_bom(const uint8_t *data, size_t len,
+		lwc_string **effective_type)
+{
+#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t }
+	static const struct map_s bom_match_types[] = {
+		SIG(&text_plain, "\xfe\xff",     false),
+		SIG(&text_plain, "\xff\xfe",     false),
+		SIG(&text_plain, "\xef\xbb\xbf", false),
+		{ NULL, 0, false, NULL }
+	};
+#undef SIG
+	const struct map_s *it;
+
+	for (it = bom_match_types; it->sig != NULL; it++) {
+		if (it->len <= len && memcmp(data, it->sig, it->len) == 0) {
+			*effective_type = lwc_string_ref(*it->type);
+			return NSERROR_OK;
+		}
+	}
+
+	return NSERROR_NOT_FOUND;
+}
+
+static nserror mimesniff__match_unknown_riff(const uint8_t *data, size_t len,
+		lwc_string **effective_type)
+{
+#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t }
+	static const struct map_s riff_match_types[] = {
+		SIG(&image_webp,   "WEBPVP", true),
+		SIG(&audio_x_wave, "WAVE",   true),
+		{ NULL, 0, false, NULL }
+	};
+#undef SIG
+	const struct map_s *it;
+
+	for (it = riff_match_types; it->sig != NULL; it++) {
+		if (it->len + SLEN("RIFF????") <= len && 
+				memcmp(data, "RIFF", SLEN("RIFF")) == 0 &&
+				memcmp(data + SLEN("RIFF????"), 
+						it->sig, it->len) == 0) {
+			*effective_type = lwc_string_ref(*it->type);
+			return NSERROR_OK;
+		}
+	}
+
+	return NSERROR_NOT_FOUND;
+}
+
+static nserror mimesniff__match_unknown_exact(const uint8_t *data, size_t len,
+		bool allow_unsafe, lwc_string **effective_type)
+{
+#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t }
+	static const struct map_s exact_match_types[] = {
+		SIG(&image_gif,                    "GIF87a",            true),
+		SIG(&image_gif,                    "GIF89a",            true),
+		SIG(&image_png,                    "\x89PNG\r\n\x1a\n", true),
+		SIG(&image_jpeg,                   "\xff\xd8\xff",      true),
+		SIG(&image_bmp,                    "BM",                true),
+		SIG(&image_vnd_microsoft_icon,     "\x00\x00\x01\x00",  true),
+		SIG(&application_ogg,              "OggS\x00",          true),
+		SIG(&video_webm,                   "\x1a\x45\xdf\xa3",  true),
+		SIG(&application_x_rar_compressed, "Rar \x1a\x07\x00",  true),
+		SIG(&application_zip,              "PK\x03\x04",        true),
+		SIG(&application_x_gzip,           "\x1f\x8b\x08",      true),
+		SIG(&application_postscript,       "%!PS-Adobe-",       true),
+		SIG(&application_pdf,              "%PDF-",             false),
+		{ NULL, 0, false, NULL }
+	};
+#undef SIG
+	const struct map_s *it;
+
+	for (it = exact_match_types; it->sig != NULL; it++) {
+		if (it->len <= len && memcmp(data, it->sig, it->len) == 0 &&
+				(allow_unsafe || it->safe)) {
+			*effective_type = lwc_string_ref(*it->type);
+			return NSERROR_OK;
+		}
+	}
+
+	return NSERROR_NOT_FOUND;
+}
+
+static nserror mimesniff__match_unknown(const uint8_t *data, size_t len,
+		bool allow_unsafe, lwc_string **effective_type)
+{
+	if (mimesniff__match_unknown_exact(data, len, allow_unsafe, 
+			effective_type) == NSERROR_OK)
+		return NSERROR_OK;
+
+	if (mimesniff__match_unknown_riff(data, len, 
+			effective_type) == NSERROR_OK)
+		return NSERROR_OK;
+
+	if (allow_unsafe == false)
+		return NSERROR_NOT_FOUND;
+
+	if (mimesniff__match_unknown_bom(data, len,
+			effective_type) == NSERROR_OK)
+		return NSERROR_OK;
+
+	if (mimesniff__match_unknown_ws(data, len,
+			effective_type) == NSERROR_OK)
+		return NSERROR_OK;
+
+	return NSERROR_NOT_FOUND;
+}
+
+static nserror mimesniff__compute_unknown(const uint8_t *data, size_t len,
+		lwc_string **effective_type)
+{
+	if (data == NULL)
+		return NSERROR_NEED_DATA;
+
+	len = min(len, 512);
+
+	if (mimesniff__match_unknown(data, len, true, 
+			effective_type) == NSERROR_OK)
+		return NSERROR_OK;
+
+	if (mimesniff__has_binary_octets(data, len) == false) {
+		/* No binary octets => text/plain */
+		*effective_type = lwc_string_ref(text_plain);
+		return NSERROR_OK;
+	}
+
+	*effective_type = lwc_string_ref(application_octet_stream);
+
+	return NSERROR_OK;
+}
+
+static nserror mimesniff__compute_text_or_binary(const uint8_t *data, 
+		size_t len, lwc_string **effective_type)
+{
+	if (data == NULL)
+		return NSERROR_NEED_DATA;
+
+	len = min(len, 512);
+
+	if (len >= 3 && ((data[0] == 0xfe && data[1] == 0xff) ||
+			(data[0] == 0xff && data[1] == 0xfe) ||
+			(data[0] == 0xef && data[1] == 0xbb && 
+				data[2] == 0xbf))) {
+		/* Found a BOM => text/plain */
+		*effective_type = lwc_string_ref(text_plain);
+		return NSERROR_OK;
+	}
+
+	if (mimesniff__has_binary_octets(data, len) == false) {
+		/* No binary octets => text/plain */
+		*effective_type = lwc_string_ref(text_plain);
+		return NSERROR_OK;
+	}
+
+	if (mimesniff__match_unknown(data, len, false, 
+			effective_type) == NSERROR_OK)
+		return NSERROR_OK;
+
+	*effective_type = lwc_string_ref(application_octet_stream);
+
+	return NSERROR_OK;
+}
+
+static nserror mimesniff__compute_image(lwc_string *official_type,
+		const uint8_t *data, size_t len, lwc_string **effective_type)
+{
+#define SIG(t, s) { (const uint8_t *) s, SLEN(s), t }
+	static const struct it_s {
+		const uint8_t *sig;
+		size_t len;
+		lwc_string **type;
+	} image_types[] = {
+		SIG(&image_gif,                "GIF87a"),
+		SIG(&image_gif,                "GIF89a"),
+		SIG(&image_png,                "\x89PNG\r\n\x1a\n"),
+		SIG(&image_jpeg,               "\xff\xd8\xff"),
+		SIG(&image_bmp,                "BM"),
+		SIG(&image_vnd_microsoft_icon, "\x00\x00\x01\x00"),
+		{ NULL, 0, NULL }
+	};
+#undef SIG
+
+	const struct it_s *it;
+
+	if (data == NULL)
+		return NSERROR_NEED_DATA;
+
+	for (it = image_types; it->sig != NULL; it++) {
+		if (it->len <= len && memcmp(data, it->sig, it->len) == 0) {
+			lwc_string_unref(official_type);
+			*effective_type = lwc_string_ref(*it->type);
+			return NSERROR_OK;
+		}
+	}
+
+	/* WebP has a signature that doesn't fit into the above table */
+	if (SLEN("RIFF????WEBPVP") <= len && 
+			memcmp(data, "RIFF", SLEN("RIFF")) == 0 && 
+			memcmp(data + SLEN("RIFF????"), 
+					"WEBPVP", SLEN("WEBPVP")) == 0 ) {
+		lwc_string_unref(official_type);
+		*effective_type = lwc_string_ref(image_webp);
+		return NSERROR_OK;
+	}
+
+	*effective_type = official_type;
+
+	return NSERROR_OK;
+}
+
+static nserror mimesniff__compute_feed_or_html(const uint8_t *data,
+		size_t len, lwc_string **effective_type)
+{
+#define RDF_NS "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+#define RSS_NS "http://purl.org/rss/1.0"
+
+	enum state_e {
+		BEFORE_BOM,
+		BEFORE_MARKUP,
+		MARKUP_START,
+		COMMENT_OR_DOCTYPE,
+		IN_COMMENT,
+		IN_DOCTYPE,
+		IN_PI,
+		IN_TAG,
+		IN_RDF
+	} state = BEFORE_BOM;
+
+	bool rdf = false, rss = false;
+	const uint8_t *end;
+
+	if (data == NULL)
+		return NSERROR_NEED_DATA;
+
+	end = data + min(len, 512);
+
+	while (data < end) {
+		const uint8_t c = *data;
+
+#define MATCH(s) SLEN(s) <= (size_t) (end - data) && \
+			memcmp(data, s, SLEN(s)) == 0
+
+		switch (state) {
+		case BEFORE_BOM:
+			if (3 <= end - data && c == 0xef && data[1] == 0xbb && 
+					data[2] == 0xbf) {
+				data += 3;
+			}
+
+			state = BEFORE_MARKUP;
+			break;
+		case BEFORE_MARKUP:
+			if (c == '\t' || c == '\n' || c	== '\r' || c == ' ')
+				data++;
+			else if (c != '<')
+				data = end;
+			else {
+				state = MARKUP_START;
+				data++;
+			}
+			break;
+		case MARKUP_START:
+			if (c == '!') {
+				state = COMMENT_OR_DOCTYPE;
+				data++;
+			} else if (c == '?') {
+				state = IN_PI;
+				data++;
+			} else {
+				/* Reconsume input */
+				state = IN_TAG;
+			}
+			break;
+		case COMMENT_OR_DOCTYPE:
+			if (2 <= end - data && c == '-' && data[1] == '-') {
+				state = IN_COMMENT;
+				data += 2;
+			} else {
+				/* Reconsume input */
+				state = IN_DOCTYPE;
+			}
+			break;
+		case IN_COMMENT:
+			if (3 <= end - data && c == '-' && data[1] == '-' &&
+					data[2] == '>') {
+				state = BEFORE_MARKUP;
+				data += 3;
+			} else
+				data++;
+			break;
+		case IN_DOCTYPE:
+			if (c == '>')
+				state = BEFORE_MARKUP;
+			data++;
+			break;
+		case IN_PI:
+			if (2 <= end - data && c == '?' && data[1] == '>') {
+				state = BEFORE_MARKUP;
+				data += 2;
+			} else
+				data++;
+			break;
+		case IN_TAG:
+			if (MATCH("rss")) {
+				*effective_type = 
+					lwc_string_ref(application_rss_xml);
+				return NSERROR_OK;
+			} else if (MATCH("feed")) {
+				*effective_type = 
+					lwc_string_ref(application_atom_xml);
+				return NSERROR_OK;
+			} else if (MATCH("rdf:RDF")) {
+				state = IN_RDF;
+				data += SLEN("rdf:RDF");
+			} else
+				data = end;
+			break;
+		case IN_RDF:
+			if (MATCH(RSS_NS)) {
+				rss = true;
+				data += SLEN(RSS_NS);
+			} else if (MATCH(RDF_NS)) {
+				rdf = true;
+				data += SLEN(RDF_NS);
+			} else
+				data++;
+
+			if (rdf && rss) {
+				*effective_type = 
+					lwc_string_ref(application_rss_xml);
+				return NSERROR_OK;
+			}
+
+			break;
+		}
+#undef MATCH
+	}
+
+	*effective_type = lwc_string_ref(text_html);
+
+	return NSERROR_OK;
+
+#undef RSS_NS
+#undef RDF_NS
+}
+
+/* Compute effective MIME type from header and data; see mimesniff.h */
+nserror mimesniff_compute_effective_type(llcache_handle *handle,
+		const uint8_t *data, size_t len, bool sniff_allowed,
+		lwc_string **effective_type)
+{
+#define S(s) { s, SLEN(s) }	/* fills data and len from one literal */
+	static const struct tt_s {
+		const char *data;
+		size_t len;
+	} text_types[] = {
+		S("text/plain"),
+		S("text/plain; charset=ISO-8859-1"),
+		S("text/plain; charset=iso-8859-1"),
+		S("text/plain; charset=UTF-8"),
+		{ NULL, 0 }	/* sentinel: the scan below stops on NULL data */
+	};
+#undef S
+
+	const char *content_type_header;
+	size_t content_type_header_len;
+	http_content_type *ct;
+	const struct tt_s *tt;
+	bool match;
+	nserror error;
+
+	content_type_header = 
+			llcache_handle_get_header(handle, "Content-Type");
+	if (content_type_header == NULL) {
+		if (sniff_allowed == false)
+			return NSERROR_NOT_FOUND;
+
+		/* No Content-Type header at all => sniff as unknown type */
+		return mimesniff__compute_unknown(data, len, effective_type);
+	}
+
+	error = http_parse_content_type(content_type_header, &ct);
+	if (error != NSERROR_OK) {
+		if (sniff_allowed == false)
+			return NSERROR_NOT_FOUND;
+
+		/* Header present but unparseable => sniff as unknown type */
+		return mimesniff__compute_unknown(data, len, effective_type);
+	}
+
+	if (sniff_allowed == false) {	/* no sniffing: use official type */
+		*effective_type = lwc_string_ref(ct->media_type);
+		http_content_type_destroy(ct);
+		return NSERROR_OK;
+	}
+
+	content_type_header_len = strlen(content_type_header);
+
+	/* Raw header byte-equal to a text_types entry => text vs binary */
+	for (tt = text_types; tt->data != NULL; tt++) {
+		if (tt->len == content_type_header_len &&
+				memcmp(tt->data, content_type_header, 
+					content_type_header_len) == 0) {
+			http_content_type_destroy(ct);
+			return mimesniff__compute_text_or_binary(data, len,
+					effective_type);
+		}
+	}
+
+	/* unknown/unknown, application/unknown, * / * => sniff as unknown */
+	if ((lwc_string_caseless_isequal(ct->media_type, unknown_unknown, 
+				&match) == lwc_error_ok && match) ||
+			(lwc_string_caseless_isequal(ct->media_type, 
+				application_unknown, &match) == lwc_error_ok && 
+				match) ||
+			(lwc_string_caseless_isequal(ct->media_type, any, 
+				&match) == lwc_error_ok && match)) {
+		http_content_type_destroy(ct);
+		return mimesniff__compute_unknown(data, len, effective_type);
+	}
+
+	/* Media type with a "+xml" suffix, compared case-insensitively */
+	if (lwc_string_length(ct->media_type) > SLEN("+xml") &&
+			strncasecmp(lwc_string_data(ct->media_type) + 
+				lwc_string_length(ct->media_type) - 
+				SLEN("+xml"), 
+				"+xml", SLEN("+xml")) == 0) {
+		/* No sniffing for these: use official type */
+		*effective_type = lwc_string_ref(ct->media_type);
+		http_content_type_destroy(ct);
+		return NSERROR_OK;
+	}
+
+	/* text/xml, application/xml: likewise returned without sniffing */
+	if ((lwc_string_caseless_isequal(ct->media_type, text_xml, 
+				&match) == lwc_error_ok && match) ||
+			(lwc_string_caseless_isequal(ct->media_type, 
+				application_xml, &match) == lwc_error_ok && 
+				match)) {
+		/* Use official type */
+		*effective_type = lwc_string_ref(ct->media_type);
+		http_content_type_destroy(ct);
+		return NSERROR_OK;
+	}
+	
+	/* Official type maps to CONTENT_IMAGE => image sniffing */
+	if (content_factory_type_from_mime_type(ct->media_type) == 
+			CONTENT_IMAGE) {
+		lwc_string *official_type = lwc_string_ref(ct->media_type);
+		http_content_type_destroy(ct);	/* ref above outlives ct */
+		return mimesniff__compute_image(official_type,
+				data, len, effective_type);
+	}
+
+	/* text/html: handed to the feed-or-HTML sniffer */
+	if ((lwc_string_caseless_isequal(ct->media_type, text_html, 
+			&match) == lwc_error_ok && match)) {
+		http_content_type_destroy(ct);
+		return mimesniff__compute_feed_or_html(data, len,
+				effective_type);
+	}
+
+	/* Anything else: use official type unchanged */
+	*effective_type = lwc_string_ref(ct->media_type);
+
+	http_content_type_destroy(ct);
+
+	return NSERROR_OK;
+}
+
diff --git a/content/mimesniff.h b/content/mimesniff.h
new file mode 100644
index 000000000..8ddabd2e7
--- /dev/null
+++ b/content/mimesniff.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2011 John-Mark Bell <jmb@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * MIME type sniffer (interface)
+ */
+
+#ifndef NETSURF_CONTENT_MIMESNIFF_H_
+#define NETSURF_CONTENT_MIMESNIFF_H_
+
+#include <stdbool.h>
+
+#include <libwapcaplet/libwapcaplet.h>
+#include "utils/errors.h"
+
+struct llcache_handle;
+
+/**
+ * Compute the effective MIME type for an object using the sniffing
+ * algorithm described in draft-abarth-mime-sniff-06.
+ *
+ * \param handle          Source data handle to sniff
+ * \param data            First data chunk, or NULL
+ * \param len             Length of \a data, in bytes
+ * \param sniff_allowed   Whether MIME type sniffing is allowed
+ * \param effective_type  Location to receive computed type
+ * \return NSERROR_OK on success,
+ *         NSERROR_NEED_DATA iff \a data is NULL and data is needed
+ *         NSERROR_NOT_FOUND if sniffing is prohibited and the
+ *                           Content-Type header is missing or unparseable
+ */
+nserror mimesniff_compute_effective_type(struct llcache_handle *handle,
+		const uint8_t *data, size_t len, bool sniff_allowed,
+		lwc_string **effective_type);
+
+nserror mimesniff_init(void);	/* NOTE(review): presumably set-up; confirm */
+void mimesniff_fini(void);	/* NOTE(review): presumably tear-down; confirm */
+
+#endif
-- 
cgit v1.2.3