summaryrefslogtreecommitdiff
path: root/content/mimesniff.c
diff options
context:
space:
mode:
Diffstat (limited to 'content/mimesniff.c')
-rw-r--r--content/mimesniff.c687
1 files changed, 687 insertions, 0 deletions
diff --git a/content/mimesniff.c b/content/mimesniff.c
new file mode 100644
index 000000000..a911318f9
--- /dev/null
+++ b/content/mimesniff.c
@@ -0,0 +1,687 @@
+/*
+ * Copyright 2011 John-Mark Bell <jmb@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * MIME type sniffer (implementation)
+ */
+
+#include<string.h>
+
+#include "content/content_factory.h"
+#include "content/llcache.h"
+#include "content/mimesniff.h"
+#include "utils/http.h"
+#include "utils/utils.h"
+
+static lwc_string *unknown_unknown;
+static lwc_string *application_unknown;
+static lwc_string *any;
+static lwc_string *text_xml;
+static lwc_string *application_xml;
+static lwc_string *text_html;
+static lwc_string *text_plain;
+static lwc_string *application_octet_stream;
+static lwc_string *image_gif;
+static lwc_string *image_png;
+static lwc_string *image_jpeg;
+static lwc_string *image_bmp;
+static lwc_string *image_vnd_microsoft_icon;
+static lwc_string *image_webp;
+static lwc_string *application_rss_xml;
+static lwc_string *application_atom_xml;
+static lwc_string *audio_x_wave;
+static lwc_string *application_ogg;
+static lwc_string *video_webm;
+static lwc_string *application_x_rar_compressed;
+static lwc_string *application_zip;
+static lwc_string *application_x_gzip;
+static lwc_string *application_postscript;
+static lwc_string *application_pdf;
+
+nserror mimesniff_init(void)
+{
+ lwc_error lerror;
+
+#define SINIT(v, s) \
+ lerror = lwc_intern_string(s, SLEN(s), &v); \
+ if (lerror != lwc_error_ok) \
+ return NSERROR_NOMEM
+
+ SINIT(unknown_unknown, "unknown/unknown");
+ SINIT(application_unknown, "application/unknown");
+ SINIT(any, "*/*");
+ SINIT(text_xml, "text/xml");
+ SINIT(application_xml, "application/xml");
+ SINIT(text_html, "text/html");
+ SINIT(text_plain, "text/plain");
+ SINIT(application_octet_stream, "application/octet-stream");
+ SINIT(image_gif, "image/gif");
+ SINIT(image_png, "image/png");
+ SINIT(image_jpeg, "image/jpeg");
+ SINIT(image_bmp, "image/bmp");
+ SINIT(image_vnd_microsoft_icon, "image/vnd.microsoft.icon");
+ SINIT(image_webp, "image/webp");
+ SINIT(application_rss_xml, "application/rss+xml");
+ SINIT(application_atom_xml, "application/atom+xml");
+ SINIT(audio_x_wave, "audio/x-wave");
+ SINIT(application_ogg, "application/ogg");
+ SINIT(video_webm, "video/webm");
+ SINIT(application_x_rar_compressed, "application/x-rar-compressed");
+ SINIT(application_zip, "application/zip");
+ SINIT(application_x_gzip, "application/x-gzip");
+ SINIT(application_postscript, "application/postscript");
+ SINIT(application_pdf, "application/pdf");
+#undef SINIT
+
+ return NSERROR_OK;
+}
+
+void mimesniff_fini(void)
+{
+ lwc_string_unref(application_pdf);
+ lwc_string_unref(application_postscript);
+ lwc_string_unref(application_x_gzip);
+ lwc_string_unref(application_zip);
+ lwc_string_unref(application_x_rar_compressed);
+ lwc_string_unref(video_webm);
+ lwc_string_unref(application_ogg);
+ lwc_string_unref(audio_x_wave);
+ lwc_string_unref(application_atom_xml);
+ lwc_string_unref(application_rss_xml);
+ lwc_string_unref(image_webp);
+ lwc_string_unref(image_vnd_microsoft_icon);
+ lwc_string_unref(image_bmp);
+ lwc_string_unref(image_jpeg);
+ lwc_string_unref(image_png);
+ lwc_string_unref(image_gif);
+ lwc_string_unref(application_octet_stream);
+ lwc_string_unref(text_plain);
+ lwc_string_unref(text_html);
+ lwc_string_unref(application_xml);
+ lwc_string_unref(text_xml);
+ lwc_string_unref(any);
+ lwc_string_unref(application_unknown);
+ lwc_string_unref(unknown_unknown);
+}
+
+static bool mimesniff__has_binary_octets(const uint8_t *data, size_t len)
+{
+ const uint8_t *end = data + len;
+
+ while (data != end) {
+ const uint8_t c = *data;
+
+ /* Binary iff in C0 and not ESC, CR, FF, LF, HT */
+ if (c <= 0x1f && c != 0x1b && c != '\r' && c != '\f' &&
+ c != '\n' && c != '\t')
+ break;
+
+ data++;
+ }
+
+ return data != end;
+}
+
+struct map_s {
+ const uint8_t *sig;
+ size_t len;
+ bool safe;
+ lwc_string **type;
+};
+
+static nserror mimesniff__match_unknown_ws(const uint8_t *data, size_t len,
+ lwc_string **effective_type)
+{
+#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t }
+ static const struct map_s ws_exact_match_types[] = {
+ SIG(&text_xml, "<?xml", false),
+ { NULL, 0, false, NULL }
+ };
+
+ static const struct map_s ws_inexact_match_types[] = {
+ SIG(&text_html, "<!DOCTYPE HTML", false),
+ SIG(&text_html, "<HTML", false),
+ SIG(&text_html, "<HEAD", false),
+ SIG(&text_html, "<SCRIPT", false),
+ SIG(&text_html, "<IFRAME", false),
+ SIG(&text_html, "<H1", false),
+ SIG(&text_html, "<DIV", false),
+ SIG(&text_html, "<FONT", false),
+ SIG(&text_html, "<TABLE", false),
+ SIG(&text_html, "<A", false),
+ SIG(&text_html, "<STYLE", false),
+ SIG(&text_html, "<TITLE", false),
+ SIG(&text_html, "<B", false),
+ SIG(&text_html, "<BODY", false),
+ SIG(&text_html, "<BR", false),
+ SIG(&text_html, "<P", false),
+ SIG(&text_html, "<!--", false),
+ { NULL, 0, false, NULL }
+ };
+#undef SIG
+ const uint8_t *end = data + len;
+ const struct map_s *it;
+
+ /* Skip leading whitespace */
+ while (data != end) {
+ const uint8_t c = *data;
+
+ if (c != '\t' && c != '\n' && c != '\f' &&
+ c != '\r' && c != ' ')
+ break;
+
+ data++;
+ len--;
+ }
+
+ if (data == end)
+ return NSERROR_NOT_FOUND;
+
+ for (it = ws_exact_match_types; it->sig != NULL; it++) {
+ if (it->len <= len && memcmp(data, it->sig, it->len) == 0) {
+ *effective_type = lwc_string_ref(*it->type);
+ return NSERROR_OK;
+ }
+ }
+
+ for (it = ws_inexact_match_types; it->sig != NULL; it++) {
+ /* +1 for trailing space or > */
+ if (len < it->len + 1)
+ continue;
+
+ if (strncasecmp((const char *) data,
+ (const char *) it->sig, it->len) == 0 &&
+ (data[it->len] == ' ' ||
+ data[it->len] == '>')) {
+ *effective_type = lwc_string_ref(*it->type);
+ return NSERROR_OK;
+ }
+ }
+
+ return NSERROR_NOT_FOUND;
+}
+
+static nserror mimesniff__match_unknown_bom(const uint8_t *data, size_t len,
+ lwc_string **effective_type)
+{
+#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t }
+ static const struct map_s bom_match_types[] = {
+ SIG(&text_plain, "\xfe\xff", false),
+ SIG(&text_plain, "\xff\xfe", false),
+ SIG(&text_plain, "\xef\xbb\xbf", false),
+ { NULL, 0, false, NULL }
+ };
+#undef SIG
+ const struct map_s *it;
+
+ for (it = bom_match_types; it->sig != NULL; it++) {
+ if (it->len <= len && memcmp(data, it->sig, it->len) == 0) {
+ *effective_type = lwc_string_ref(*it->type);
+ return NSERROR_OK;
+ }
+ }
+
+ return NSERROR_NOT_FOUND;
+}
+
+static nserror mimesniff__match_unknown_riff(const uint8_t *data, size_t len,
+ lwc_string **effective_type)
+{
+#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t }
+ static const struct map_s riff_match_types[] = {
+ SIG(&image_webp, "WEBPVP", true),
+ SIG(&audio_x_wave, "WAVE", true),
+ { NULL, 0, false, NULL }
+ };
+#undef SIG
+ const struct map_s *it;
+
+ for (it = riff_match_types; it->sig != NULL; it++) {
+ if (it->len + SLEN("RIFF????") <= len &&
+ memcmp(data, "RIFF", SLEN("RIFF")) == 0 &&
+ memcmp(data + SLEN("RIFF????"),
+ it->sig, it->len) == 0) {
+ *effective_type = lwc_string_ref(*it->type);
+ return NSERROR_OK;
+ }
+ }
+
+ return NSERROR_NOT_FOUND;
+}
+
+static nserror mimesniff__match_unknown_exact(const uint8_t *data, size_t len,
+ bool allow_unsafe, lwc_string **effective_type)
+{
+#define SIG(t, s, x) { (const uint8_t *) s, SLEN(s), x, t }
+ static const struct map_s exact_match_types[] = {
+ SIG(&image_gif, "GIF87a", true),
+ SIG(&image_gif, "GIF89a", true),
+ SIG(&image_png, "\x89PNG\r\n\x1a\n", true),
+ SIG(&image_jpeg, "\xff\xd8\xff", true),
+ SIG(&image_bmp, "BM", true),
+ SIG(&image_vnd_microsoft_icon, "\x00\x00\x01\x00", true),
+ SIG(&application_ogg, "OggS\x00", true),
+ SIG(&video_webm, "\x1a\x45\xdf\xa3", true),
+ SIG(&application_x_rar_compressed, "Rar \x1a\x07\x00", true),
+ SIG(&application_zip, "PK\x03\x04", true),
+ SIG(&application_x_gzip, "\x1f\x8b\x08", true),
+ SIG(&application_postscript, "%!PS-Adobe-", true),
+ SIG(&application_pdf, "%PDF-", false),
+ { NULL, 0, false, NULL }
+ };
+#undef SIG
+ const struct map_s *it;
+
+ for (it = exact_match_types; it->sig != NULL; it++) {
+ if (it->len <= len && memcmp(data, it->sig, it->len) == 0 &&
+ (allow_unsafe || it->safe)) {
+ *effective_type = lwc_string_ref(*it->type);
+ return NSERROR_OK;
+ }
+ }
+
+ return NSERROR_NOT_FOUND;
+}
+
+static nserror mimesniff__match_unknown(const uint8_t *data, size_t len,
+ bool allow_unsafe, lwc_string **effective_type)
+{
+ if (mimesniff__match_unknown_exact(data, len, allow_unsafe,
+ effective_type) == NSERROR_OK)
+ return NSERROR_OK;
+
+ if (mimesniff__match_unknown_riff(data, len,
+ effective_type) == NSERROR_OK)
+ return NSERROR_OK;
+
+ if (allow_unsafe == false)
+ return NSERROR_NOT_FOUND;
+
+ if (mimesniff__match_unknown_bom(data, len,
+ effective_type) == NSERROR_OK)
+ return NSERROR_OK;
+
+ if (mimesniff__match_unknown_ws(data, len,
+ effective_type) == NSERROR_OK)
+ return NSERROR_OK;
+
+ return NSERROR_NOT_FOUND;
+}
+
+static nserror mimesniff__compute_unknown(const uint8_t *data, size_t len,
+ lwc_string **effective_type)
+{
+ if (data == NULL)
+ return NSERROR_NEED_DATA;
+
+ len = min(len, 512);
+
+ if (mimesniff__match_unknown(data, len, true,
+ effective_type) == NSERROR_OK)
+ return NSERROR_OK;
+
+ if (mimesniff__has_binary_octets(data, len) == false) {
+ /* No binary octets => text/plain */
+ *effective_type = lwc_string_ref(text_plain);
+ return NSERROR_OK;
+ }
+
+ *effective_type = lwc_string_ref(application_octet_stream);
+
+ return NSERROR_OK;
+}
+
+static nserror mimesniff__compute_text_or_binary(const uint8_t *data,
+ size_t len, lwc_string **effective_type)
+{
+ if (data == NULL)
+ return NSERROR_NEED_DATA;
+
+ len = min(len, 512);
+
+ if (len >= 3 && ((data[0] == 0xfe && data[1] == 0xff) ||
+ (data[0] == 0xff && data[1] == 0xfe) ||
+ (data[0] == 0xef && data[1] == 0xbb &&
+ data[2] == 0xbf))) {
+ /* Found a BOM => text/plain */
+ *effective_type = lwc_string_ref(text_plain);
+ return NSERROR_OK;
+ }
+
+ if (mimesniff__has_binary_octets(data, len) == false) {
+ /* No binary octets => text/plain */
+ *effective_type = lwc_string_ref(text_plain);
+ return NSERROR_OK;
+ }
+
+ if (mimesniff__match_unknown(data, len, false,
+ effective_type) == NSERROR_OK)
+ return NSERROR_OK;
+
+ *effective_type = lwc_string_ref(application_octet_stream);
+
+ return NSERROR_OK;
+}
+
+static nserror mimesniff__compute_image(lwc_string *official_type,
+ const uint8_t *data, size_t len, lwc_string **effective_type)
+{
+#define SIG(t, s) { (const uint8_t *) s, SLEN(s), t }
+ static const struct it_s {
+ const uint8_t *sig;
+ size_t len;
+ lwc_string **type;
+ } image_types[] = {
+ SIG(&image_gif, "GIF87a"),
+ SIG(&image_gif, "GIF89a"),
+ SIG(&image_png, "\x89PNG\r\n\x1a\n"),
+ SIG(&image_jpeg, "\xff\xd8\xff"),
+ SIG(&image_bmp, "BM"),
+ SIG(&image_vnd_microsoft_icon, "\x00\x00\x01\x00"),
+ { NULL, 0, NULL }
+ };
+#undef SIG
+
+ const struct it_s *it;
+
+ if (data == NULL)
+ return NSERROR_NEED_DATA;
+
+ for (it = image_types; it->sig != NULL; it++) {
+ if (it->len <= len && memcmp(data, it->sig, it->len) == 0) {
+ lwc_string_unref(official_type);
+ *effective_type = lwc_string_ref(*it->type);
+ return NSERROR_OK;
+ }
+ }
+
+ /* WebP has a signature that doesn't fit into the above table */
+ if (SLEN("RIFF????WEBPVP") <= len &&
+ memcmp(data, "RIFF", SLEN("RIFF")) == 0 &&
+ memcmp(data + SLEN("RIFF????"),
+ "WEBPVP", SLEN("WEBPVP")) == 0 ) {
+ lwc_string_unref(official_type);
+ *effective_type = lwc_string_ref(image_webp);
+ return NSERROR_OK;
+ }
+
+ *effective_type = official_type;
+
+ return NSERROR_OK;
+}
+
+static nserror mimesniff__compute_feed_or_html(const uint8_t *data,
+ size_t len, lwc_string **effective_type)
+{
+#define RDF_NS "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+#define RSS_NS "http://purl.org/rss/1.0"
+
+ enum state_e {
+ BEFORE_BOM,
+ BEFORE_MARKUP,
+ MARKUP_START,
+ COMMENT_OR_DOCTYPE,
+ IN_COMMENT,
+ IN_DOCTYPE,
+ IN_PI,
+ IN_TAG,
+ IN_RDF
+ } state = BEFORE_BOM;
+
+ bool rdf = false, rss = false;
+ const uint8_t *end;
+
+ if (data == NULL)
+ return NSERROR_NEED_DATA;
+
+ end = data + min(len, 512);
+
+ while (data < end) {
+ const uint8_t c = *data;
+
+#define MATCH(s) SLEN(s) <= (size_t) (end - data) && \
+ memcmp(data, s, SLEN(s)) == 0
+
+ switch (state) {
+ case BEFORE_BOM:
+ if (3 <= end - data && c == 0xef && data[1] == 0xbb &&
+ data[2] == 0xbf) {
+ data += 3;
+ }
+
+ state = BEFORE_MARKUP;
+ break;
+ case BEFORE_MARKUP:
+ if (c == '\t' || c == '\n' || c == '\r' || c == ' ')
+ data++;
+ else if (c != '<')
+ data = end;
+ else {
+ state = MARKUP_START;
+ data++;
+ }
+ break;
+ case MARKUP_START:
+ if (c == '!') {
+ state = COMMENT_OR_DOCTYPE;
+ data++;
+ } else if (c == '?') {
+ state = IN_PI;
+ data++;
+ } else {
+ /* Reconsume input */
+ state = IN_TAG;
+ }
+ break;
+ case COMMENT_OR_DOCTYPE:
+ if (2 <= end - data && c == '-' && data[1] == '-') {
+ state = IN_COMMENT;
+ data += 2;
+ } else {
+ /* Reconsume input */
+ state = IN_DOCTYPE;
+ }
+ break;
+ case IN_COMMENT:
+ if (3 <= end - data && c == '-' && data[1] == '-' &&
+ data[2] == '>') {
+ state = BEFORE_MARKUP;
+ data += 3;
+ } else
+ data++;
+ break;
+ case IN_DOCTYPE:
+ if (c == '>')
+ state = BEFORE_MARKUP;
+ data++;
+ break;
+ case IN_PI:
+ if (2 <= end - data && c == '?' && data[1] == '>') {
+ state = BEFORE_MARKUP;
+ data += 2;
+ } else
+ data++;
+ break;
+ case IN_TAG:
+ if (MATCH("rss")) {
+ *effective_type =
+ lwc_string_ref(application_rss_xml);
+ return NSERROR_OK;
+ } else if (MATCH("feed")) {
+ *effective_type =
+ lwc_string_ref(application_atom_xml);
+ return NSERROR_OK;
+ } else if (MATCH("rdf:RDF")) {
+ state = IN_RDF;
+ data += SLEN("rdf:RDF");
+ } else
+ data = end;
+ break;
+ case IN_RDF:
+ if (MATCH(RSS_NS)) {
+ rss = true;
+ data += SLEN(RSS_NS);
+ } else if (MATCH(RDF_NS)) {
+ rdf = true;
+ data += SLEN(RDF_NS);
+ } else
+ data++;
+
+ if (rdf && rss) {
+ *effective_type =
+ lwc_string_ref(application_rss_xml);
+ return NSERROR_OK;
+ }
+
+ break;
+ }
+#undef MATCH
+ }
+
+ *effective_type = lwc_string_ref(text_html);
+
+ return NSERROR_OK;
+
+#undef RSS_NS
+#undef RDF_NS
+}
+
+/* See mimesniff.h for documentation */
+nserror mimesniff_compute_effective_type(llcache_handle *handle,
+ const uint8_t *data, size_t len, bool sniff_allowed,
+ lwc_string **effective_type)
+{
+#define S(s) { s, SLEN(s) }
+ static const struct tt_s {
+ const char *data;
+ size_t len;
+ } text_types[] = {
+ S("text/plain"),
+ S("text/plain; charset=ISO-8859-1"),
+ S("text/plain; charset=iso-8859-1"),
+ S("text/plain; charset=UTF-8"),
+ { NULL, 0 }
+ };
+#undef S
+
+ const char *content_type_header;
+ size_t content_type_header_len;
+ http_content_type *ct;
+ const struct tt_s *tt;
+ bool match;
+ nserror error;
+
+ content_type_header =
+ llcache_handle_get_header(handle, "Content-Type");
+ if (content_type_header == NULL) {
+ if (sniff_allowed == false)
+ return NSERROR_NOT_FOUND;
+
+ /* No official type => unknown */
+ return mimesniff__compute_unknown(data, len, effective_type);
+ }
+
+ error = http_parse_content_type(content_type_header, &ct);
+ if (error != NSERROR_OK) {
+ if (sniff_allowed == false)
+ return NSERROR_NOT_FOUND;
+
+ /* Unparseable => unknown */
+ return mimesniff__compute_unknown(data, len, effective_type);
+ }
+
+ if (sniff_allowed == false) {
+ *effective_type = lwc_string_ref(ct->media_type);
+ http_content_type_destroy(ct);
+ return NSERROR_OK;
+ }
+
+ content_type_header_len = strlen(content_type_header);
+
+ /* Look for text types */
+ for (tt = text_types; tt->data != NULL; tt++) {
+ if (tt->len == content_type_header_len &&
+ memcmp(tt->data, content_type_header,
+ content_type_header_len) == 0) {
+ http_content_type_destroy(ct);
+ return mimesniff__compute_text_or_binary(data, len,
+ effective_type);
+ }
+ }
+
+ /* unknown/unknown, application/unknown, * / * */
+ if ((lwc_string_caseless_isequal(ct->media_type, unknown_unknown,
+ &match) == lwc_error_ok && match) ||
+ (lwc_string_caseless_isequal(ct->media_type,
+ application_unknown, &match) == lwc_error_ok &&
+ match) ||
+ (lwc_string_caseless_isequal(ct->media_type, any,
+ &match) == lwc_error_ok && match)) {
+ http_content_type_destroy(ct);
+ return mimesniff__compute_unknown(data, len, effective_type);
+ }
+
+ /* +xml */
+ if (lwc_string_length(ct->media_type) > SLEN("+xml") &&
+ strncasecmp(lwc_string_data(ct->media_type) +
+ lwc_string_length(ct->media_type) -
+ SLEN("+xml"),
+ "+xml", SLEN("+xml")) == 0) {
+ /* Use official type */
+ *effective_type = lwc_string_ref(ct->media_type);
+ http_content_type_destroy(ct);
+ return NSERROR_OK;
+ }
+
+ /* text/xml, application/xml */
+ if ((lwc_string_caseless_isequal(ct->media_type, text_xml,
+ &match) == lwc_error_ok && match) ||
+ (lwc_string_caseless_isequal(ct->media_type,
+ application_xml, &match) == lwc_error_ok &&
+ match)) {
+ /* Use official type */
+ *effective_type = lwc_string_ref(ct->media_type);
+ http_content_type_destroy(ct);
+ return NSERROR_OK;
+ }
+
+ /* Image types */
+ if (content_factory_type_from_mime_type(ct->media_type) ==
+ CONTENT_IMAGE) {
+ lwc_string *official_type = lwc_string_ref(ct->media_type);
+ http_content_type_destroy(ct);
+ return mimesniff__compute_image(official_type,
+ data, len, effective_type);
+ }
+
+ /* text/html */
+ if ((lwc_string_caseless_isequal(ct->media_type, text_html,
+ &match) == lwc_error_ok && match)) {
+ http_content_type_destroy(ct);
+ return mimesniff__compute_feed_or_html(data, len,
+ effective_type);
+ }
+
+ /* Use official type */
+ *effective_type = lwc_string_ref(ct->media_type);
+
+ http_content_type_destroy(ct);
+
+ return NSERROR_OK;
+}
+