diff options
Diffstat (limited to 'src/core/string.c')
-rw-r--r-- | src/core/string.c | 93 |
1 files changed, 80 insertions, 13 deletions
diff --git a/src/core/string.c b/src/core/string.c index d43c571..faa3c85 100644 --- a/src/core/string.c +++ b/src/core/string.c @@ -5,6 +5,7 @@ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> */ +#include <ctype.h> #include <inttypes.h> #include <string.h> @@ -12,6 +13,8 @@ #include "core/document.h" #include "utils/utils.h" +#include "utils/utf8.h" +#include "utils/utf16.h" /** * A DOM string @@ -28,6 +31,8 @@ struct dom_string { DOM_STRING_PTR_NODOC } type; /**< String type */ + dom_string_charset charset; /**< Charset of string */ + union { uint8_t *ptr; const uint8_t *cptr; @@ -49,7 +54,8 @@ struct dom_string { }; static struct dom_string empty_string = { - .type = DOM_STRING_CONST_PTR, + .type = DOM_STRING_CONST_PTR, + .charset = DOM_STRING_UTF8, .data.ptr = NULL, .len = 0, .ctx.doc = NULL, @@ -116,6 +122,8 @@ dom_exception dom_string_create_from_off(struct dom_document *doc, ret->type = DOM_STRING_OFFSET; + ret->charset = dom_document_get_charset(doc); + ret->data.offset = off; ret->len = len; @@ -161,6 +169,8 @@ dom_exception dom_string_create_from_ptr(struct dom_document *doc, ret->type = DOM_STRING_PTR; + ret->charset = dom_document_get_charset(doc); + memcpy(ret->data.ptr, ptr, len); ret->len = len; @@ -200,6 +210,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, ret->type = DOM_STRING_CONST_PTR; + ret->charset = dom_document_get_charset(doc); + ret->data.cptr = ptr; ret->len = len; @@ -217,11 +229,12 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, * Create a DOM string from a string of characters that does not belong * to a document * - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data - * \param ptr Pointer to string of characters - * \param len Length, in bytes, of string of characters - * \param str Pointer to location to receive result + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + * \param charset The charset of the string + * \param ptr Pointer to string of characters + * \param len Length, in bytes, of string of characters + * \param str Pointer to location to receive result * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will already be referenced, so there is no need @@ -231,7 +244,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, * returned DOM string. */ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, - const uint8_t *ptr, size_t len, struct dom_string **str) + dom_string_charset charset, const uint8_t *ptr, size_t len, + struct dom_string **str) { struct dom_string *ret; @@ -247,6 +261,8 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, ret->type = DOM_STRING_PTR_NODOC; + ret->charset = charset; + memcpy(ret->data.ptr, ptr, len); ret->len = len; @@ -324,10 +340,35 @@ int dom_string_cmp(struct dom_string *s1, struct dom_string *s2) if (err != DOM_NO_ERR) return 1; /* arbitrary */ - if (l1 != l2) - return 1; /* arbitrary */ + while (l1 > 0 && l2 > 0) { + uint32_t c1, c2; + size_t cl1, cl2; + charset_error err; + + err = (s1->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) + : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); + if (err != CHARSET_OK) { + } + + err = (s2->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2) + : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); + if (err != CHARSET_OK) { + } + + if (c1 != c2) { + return (int)(c1 - c2); + } - return strncmp((const char *) d1, (const char *) d2, l1); + d1 += cl1; + d2 += cl2; + + l1 -= cl1; + l2 -= cl2; + } + + return (int)(l1 - l2); } /** @@ -354,9 +395,35 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2) if (err != DOM_NO_ERR) return 1; /* arbitrary */ - if (l1 != l2) - return 1; /* arbitrary */ + while (l1 > 0 && l2 > 0) { + uint32_t c1, c2; + size_t cl1, cl2; + charset_error err; + + err = (s1->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) + : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); + if (err != CHARSET_OK) { + } + + err = (s2->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2) + : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); + if (err != CHARSET_OK) { + } + + /** \todo improved lower-casing algorithm */ + if (tolower(c1) != tolower(c2)) { + return (int)(tolower(c1) - tolower(c2)); + } + + d1 += cl1; + d2 += cl2; + + l1 -= cl1; + l2 -= cl2; + } - return strncasecmp((const char *) d1, (const char *) d2, l1); + return (int)(l1 - l2); } |