From e0e38d906c8974bb22a0368a9709af9590362927 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sun, 30 Sep 2007 21:10:50 +0000 Subject: DOM Strings are now capable of containing either UTF-8 or UTF-16 encoded data. The charset used for strings within a document is specified at document creation time. Whilst it is possible to mix charsets within a document, it's not recommended. Things that need fixing: + dom_string_get_data() doesn't return the charset. Better would be to permit the client to request a charset for the data to be returned in. + Interned node name strings will break if the document is UTF-16 (dom_document_create()). In fact, these could quite happily be globals, rather than allocating a set for each document. + Other uses of the DOM string constructors need checking for sanity + DOM Strings need to gain more utility APIs (such as getting the character length of a string, string concatenation etc.). svn path=/trunk/dom/; revision=3614 --- src/core/document.c | 44 +++++++++++++++++++--- src/core/document.h | 5 ++- src/core/implementation.c | 4 +- src/core/string.c | 93 ++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 126 insertions(+), 20 deletions(-) (limited to 'src/core') diff --git a/src/core/document.c b/src/core/document.c index 5148224..e188868 100644 --- a/src/core/document.c +++ b/src/core/document.c @@ -56,6 +56,8 @@ struct dom_doc_nnm { struct dom_document { struct dom_node base; /**< Base node */ + dom_string_charset charset; /**< Charset of strings in document */ + struct dom_implementation *impl; /**< Owning implementation */ struct dom_doc_nl *nodelists; /**< List of active nodelists */ @@ -73,10 +75,11 @@ struct dom_document { /** * Create a Document * - * \param impl The DOM implementation owning the document - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data - * \param doc Pointer to location to receive created document + * \param impl The DOM implementation owning the 
document + * \param charset The charset used for strings in the document + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + * \param doc Pointer to location to receive created document * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion. * * ::impl will have its reference count increased. @@ -84,7 +87,8 @@ struct dom_document { * The returned document will already be referenced. */ dom_exception dom_document_create(struct dom_implementation *impl, - dom_alloc alloc, void *pw, struct dom_document **doc) + dom_string_charset charset, dom_alloc alloc, void *pw, + struct dom_document **doc) { static const char *names[DOM_NODE_TYPE_COUNT + 1] = { NULL, /* Unused */ @@ -110,6 +114,7 @@ dom_exception dom_document_create(struct dom_implementation *impl, return DOM_NO_MEM_ERR; /* Set up document allocation context - must be first */ + d->charset = charset; d->alloc = alloc; d->pw = pw; @@ -993,6 +998,35 @@ const uint8_t *dom_document_get_base(struct dom_document *doc) return NULL; } +/** + * Set the document buffer pointer + * + * \param doc Document to set buffer pointer of + * \param buffer Pointer to buffer + * \param buffer_len Length of buffer, in bytes + * + * By calling this, ownership of the buffer is transferred to the document. + * It should be called once per document node. 
+ */ +void dom_document_set_buffer(struct dom_document *doc, uint8_t *buffer, + size_t buffer_len) +{ + UNUSED(doc); + UNUSED(buffer); + UNUSED(buffer_len); +} + +/** + * Retrieve the character set used to encode strings in the document + * + * \param doc The document to get the charset of + * \return The charset in use + */ +dom_string_charset dom_document_get_charset(struct dom_document *doc) +{ + return doc->charset; +} + /** * (De)allocate memory with a document's context * diff --git a/src/core/document.h b/src/core/document.h index 367b1ec..5149f2e 100644 --- a/src/core/document.h +++ b/src/core/document.h @@ -12,12 +12,12 @@ #include #include +#include struct dom_document; struct dom_namednodemap; struct dom_node; struct dom_nodelist; -struct dom_string; /* Destroy a document */ void dom_document_destroy(struct dom_document *doc); @@ -25,6 +25,9 @@ void dom_document_destroy(struct dom_document *doc); /* Get base of document buffer */ const uint8_t *dom_document_get_base(struct dom_document *doc); +/* Get the document character set */ +dom_string_charset dom_document_get_charset(struct dom_document *doc); + /* (De)allocate memory */ void *dom_document_alloc(struct dom_document *doc, void *ptr, size_t size); diff --git a/src/core/implementation.c b/src/core/implementation.c index e37b27d..9738b7c 100644 --- a/src/core/implementation.c +++ b/src/core/implementation.c @@ -94,6 +94,7 @@ dom_exception dom_implementation_create_document_type( * \param qname The qualified name of the document element * \param doctype The type of document to create * \param doc Pointer to location to receive result + * \param charset The charset to use for strings in the document * \param alloc Memory (de)allocation function * \param pw Pointer to client-specific private data * \return DOM_NO_ERR on success, @@ -126,10 +127,11 @@ dom_exception dom_implementation_create_document( struct dom_string *namespace, struct dom_string *qname, struct dom_document_type *doctype, struct 
dom_document **doc, + dom_string_charset charset, dom_alloc alloc, void *pw) { return impl->create_document(impl, namespace, qname, doctype, doc, - alloc, pw); + charset, alloc, pw); } /** diff --git a/src/core/string.c b/src/core/string.c index d43c571..faa3c85 100644 --- a/src/core/string.c +++ b/src/core/string.c @@ -5,6 +5,7 @@ * Copyright 2007 John-Mark Bell */ +#include #include #include @@ -12,6 +13,8 @@ #include "core/document.h" #include "utils/utils.h" +#include "utils/utf8.h" +#include "utils/utf16.h" /** * A DOM string @@ -28,6 +31,8 @@ struct dom_string { DOM_STRING_PTR_NODOC } type; /**< String type */ + dom_string_charset charset; /**< Charset of string */ + union { uint8_t *ptr; const uint8_t *cptr; @@ -49,7 +54,8 @@ struct dom_string { }; static struct dom_string empty_string = { - .type = DOM_STRING_CONST_PTR, + .type = DOM_STRING_CONST_PTR, + .charset = DOM_STRING_UTF8, .data.ptr = NULL, .len = 0, .ctx.doc = NULL, @@ -116,6 +122,8 @@ dom_exception dom_string_create_from_off(struct dom_document *doc, ret->type = DOM_STRING_OFFSET; + ret->charset = dom_document_get_charset(doc); + ret->data.offset = off; ret->len = len; @@ -161,6 +169,8 @@ dom_exception dom_string_create_from_ptr(struct dom_document *doc, ret->type = DOM_STRING_PTR; + ret->charset = dom_document_get_charset(doc); + memcpy(ret->data.ptr, ptr, len); ret->len = len; @@ -200,6 +210,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, ret->type = DOM_STRING_CONST_PTR; + ret->charset = dom_document_get_charset(doc); + ret->data.cptr = ptr; ret->len = len; @@ -217,11 +229,12 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, * Create a DOM string from a string of characters that does not belong * to a document * - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data - * \param ptr Pointer to string of characters - * \param len Length, in bytes, of string of characters - * \param str Pointer to 
location to receive result + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + * \param charset The charset of the string + * \param ptr Pointer to string of characters + * \param len Length, in bytes, of string of characters + * \param str Pointer to location to receive result * \return DOM_NO_ERR on success, DOM_NO_MEM_ERR on memory exhaustion * * The returned string will already be referenced, so there is no need @@ -231,7 +244,8 @@ dom_exception dom_string_create_from_const_ptr(struct dom_document *doc, * returned DOM string. */ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, - const uint8_t *ptr, size_t len, struct dom_string **str) + dom_string_charset charset, const uint8_t *ptr, size_t len, + struct dom_string **str) { struct dom_string *ret; @@ -247,6 +261,8 @@ dom_exception dom_string_create_from_ptr_no_doc(dom_alloc alloc, void *pw, ret->type = DOM_STRING_PTR_NODOC; + ret->charset = charset; + memcpy(ret->data.ptr, ptr, len); ret->len = len; @@ -324,10 +340,35 @@ int dom_string_cmp(struct dom_string *s1, struct dom_string *s2) if (err != DOM_NO_ERR) return 1; /* arbitrary */ - if (l1 != l2) - return 1; /* arbitrary */ + while (l1 > 0 && l2 > 0) { + uint32_t c1, c2; + size_t cl1, cl2; + charset_error err; + + err = (s1->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) + : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); + if (err != CHARSET_OK) { + } + + err = (s2->charset == DOM_STRING_UTF8) + ? 
_dom_utf8_to_ucs4(d2, l2, &c2, &cl2) + : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); + if (err != CHARSET_OK) { + } + + if (c1 != c2) { + return (int)(c1 - c2); + } - return strncmp((const char *) d1, (const char *) d2, l1); + d1 += cl1; + d2 += cl2; + + l1 -= cl1; + l2 -= cl2; + } + + return (int)(l1 - l2); } /** @@ -354,9 +395,35 @@ int dom_string_icmp(struct dom_string *s1, struct dom_string *s2) if (err != DOM_NO_ERR) return 1; /* arbitrary */ - if (l1 != l2) - return 1; /* arbitrary */ + while (l1 > 0 && l2 > 0) { + uint32_t c1, c2; + size_t cl1, cl2; + charset_error err; + + err = (s1->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d1, l1, &c1, &cl1) + : _dom_utf16_to_ucs4(d1, l1, &c1, &cl1); + if (err != CHARSET_OK) { + } + + err = (s2->charset == DOM_STRING_UTF8) + ? _dom_utf8_to_ucs4(d2, l2, &c2, &cl2) + : _dom_utf16_to_ucs4(d2, l2, &c2, &cl2); + if (err != CHARSET_OK) { + } + + /** \todo improved lower-casing algorithm */ + if (tolower(c1) != tolower(c2)) { + return (int)(tolower(c1) - tolower(c2)); + } + + d1 += cl1; + d2 += cl2; + + l1 -= cl1; + l2 -= cl2; + } - return strncasecmp((const char *) d1, (const char *) d2, l1); + return (int)(l1 - l2); } -- cgit v1.2.3