From 2777a04ed2ba4fd36138b991d66a32a283361f7e Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Thu, 1 May 2008 16:34:46 +0000 Subject: Import parser construction utility library svn path=/trunk/libparserutils/; revision=4111 --- src/charset/encodings/utf8.c | 175 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 src/charset/encodings/utf8.c (limited to 'src/charset/encodings/utf8.c') diff --git a/src/charset/encodings/utf8.c b/src/charset/encodings/utf8.c new file mode 100644 index 0000000..5b4ba95 --- /dev/null +++ b/src/charset/encodings/utf8.c @@ -0,0 +1,175 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +/** \file + * UTF-8 manipulation functions (implementation). + */ + +#include +#include +#include + +#include +#include "charset/encodings/utf8impl.h" + +/** Number of continuation bytes for a given start byte */ +const uint8_t numContinuations[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, +}; + +/** + * Convert a UTF-8 multibyte sequence into a single UCS4 character + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. + * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-8 sequence + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen) +{ + parserutils_error error; + + UTF8_TO_UCS4(s, len, ucs4, clen, error); + + return error; +} + +/** + * Convert a single UCS4 character into a UTF-8 multibyte sequence + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. + * + * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) + * \param s Pointer to pointer to output buffer, updated on exit + * \param len Pointer to length, in bytes, of output buffer, updated on exit + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, + uint8_t **s, size_t *len) +{ + parserutils_error error; + + UTF8_FROM_UCS4(ucs4, s, len, error); + + return error; +} + +/** + * Calculate the length (in characters) of a bounded UTF-8 string + * + * \param s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max, + size_t *len) +{ + parserutils_error error; + + UTF8_LENGTH(s, max, len, error); + + return error; +} + +/** + * Calculate the length (in bytes) of a UTF-8 character + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s, + size_t *len) +{ + parserutils_error error; + + UTF8_CHAR_BYTE_LENGTH(s, len, error); + + return error; +} + +/** + * Find previous legal UTF-8 char in string + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff) +{ + parserutils_error error; + + UTF8_PREV(s, off, prevoff, error); + + return error; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed valid) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + parserutils_error error; + + UTF8_NEXT(s, len, off, nextoff, error); + + return error; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s, + uint32_t len, uint32_t off, uint32_t *nextoff) +{ + parserutils_error error; + + UTF8_NEXT_PARANOID(s, len, off, nextoff, error); + + return error; +} + -- cgit v1.2.3