/*
 * path: root/src/charset/codec.h
 * blob: 4cd94d8257bc4d2a7270f25a6611f99244192400
 */
/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 */

#ifndef hubbub_charset_codec_h_
#define hubbub_charset_codec_h_

#include <inttypes.h>

#include <hubbub/errors.h>
#include <hubbub/functypes.h>

/**
 * Charset codec object.
 *
 * Only declared here, not defined, so code that sees just this header
 * can only refer to a codec through a pointer.
 */
typedef struct hubbub_charsetcodec hubbub_charsetcodec;

/**
 * Value passed to a codec filter function (hubbub_charsetcodec_filter) in
 * place of a UCS4 character to ask the filter to reset.
 */
#define HUBBUB_CHARSETCODEC_NULL (0xffffffffU)

/**
 * Type of charset codec filter function
 *
 * \param c          UCS4 character (in host byte order) or
 *                   HUBBUB_CHARSETCODEC_NULL to reset
 * \param output     Pointer to location to receive the address of the
 *                   filter's output buffer
 * \param outputlen  Pointer to location to receive the length of the
 *                   output buffer, in UCS4 characters
 * \param pw         Pointer to client-specific private data
 * \return HUBBUB_OK on success, or appropriate error otherwise.
 *
 * The output buffer is owned by the filter code and will not be freed by
 * any charset codec. It should contain the replacement UCS4 character(s)
 * for the input, in host byte order. Whatever *output and *outputlen hold
 * on entry is ignored; the filter code fills both in.
 *
 * A filter may replace the input character with no output at all. To do
 * so it should set *output to NULL and *outputlen to 0, and return
 * HUBBUB_OK.
 *
 * The output length counts UCS4 characters, not bytes. Writing output and
 * outputlen for the values the filter stored through its pointer
 * parameters, the loop
 *
 * for (size_t i = 0; i < outputlen; i++) {
 *   dest[curchar++] = output[i];
 * }
 *
 * would copy the contents of the filter output buffer to the codec's output
 * buffer.
 */
typedef hubbub_error (*hubbub_charsetcodec_filter)(uint32_t c,
		uint32_t **output, size_t *outputlen, void *pw);

/**
 * Error handling policy of a charset codec
 *
 * The error mode governs what a codec does when it meets either of:
 *
 * + a character that is unrepresentable in the destination charset (when
 *   encoding) or that cannot be converted to UCS4 (when decoding);
 * + an invalid byte sequence (when encoding or decoding).
 *
 * Three policies are available:
 *
 * + "strict":   draconian; stop processing.
 * + "loose":    replace the unrepresentable character with something else.
 * + "translit": attempt to transliterate, or replace if unable.
 *
 * "loose" is the default error mode.
 *
 *
 * Replacement character used in the "loose" case:
 *
 *     decoding:
 *         U+FFFD (REPLACEMENT CHARACTER)
 *     encoding, destination charset is UTF-(8|16|32):
 *         U+FFFD (REPLACEMENT CHARACTER)
 *     encoding, any other destination charset:
 *         U+003F (QUESTION MARK)
 *
 *
 * In the "translit" case the codec only attempts transliteration (into
 * the destination charset) when encoding. When decoding, or when
 * transliteration fails, "translit" is identical to "loose".
 */
enum hubbub_charsetcodec_errormode {
	HUBBUB_CHARSETCODEC_ERROR_STRICT   = 0,	/**< Abort processing when an
						 *   unrepresentable character
						 *   is encountered */
	HUBBUB_CHARSETCODEC_ERROR_LOOSE    = 1,	/**< Substitute a single
						 *   alternate for each
						 *   unrepresentable character */
	HUBBUB_CHARSETCODEC_ERROR_TRANSLIT = 2	/**< Transliterate
						 *   unrepresentable characters
						 *   where possible */
};

typedef enum hubbub_charsetcodec_errormode hubbub_charsetcodec_errormode;

/**
 * Kinds of option that can be set on a charset codec
 */
enum hubbub_charsetcodec_opttype {
	HUBBUB_CHARSETCODEC_FILTER_FUNC = 0,	/**< Register a codec filter
						 *   function */
	HUBBUB_CHARSETCODEC_ERROR_MODE  = 1	/**< Set the codec's error
						 *   mode */
};

typedef enum hubbub_charsetcodec_opttype hubbub_charsetcodec_opttype;

/**
 * Charset codec option parameters
 *
 * A union with one member per hubbub_charsetcodec_opttype value:
 * filter_func for HUBBUB_CHARSETCODEC_FILTER_FUNC and error_mode for
 * HUBBUB_CHARSETCODEC_ERROR_MODE. Being a union, only one member holds
 * meaningful data at a time.
 */
typedef union hubbub_charsetcodec_optparams {
	/** Parameters for filter function setting */
	struct {
		/** Filter function to register */
		hubbub_charsetcodec_filter filter;
		/** Client-specific private data (the filter function type
		 *  takes a matching pw argument) */
		void *pw;
	} filter_func;

	/** Parameters for error mode setting */
	struct {
		/** The desired error handling mode */
		hubbub_charsetcodec_errormode mode;
	} error_mode;
} hubbub_charsetcodec_optparams;


/**
 * Create a charset codec
 *
 * \param charset  Charset to create the codec for (presumably a charset
 *                 name string -- confirm accepted names against the
 *                 implementation)
 * \param alloc    Allocation function; hubbub_alloc is declared outside
 *                 this header
 * \param pw       Private data pointer accompanying alloc (presumably
 *                 handed back to it -- confirm)
 * \return Pointer to a codec. How failure is reported is not visible in
 *         this header (presumably NULL -- confirm).
 */
hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
		hubbub_alloc alloc, void *pw);
/**
 * Destroy a charset codec
 *
 * \param codec  The codec to destroy
 */
void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec);

/**
 * Configure a charset codec
 *
 * \param codec   The codec to configure
 * \param type    Which option to set
 * \param params  Option-specific parameters; the union has one member
 *                per option type (filter_func, error_mode)
 * \return A hubbub_error value (presumably HUBBUB_OK on success --
 *         confirm against the implementation)
 */
hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
		hubbub_charsetcodec_opttype type,
		hubbub_charsetcodec_optparams *params);

/**
 * Encode a chunk of UCS4 data into a codec's charset
 *
 * \param codec      The codec to use
 * \param source     Pointer to pointer to the UCS4 source data, typed
 *                   here as bytes
 * \param sourcelen  Pointer to the length of the source data
 * \param dest       Pointer to pointer to the destination buffer
 * \param destlen    Pointer to the length of the destination buffer
 * \return A hubbub_error value (presumably HUBBUB_OK on success --
 *         confirm against the implementation)
 *
 * NOTE(review): the buffers and lengths are passed by pointer, presumably
 * so the codec can update them as input is consumed and output produced;
 * the length units (presumably bytes) are not stated here -- confirm.
 */
hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen);

/**
 * Decode a chunk of data in a codec's charset into UCS4
 *
 * \param codec      The codec to use
 * \param source     Pointer to pointer to the source data, in the
 *                   codec's charset
 * \param sourcelen  Pointer to the length of the source data
 * \param dest       Pointer to pointer to the destination buffer, which
 *                   receives UCS4 data, typed here as bytes
 * \param destlen    Pointer to the length of the destination buffer
 * \return A hubbub_error value (presumably HUBBUB_OK on success --
 *         confirm against the implementation)
 *
 * NOTE(review): the buffers and lengths are passed by pointer, presumably
 * so the codec can update them as input is consumed and output produced;
 * the length units (presumably bytes) are not stated here -- confirm.
 */
hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
		const uint8_t **source, size_t *sourcelen,
		uint8_t **dest, size_t *destlen);

/**
 * Reset a charset codec
 *
 * \param codec  The codec to reset
 * \return A hubbub_error value (presumably HUBBUB_OK on success --
 *         confirm against the implementation)
 */
hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec);

#endif