summaryrefslogtreecommitdiff
path: root/src/input/inputstream.c
blob: f678e6606204355e7dff51c95e064151d5479f9a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
/*
 * This file is part of LibParserUtils.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 */

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#include <parserutils/charset/mibenum.h>
#include <parserutils/charset/utf8.h>
#include <parserutils/input/inputstream.h>

#include "input/filter.h"
#include "utils/utils.h"

/**
 * Private input stream definition
 *
 * The public structure is embedded as the first member; the functions in 
 * this file convert between the public and private views of a stream by 
 * casting the pointer, so the member order must not change.
 */
typedef struct parserutils_inputstream_private {
	parserutils_inputstream public;	/**< Public part. Must be first */

	parserutils_buffer *raw;	/**< Buffer containing raw data, i.e.
					 * bytes appended by the client that
					 * have not yet been converted */

	bool done_first_chunk;		/**< Whether the first chunk has 
					 * been processed (charset detection
					 * and BOM stripping happen once, on
					 * the first refill) */

	uint16_t mibenum;		/**< MIB enum for charset, or 0 if
					 * no charset is known (yet) */
	uint32_t encsrc;		/**< Charset source; 0 is the lowest
					 * priority (default fallback) */

	parserutils_filter *input;	/**< Charset conversion filter */

	parserutils_charset_detect_func csdetect; /**< Charset detection func.,
						   * or NULL */

	parserutils_alloc alloc;	/**< Memory (de)allocation function */
	void *pw;			/**< Client private data, passed back
					 * to alloc on every call */
} parserutils_inputstream_private;

/* Internal helpers, declared here because they are used before their 
 * definitions, which appear later in this file:
 *  - refill_buffer converts pending raw data into the stream's UTF-8 buffer
 *  - strip_bom removes a leading byte order mark from a buffer */
static inline parserutils_error parserutils_inputstream_refill_buffer(
		parserutils_inputstream_private *stream);
static inline parserutils_error parserutils_inputstream_strip_bom(
		uint16_t mibenum, parserutils_buffer *buffer);

/**
 * Create an input stream
 *
 * \param enc       Document charset, or NULL to autodetect
 * \param encsrc    Value for encoding source, if specified, or 0
 * \param csdetect  Charset detection function, or NULL
 * \param alloc     Memory (de)allocation function
 * \param pw        Pointer to client-specific private data (may be NULL)
 * \param stream    Pointer to location to receive stream instance
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM on bad parameters,
 *         PARSERUTILS_NOMEM on memory exhaustion,
 *         PARSERUTILS_BADENCODING on unsupported encoding
 *
 * The value 0 is defined as being the lowest priority encoding source 
 * (i.e. the default fallback encoding). Beyond this, no further 
 * interpretation is made upon the encoding source.
 *
 * If enc is given but its name does not map to a MIB enum, the stream is 
 * created as though no charset had been specified: both the charset and 
 * its source are recorded as 0, and encsrc is ignored.
 *
 * Errors reported by buffer or filter creation are returned unchanged. 
 * On any failure, everything allocated so far is released and *stream is 
 * left untouched.
 */
parserutils_error parserutils_inputstream_create(const char *enc,
		uint32_t encsrc, parserutils_charset_detect_func csdetect,
		parserutils_alloc alloc, void *pw,
		parserutils_inputstream **stream)
{
	parserutils_inputstream_private *s;
	parserutils_error error;

	if (alloc == NULL || stream == NULL)
		return PARSERUTILS_BADPARM;

	s = alloc(NULL, sizeof(*s), pw);
	if (s == NULL)
		return PARSERUTILS_NOMEM;

	error = parserutils_buffer_create(alloc, pw, &s->raw);
	if (error != PARSERUTILS_OK)
		goto cleanup_stream;

	error = parserutils_buffer_create(alloc, pw, &s->public.utf8);
	if (error != PARSERUTILS_OK)
		goto cleanup_raw;

	s->public.cursor = 0;
	s->public.had_eof = false;
	s->done_first_chunk = false;

	error = parserutils_filter_create("UTF-8", alloc, pw, &s->input);
	if (error != PARSERUTILS_OK)
		goto cleanup_utf8;

	/* Start out with no charset information. This guarantees that 
	 * encsrc always holds a defined value, even when enc is supplied 
	 * but turns out not to be a recognised charset name. */
	s->mibenum = 0;
	s->encsrc = 0;

	if (enc != NULL) {
		s->mibenum = 
			parserutils_charset_mibenum_from_name(enc, strlen(enc));

		if (s->mibenum != 0) {
			parserutils_filter_optparams params;

			params.encoding.name = enc;

			error = parserutils_filter_setopt(s->input,
					PARSERUTILS_FILTER_SET_ENCODING, 
					&params);
			if (error != PARSERUTILS_OK)
				goto cleanup_filter;

			/* Only a recognised charset carries the 
			 * client's source value */
			s->encsrc = encsrc;
		}
	}

	s->csdetect = csdetect;

	s->alloc = alloc;
	s->pw = pw;

	*stream = (parserutils_inputstream *) s;

	return PARSERUTILS_OK;

	/* Unwind in reverse order of construction */
cleanup_filter:
	parserutils_filter_destroy(s->input);
cleanup_utf8:
	parserutils_buffer_destroy(s->public.utf8);
cleanup_raw:
	parserutils_buffer_destroy(s->raw);
cleanup_stream:
	/* A zero-sized request to the allocator releases the block */
	alloc(s, 0, pw);

	return error;
}

/**
 * Destroy an input stream
 *
 * \param stream  Input stream to destroy
 * \return PARSERUTILS_OK on success,
 *         PARSERUTILS_BADPARM if stream is NULL
 *
 * Releases the conversion filter, both buffers and, finally, the stream 
 * object itself. The stream must not be used after this call.
 */
parserutils_error parserutils_inputstream_destroy(
		parserutils_inputstream *stream)
{
	parserutils_inputstream_private *priv;

	if (stream == NULL)
		return PARSERUTILS_BADPARM;

	priv = (parserutils_inputstream_private *) stream;

	/* Tear down the members first ... */
	parserutils_filter_destroy(priv->input);
	parserutils_buffer_destroy(priv->public.utf8);
	parserutils_buffer_destroy(priv->raw);

	/* ... and the object itself last, as the allocator and its 
	 * client data are read from it */
	priv->alloc(priv, 0, priv->pw);

	return PARSERUTILS_OK;
}

/**
 * Append data to an input stream
 *
 * \param stream  Input stream to append data to
 * \param data    Data to append (in document charset), or NULL to flag EOF
 * \param len     Length, in bytes, of data
 * \return PARSERUTILS_OK on success, 
 *         PARSERUTILS_BADPARM if stream is NULL,
 *         otherwise the result of appending to the raw buffer
 *
 * When data is NULL, len is ignored and the stream is merely marked as 
 * having received all of its input.
 */
parserutils_error parserutils_inputstream_append(
		parserutils_inputstream *stream, 
		const uint8_t *data, size_t len)
{
	parserutils_inputstream_private *priv;

	if (stream == NULL)
		return PARSERUTILS_BADPARM;

	priv = (parserutils_inputstream_private *) stream;

	if (data != NULL)
		return parserutils_buffer_append(priv->raw, data, len);

	/* No data: this is the client's end-of-input notification */
	priv->public.had_eof = true;

	return PARSERUTILS_OK;
}

/**
 * Insert data into stream at current location
 *
 * \param stream  Input stream to insert into
 * \param data    Data to insert (UTF-8 encoded)
 * \param len     Length, in bytes, of data
 * \return PARSERUTILS_OK on success, 
 *         PARSERUTILS_BADPARM if stream or data is NULL,
 *         otherwise the result of the buffer insertion
 *
 * The data goes straight into the UTF-8 buffer at the cursor position; 
 * it does not pass through the charset conversion filter.
 */
parserutils_error parserutils_inputstream_insert(
		parserutils_inputstream *stream,
		const uint8_t *data, size_t len)
{
	parserutils_inputstream_private *priv;

	if (stream == NULL || data == NULL)
		return PARSERUTILS_BADPARM;

	priv = (parserutils_inputstream_private *) stream;

	return parserutils_buffer_insert(priv->public.utf8, 
			priv->public.cursor, data, len);
}

/* True if the byte x is a single-byte (7-bit ASCII) UTF-8 sequence */
#define IS_ASCII(x) (((x) & 0x80) == 0)

/**
 * Look at the character in the stream that starts at 
 * offset bytes from the cursor (slow version)
 *
 * \param stream  Stream to look in
 * \param offset  Byte offset of start of character
 * \param ptr     Pointer to location to receive pointer to character data
 * \param length  Pointer to location to receive character length (in bytes)
 * \return PARSERUTILS_OK on success, 
 *                    _NEEDDATA on reaching the end of available input,
 *                    _EOF on reaching the end of all input,
 *                    _BADENCODING if the input cannot be decoded,
 *                    _NOMEM on memory exhaustion,
 *                    _BADPARM if bad parameters are passed.
 *
 * This path always attempts to convert more raw data first: if the raw 
 * buffer is empty it reports _EOF/_NEEDDATA straight away, without 
 * examining data that has already been converted.
 *
 * A successful refill moves the unread data to the start of the UTF-8 
 * buffer and resets the cursor to 0, so offset stays relative to the 
 * same character.
 *
 * Once the character pointed to by the result of this call has been advanced
 * past (i.e. parserutils_inputstream_advance has caused the stream cursor to 
 * pass over the character), then no guarantee is made as to the validity of 
 * the data pointed to. Thus, any attempt to dereference the pointer after 
 * advancing past the data it points to is a bug.
 */
parserutils_error parserutils_inputstream_peek_slow(
		parserutils_inputstream *stream, 
		size_t offset, const uint8_t **ptr, size_t *length)
{
	parserutils_inputstream_private *s = 
			(parserutils_inputstream_private *) stream;
	parserutils_error error = PARSERUTILS_OK;
	size_t len;
	size_t pos;

	if (stream == NULL || ptr == NULL || length == NULL)
		return PARSERUTILS_BADPARM;

	/* There's insufficient data in the buffer, so read some more */
	if (s->raw->length == 0) {
		/* No more data to be had */
		return s->public.had_eof ? PARSERUTILS_EOF
					 : PARSERUTILS_NEEDDATA;
	}

	/* Refill utf8 buffer from raw buffer */
	error = parserutils_inputstream_refill_buffer(s);
	if (error != PARSERUTILS_OK)
		return error;

	/* Refill may have succeeded, but not actually produced enough new 
	 * data to reach the requested offset. This must be a range check 
	 * rather than an equality test: the refill can leave the buffer 
	 * shorter than offset, and reading there would run past the valid 
	 * data. */
	pos = s->public.cursor + offset;
	if (pos >= s->public.utf8->length)
		return PARSERUTILS_NEEDDATA;

	/* Now try the read */
	if (IS_ASCII(s->public.utf8->data[pos])) {
		len = 1;
	} else {
		error = parserutils_charset_utf8_char_byte_length(
			s->public.utf8->data + pos, &len);

		if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA)
			return error;

		if (error == PARSERUTILS_NEEDDATA) {
			return s->public.had_eof ? PARSERUTILS_EOF
						 : PARSERUTILS_NEEDDATA;
		}
	}

	(*length) = len;
	(*ptr) = (s->public.utf8->data + pos);

	return PARSERUTILS_OK;
}

#undef IS_ASCII

/**
 * Read the source charset of the input stream
 *
 * \param stream  Input stream to query
 * \param source  Pointer to location to receive charset source identifier
 * \return Pointer to charset name (constant; do not free), 
 *         or NULL if stream or source is NULL
 *
 * A stream whose charset source is 0 always reports "UTF-8"; otherwise 
 * the name is derived from the stream's MIB enum.
 */
const char *parserutils_inputstream_read_charset(
		parserutils_inputstream *stream, uint32_t *source)
{
	const parserutils_inputstream_private *priv;

	if (stream == NULL || source == NULL)
		return NULL;

	priv = (const parserutils_inputstream_private *) stream;

	*source = priv->encsrc;

	return (priv->encsrc != 0)
			? parserutils_charset_mibenum_to_name(priv->mibenum)
			: "UTF-8";
}

/******************************************************************************
 ******************************************************************************/

/**
 * Refill the UTF-8 buffer from the raw buffer
 *
 * \param stream  The inputstream to operate on
 * \return PARSERUTILS_OK on success, appropriate error otherwise
 *
 * On the first call for a stream, the charset is settled (detection 
 * routine, then any charset already recorded on the stream, then UTF-8 
 * as the last resort), the conversion filter is configured for it, and 
 * any BOM is stripped from the raw data.
 *
 * Unread UTF-8 data is moved to the start of the buffer and the cursor is 
 * reset to 0, so byte offsets relative to the cursor remain valid. 
 * Pointers into the buffer obtained before the call do not.
 */
parserutils_error parserutils_inputstream_refill_buffer(
		parserutils_inputstream_private *stream)
{
	const uint8_t *raw;
	uint8_t *utf8;
	size_t raw_length, utf8_space;
	parserutils_error error;

	/* If this is the first chunk of data, we must detect the charset and
	 * strip the BOM, if one exists */
	if (!stream->done_first_chunk) {
		parserutils_filter_optparams params;

		/* Let the detection routine, if any, inspect the data. It 
		 * receives the charset currently recorded on the stream 
		 * and may replace it. */
		if (stream->csdetect != NULL) {
			error = stream->csdetect(stream->raw->data, 
				stream->raw->length,
				&stream->mibenum, &stream->encsrc);
			if (error != PARSERUTILS_OK)
				return error;
		}

		/* Default to UTF-8 only if there is still no charset 
		 * information. A charset already recorded on the stream 
		 * must survive when there is no detection routine. */
		if (stream->mibenum == 0) {
			stream->mibenum = 
				parserutils_charset_mibenum_from_name("UTF-8", 
					SLEN("UTF-8"));
			stream->encsrc = 0;
		}

		/* UTF-8 itself is unknown to the charset tables: there 
		 * is nothing sensible left to do */
		if (stream->mibenum == 0)
			abort();

		/* Ensure filter is using the correct encoding */
		params.encoding.name = 
			parserutils_charset_mibenum_to_name(stream->mibenum);

		error = parserutils_filter_setopt(stream->input,
				PARSERUTILS_FILTER_SET_ENCODING, 
				&params);
		if (error != PARSERUTILS_OK)
			return error;

		error = parserutils_inputstream_strip_bom(stream->mibenum, 
				stream->raw);
		if (error != PARSERUTILS_OK)
			return error;

		stream->done_first_chunk = true;
	}

	/* Work out how to perform the buffer fill */
	if (stream->public.cursor == stream->public.utf8->length) {
		/* Cursor's at the end, so simply reuse the entire buffer */
		utf8 = stream->public.utf8->data;
		utf8_space = stream->public.utf8->allocated;
	} else {
		/* Cursor's not at the end, so shift data after cursor to the
		 * bottom of the buffer. If the buffer's still over half full, 
		 * extend it. */
		memmove(stream->public.utf8->data,
			stream->public.utf8->data + stream->public.cursor,
			stream->public.utf8->length - stream->public.cursor);

		stream->public.utf8->length -= stream->public.cursor;

		/* The unread data now starts at the bottom of the buffer. 
		 * Move the cursor with it straight away, so that the 
		 * stream stays consistent if an error forces an early 
		 * return below. */
		stream->public.cursor = 0;

		if (stream->public.utf8->length > 
				stream->public.utf8->allocated / 2) {
			error = parserutils_buffer_grow(stream->public.utf8);
			if (error != PARSERUTILS_OK)
				return error;
		}

		utf8 = stream->public.utf8->data + stream->public.utf8->length;
		utf8_space = stream->public.utf8->allocated - 
				stream->public.utf8->length;
	}

	raw = stream->raw->data;
	raw_length = stream->raw->length;

	/* Try to fill utf8 buffer from the raw data */
	error = parserutils_filter_process_chunk(stream->input, 
			&raw, &raw_length, &utf8, &utf8_space);
	/* _NOMEM implies that there's more input to read than available space
	 * in the utf8 buffer. That's fine, so we'll ignore that error. */
	if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
		return error;

	/* Remove the raw data we've processed from the raw buffer: 
	 * raw_length now holds the number of bytes left unconsumed */
	error = parserutils_buffer_discard(stream->raw, 0, 
			stream->raw->length - raw_length);
	if (error != PARSERUTILS_OK)
		return error;

	/* Fix up the utf8 buffer information: utf8_space now holds the 
	 * amount of the allocation left unused */
	stream->public.utf8->length = 
			stream->public.utf8->allocated - utf8_space;

	/* Finally, fix up the cursor */
	stream->public.cursor = 0;

	return PARSERUTILS_OK;
}

/**
 * Remove a leading byte order mark from a buffer, if one is present
 *
 * \param mibenum  The character set of the buffer
 * \param buffer   The buffer to process
 * \return PARSERUTILS_OK if there was nothing to strip, otherwise the 
 *         result of discarding the BOM bytes from the buffer
 *
 * Only UTF-8, UTF-16BE, UTF-16LE, UTF-32BE and UTF-32LE are examined, 
 * and only the BOM belonging to the given charset is looked for.
 */
parserutils_error parserutils_inputstream_strip_bom(uint16_t mibenum, 
		parserutils_buffer *buffer)
{
	/* Byte order marks, as they appear at the start of the data */
	static const uint8_t bom_utf8[] = { 0xEF, 0xBB, 0xBF };
	static const uint8_t bom_utf16be[] = { 0xFE, 0xFF };
	static const uint8_t bom_utf16le[] = { 0xFF, 0xFE };
	static const uint8_t bom_utf32be[] = { 0x00, 0x00, 0xFE, 0xFF };
	static const uint8_t bom_utf32le[] = { 0xFF, 0xFE, 0x00, 0x00 };

	/* MIB enums of the charsets of interest. These are looked up on 
	 * the first call and kept for later ones; the lookups are 
	 * repeated for as long as the UTF-8 entry remains 0. */
	static uint16_t mib_utf8;
	static uint16_t mib_utf16;
	static uint16_t mib_utf16be;
	static uint16_t mib_utf16le;
	static uint16_t mib_utf32;
	static uint16_t mib_utf32be;
	static uint16_t mib_utf32le;

	const uint8_t *bom;
	size_t bom_len;

	if (mib_utf8 == 0) {
		mib_utf8 = parserutils_charset_mibenum_from_name(
				"UTF-8", SLEN("UTF-8"));
		mib_utf16 = parserutils_charset_mibenum_from_name(
				"UTF-16", SLEN("UTF-16"));
		mib_utf16be = parserutils_charset_mibenum_from_name(
				"UTF-16BE", SLEN("UTF-16BE"));
		mib_utf16le = parserutils_charset_mibenum_from_name(
				"UTF-16LE", SLEN("UTF-16LE"));
		mib_utf32 = parserutils_charset_mibenum_from_name(
				"UTF-32", SLEN("UTF-32"));
		mib_utf32be = parserutils_charset_mibenum_from_name(
				"UTF-32BE", SLEN("UTF-32BE"));
		mib_utf32le = parserutils_charset_mibenum_from_name(
				"UTF-32LE", SLEN("UTF-32LE"));
	}

	/** \todo Handle unmarked UTF-16 and UTF-32. Endianness is specified 
	 * by the BOM, if present, or is assumed to be big endian. */

	/* Select the BOM that belongs to the buffer's charset */
	if (mibenum == mib_utf8) {
		bom = bom_utf8;
		bom_len = sizeof(bom_utf8);
	} else if (mibenum == mib_utf16be) {
		bom = bom_utf16be;
		bom_len = sizeof(bom_utf16be);
	} else if (mibenum == mib_utf16le) {
		bom = bom_utf16le;
		bom_len = sizeof(bom_utf16le);
	} else if (mibenum == mib_utf32be) {
		bom = bom_utf32be;
		bom_len = sizeof(bom_utf32be);
	} else if (mibenum == mib_utf32le) {
		bom = bom_utf32le;
		bom_len = sizeof(bom_utf32le);
	} else {
		/* Not a charset whose BOM is dealt with here */
		return PARSERUTILS_OK;
	}

	/* Leave the buffer alone unless it is long enough to hold the 
	 * BOM and actually starts with it */
	if (buffer->length < bom_len || 
			memcmp(buffer->data, bom, bom_len) != 0)
		return PARSERUTILS_OK;

	return parserutils_buffer_discard(buffer, 0, bom_len);
}