/*
 * path: root/include/parserutils/input/inputstream.h
 * blob: dac1ab77d772fdb634899f996e2887f3bf03acd0
 */
/*
 * This file is part of LibParserUtils.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 */

#ifndef parserutils_input_inputstream_h_
#define parserutils_input_inputstream_h_

#include <stdbool.h>
#ifndef NDEBUG
#include <stdio.h>
#endif
#include <stdlib.h>
#include <inttypes.h>

#include <parserutils/errors.h>
#include <parserutils/functypes.h>
#include <parserutils/types.h>
#include <parserutils/charset/utf8.h>
#include <parserutils/utils/buffer.h>

/**
 * Type of charset detection function
 *
 * A function of this type is passed to parserutils_inputstream_create().
 *
 * NOTE(review): the parameter semantics are not visible in this header.
 * Going by the names, data/len presumably describe the bytes to inspect,
 * while mibenum and source presumably receive the detected charset and an
 * indication of how it was determined -- confirm against the implementation.
 */
typedef parserutils_error (*parserutils_charset_detect_func)(
		const uint8_t *data, size_t len, 
		uint16_t *mibenum, uint32_t *source);

/**
 * Input stream object
 *
 * The inline functions parserutils_inputstream_peek() and
 * parserutils_inputstream_advance() in this header access the utf8 and
 * cursor members directly.
 */
typedef struct parserutils_inputstream 
{
	parserutils_buffer *utf8;	/**< Buffer containing UTF-8 data */

	uint32_t cursor;		/**< Byte offset of current position
					 *   within the utf8 buffer */

	bool had_eof;			/**< Whether EOF has been reached */
} parserutils_inputstream;

/* EOF pseudo-character: a sentinel that parserutils_inputstream_peek() may
 * return in place of a pointer to character data */
#define PARSERUTILS_INPUTSTREAM_EOF (0xFFFFFFFFU)
/* Out-of-data indicator: a sentinel that parserutils_inputstream_peek() may
 * return in place of a pointer to character data */
#define PARSERUTILS_INPUTSTREAM_OOD (0xFFFFFFFEU)

/* Create an input stream
 *
 * The new stream is handed back through the stream parameter; the return
 * value is a parserutils_error code.
 *
 * NOTE(review): the remaining parameters are not described in this header.
 * Presumably enc/encsrc give the charset and where that information came
 * from, csdetect is a charset detection callback, and alloc/pw are the
 * memory allocation function and its private data -- confirm against the
 * implementation.
 */
parserutils_error parserutils_inputstream_create(const char *enc,
		uint32_t encsrc, parserutils_charset_detect_func csdetect,
		parserutils_alloc alloc, void *pw, 
		parserutils_inputstream **stream);
/* Destroy an input stream
 *
 * Takes the stream to destroy and returns a parserutils_error code.
 */
parserutils_error parserutils_inputstream_destroy(
		parserutils_inputstream *stream);

/* Append data to an input stream
 *
 * Returns a parserutils_error code.
 *
 * NOTE(review): presumably data points at len bytes of input to add to the
 * end of the stream -- confirm against the implementation.
 */
parserutils_error parserutils_inputstream_append(
		parserutils_inputstream *stream,
		const uint8_t *data, size_t len);
/* Insert data into stream at current location
 *
 * Returns a parserutils_error code.
 *
 * NOTE(review): presumably data points at len bytes to insert, and the
 * "current location" is the stream cursor -- confirm against the
 * implementation.
 */
parserutils_error parserutils_inputstream_insert(
		parserutils_inputstream *stream,
		const uint8_t *data, size_t len);

/* Slow form of parserutils_inputstream_peek.
 *
 * The inline parserutils_inputstream_peek() delegates to this function, with
 * the same stream, offset and length arguments, when the requested position
 * is at the very end of the buffered UTF-8 data or when determining the
 * character's byte length reports PARSERUTILS_NEEDDATA. Its return value is
 * passed straight back to the caller of parserutils_inputstream_peek().
 */
uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream, 
		size_t offset, size_t *length);

/**
 * Look at the character in the stream that starts at 
 * offset bytes from the cursor
 *
 * \param stream  Stream to look in
 * \param offset  Byte offset of start of character
 * \param length  Pointer to location to receive character length (in bytes)
 * \return Pointer to character data, or EOF or OOD.
 *
 * PARSERUTILS_INPUTSTREAM_OOD is also returned, without touching *length,
 * if stream or length is NULL, if the requested position lies beyond the
 * end of the buffered UTF-8 data, or if the byte length of the character
 * cannot be determined.
 *
 * Once the character pointed to by the result of this call has been advanced
 * past (i.e. parserutils_inputstream_advance has caused the stream cursor to 
 * pass over the character), then no guarantee is made as to the validity of 
 * the data pointed to. Thus, any attempt to dereference the pointer after 
 * advancing past the data it points to is a bug.
 */
static inline uintptr_t parserutils_inputstream_peek(
		parserutils_inputstream *stream, size_t offset, size_t *length)
{
	parserutils_error error = PARSERUTILS_OK;
	const parserutils_buffer *utf8;
	const uint8_t *utf8_data;
	size_t len = 0, off, utf8_len;
	uintptr_t data;

	if (stream == NULL || length == NULL)
		return PARSERUTILS_INPUTSTREAM_OOD;

#ifndef NDEBUG
#ifdef VERBOSE_INPUTSTREAM
	fprintf(stdout, "Peek: len: %zu cur: %u off: %zu\n",
			stream->utf8->length, stream->cursor, offset);
#endif
#ifdef RANDOMISE_INPUTSTREAM
	parserutils_buffer_randomise(stream->utf8);
#endif
#endif

	utf8 = stream->utf8;
	utf8_data = utf8->data;
	utf8_len = utf8->length;

	/* Refuse to look beyond the buffered data. The comparison is arranged
	 * so that it cannot wrap, unlike a direct cursor + offset test. 
	 * Without this check the code below would fall through with neither
	 * a character length nor a valid pointer to hand back. */
	if (stream->cursor > utf8_len || offset > utf8_len - stream->cursor)
		return PARSERUTILS_INPUTSTREAM_OOD;

	off = stream->cursor + offset;

#define IS_ASCII(x) (((x) & 0x80) == 0)

	if (off < utf8_len) {
		if (IS_ASCII(utf8_data[off])) {
			/* Early exit for ASCII case */
			(*length) = 1;
			return (uintptr_t) (utf8_data + off);
		}

		error = parserutils_charset_utf8_char_byte_length(
				utf8_data + off, &len);

		if (error == PARSERUTILS_OK) {
#if !defined(NDEBUG) && defined(VERBOSE_INPUTSTREAM)
			fprintf(stdout, "clen: %zu\n", len);
#endif
			*length = len;

			return (uintptr_t) (utf8_data + off);
		}

		if (error != PARSERUTILS_NEEDDATA)
			return PARSERUTILS_INPUTSTREAM_OOD;
	}

#undef IS_ASCII

	/* Either the requested position is exactly the end of the buffered
	 * data, or more data is needed to size the character: take the 
	 * slow path. */
	data = parserutils_inputstream_peek_slow(stream, offset, length);
#if !defined(NDEBUG) && defined(VERBOSE_INPUTSTREAM)
	fprintf(stdout, "clen: %zu\n", *length);
#endif

	return data;
}

/**
 * Move the stream cursor forwards
 *
 * \param stream  Stream to operate on (a NULL stream is ignored)
 * \param bytes   Distance to move the cursor, in bytes
 *
 * Asking for more bytes than lie between the cursor and the end of the
 * stream's UTF-8 buffer aborts the program.
 */
static inline void parserutils_inputstream_advance(
		parserutils_inputstream *stream, size_t bytes)
{
	size_t remaining;

	if (stream == NULL)
		return;

#ifndef NDEBUG
#ifdef VERBOSE_INPUTSTREAM
	fprintf(stdout, "Advance: len: %zu cur: %u bytes: %zu\n",
			stream->utf8->length, stream->cursor, bytes);
#endif
#endif

	/* Bytes between the cursor and the end of the buffered data */
	remaining = stream->utf8->length - stream->cursor;

	if (bytes > remaining)
		abort();

	/* Leave the cursor alone when it already sits at the end */
	if (remaining != 0)
		stream->cursor += bytes;
}

/* Read the document charset
 *
 * Returns a string; presumably the name of the charset.
 *
 * NOTE(review): source presumably receives an indication of how the charset
 * was determined -- confirm against the implementation.
 */
const char *parserutils_inputstream_read_charset(
		parserutils_inputstream *stream, uint32_t *source);

#endif