summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn-Mark Bell <jmb@netsurf-browser.org>2021-08-07 01:27:06 +0100
committerJohn-Mark Bell <jmb@netsurf-browser.org>2021-08-08 02:02:53 +0100
commit282b342fe51a2e9856b5448c30679ea8b49f18c5 (patch)
treeb0257dab5b447e2c720ebbfed3f99fa7c6572cdc
parentb7d315249f56dffa626354a4feaea5135f930e8a (diff)
downloadlibrufl-282b342fe51a2e9856b5448c30679ea8b49f18c5.tar.gz
librufl-282b342fe51a2e9856b5448c30679ea8b49f18c5.tar.bz2
Pave the way for astral character support.
No functional change, but redefine the meaning of the old "size" member of the rufl_character_set structure to allow for the addition of extension structures in future. This change is backwards compatible as it is reusing previously unused bits in the size field (which will be set to zero in all existing RUfl_caches). Rename the "size" field to "metadata" which better reflects its new usage. Update rufl_character_set_test and rufl_dump_state to follow this change (and fix up their parameter types while we're here).
-rw-r--r--src/rufl_character_set_test.c26
-rw-r--r--src/rufl_dump_state.c36
-rw-r--r--src/rufl_init.c17
-rw-r--r--src/rufl_internal.h86
4 files changed, 126 insertions, 39 deletions
diff --git a/src/rufl_character_set_test.c b/src/rufl_character_set_test.c
index 45fbcaf..2e97894 100644
--- a/src/rufl_character_set_test.c
+++ b/src/rufl_character_set_test.c
@@ -12,18 +12,28 @@
* Test if a character set contains a character.
*
* \param charset character set
- * \param c character code
+ * \param u Unicode codepoint
* \return true if present, false if absent
*/
-bool rufl_character_set_test(struct rufl_character_set *charset,
- unsigned int c)
+bool rufl_character_set_test(const struct rufl_character_set *charset,
+ uint32_t u)
{
- unsigned int block = c >> 8;
- unsigned int byte = (c >> 3) & 31;
- unsigned int bit = c & 7;
+ unsigned int plane = u >> 16;
+ unsigned int block = (u >> 8) & 0xff;
+ unsigned int byte = (u >> 3) & 31;
+ unsigned int bit = u & 7;
- if (256 <= block)
+ if (17 <= plane)
+ return false;
+
+ /* Look for the plane we want */
+ while (PLANE_ID(charset->metadata) != plane &&
+ EXTENSION_FOLLOWS(charset->metadata)) {
+ charset = (void *)(((uint8_t *)charset) +
+ PLANE_SIZE(charset->metadata));
+ }
+ if (PLANE_ID(charset->metadata) != plane)
return false;
if (charset->index[block] == BLOCK_EMPTY)
@@ -31,7 +41,7 @@ bool rufl_character_set_test(struct rufl_character_set *charset,
else if (charset->index[block] == BLOCK_FULL)
return true;
else {
- unsigned char z = charset->block[charset->index[block]][byte];
+ uint8_t z = charset->block[charset->index[block]][byte];
return z & (1 << bit);
}
}
diff --git a/src/rufl_dump_state.c b/src/rufl_dump_state.c
index 06a1f22..b03109c 100644
--- a/src/rufl_dump_state.c
+++ b/src/rufl_dump_state.c
@@ -9,7 +9,8 @@
#include "rufl_internal.h"
-static void rufl_dump_character_set(struct rufl_character_set *charset);
+static void rufl_dump_character_set_list(
+ const struct rufl_character_set *charset);
static void rufl_dump_unicode_map(struct rufl_unicode_map *umap);
static void rufl_dump_substitution_table(void);
@@ -27,7 +28,7 @@ void rufl_dump_state(void)
printf(" %u \"%s\"\n", i, rufl_font_list[i].identifier);
if (rufl_font_list[i].charset) {
printf(" ");
- rufl_dump_character_set(rufl_font_list[i].charset);
+ rufl_dump_character_set_list(rufl_font_list[i].charset);
printf("\n");
} else {
printf(" (no charset table)\n");
@@ -75,28 +76,45 @@ void rufl_dump_state(void)
* \param charset character set to print
*/
-void rufl_dump_character_set(struct rufl_character_set *charset)
+static void rufl_dump_character_set(const struct rufl_character_set *charset)
{
- unsigned int u, t;
+ unsigned int u, t, plane = PLANE_ID(charset->metadata) << 16;
u = 0;
while (u != 0x10000) {
- while (u != 0x10000 && !rufl_character_set_test(charset, u))
+ while (u != 0x10000 &&
+ !rufl_character_set_test(charset, plane + u))
u++;
if (u != 0x10000) {
- if (!rufl_character_set_test(charset, u + 1)) {
- printf("%x ", u);
+ if (!rufl_character_set_test(charset, plane + u + 1)) {
+ printf("%x ", plane + u);
u++;
} else {
t = u;
- while (rufl_character_set_test(charset, u))
+ while (rufl_character_set_test(
+ charset, plane + u))
u++;
- printf("%x-%x ", t, u - 1);
+ printf("%x-%x ", plane + t, plane + u - 1);
}
}
}
}
+/**
+ * Dump a representation of a character set list to stdout.
+ *
+ * \param charset first character set in the list to print
+ */
+
+void rufl_dump_character_set_list(const struct rufl_character_set *charset)
+{
+ while (EXTENSION_FOLLOWS(charset->metadata)) {
+ rufl_dump_character_set(charset);
+ charset = (void *)(((uint8_t *)charset) +
+ PLANE_SIZE(charset->metadata));
+ }
+ rufl_dump_character_set(charset);
+}
/**
* Dump a representation of a unicode map to stdout.
diff --git a/src/rufl_init.c b/src/rufl_init.c
index b441edc..3ae4ffa 100644
--- a/src/rufl_init.c
+++ b/src/rufl_init.c
@@ -575,9 +575,9 @@ rufl_code rufl_init_scan_font(unsigned int font_index)
}
/* shrink-wrap */
- charset->size = offsetof(struct rufl_character_set, block) +
+ charset->metadata = offsetof(struct rufl_character_set, block) +
32 * last_used;
- charset2 = realloc(charset, charset->size);
+ charset2 = realloc(charset, PLANE_SIZE(charset->metadata));
if (!charset2) {
free(charset);
return rufl_OUT_OF_MEMORY;
@@ -696,9 +696,9 @@ rufl_code rufl_init_scan_font_no_enumerate(unsigned int font_index)
}
/* shrink-wrap */
- charset->size = offsetof(struct rufl_character_set, block) +
+ charset->metadata = offsetof(struct rufl_character_set, block) +
32 * last_used;
- charset2 = realloc(charset, charset->size);
+ charset2 = realloc(charset, PLANE_SIZE(charset->metadata));
if (!charset2) {
free(charset);
return rufl_OUT_OF_MEMORY;
@@ -885,9 +885,9 @@ rufl_code rufl_init_scan_font_old(unsigned int font_index)
}
/* shrink-wrap */
- charset->size = offsetof(struct rufl_character_set, block) +
+ charset->metadata = offsetof(struct rufl_character_set, block) +
32 * last_used;
- charset2 = realloc(charset, charset->size);
+ charset2 = realloc(charset, PLANE_SIZE(charset->metadata));
if (!charset2) {
for (i = 0; i < num_umaps; i++)
free((umap + i)->encoding);
@@ -1255,7 +1255,8 @@ rufl_code rufl_save_cache(void)
/* character set */
if (fwrite(rufl_font_list[i].charset,
- rufl_font_list[i].charset->size, 1, fp) != 1) {
+ PLANE_SIZE(rufl_font_list[i].charset->metadata),
+ 1, fp) != 1) {
LOG("fwrite: 0x%x: %s", errno, strerror(errno));
fclose(fp);
return rufl_OK;
@@ -1430,7 +1431,7 @@ rufl_code rufl_load_cache(void)
return rufl_OUT_OF_MEMORY;
}
- charset->size = size;
+ charset->metadata = size;
if (fread(charset->index, size - sizeof size, 1, fp) != 1) {
if (feof(fp))
LOG("fread: %s", "unexpected eof");
diff --git a/src/rufl_internal.h b/src/rufl_internal.h
index 7d793a1..9c7d46e 100644
--- a/src/rufl_internal.h
+++ b/src/rufl_internal.h
@@ -14,20 +14,78 @@
#endif
-/** The available characters in a font. The range which can be represented is
- * 0x0000 to 0xffff. The size of the structure is 4 + 256 + 32 * blocks. A
- * typical * 200 glyph font might have characters in 10 blocks, giving 580
- * bytes. The maximum possible size of the structure is 8388 bytes. Note that
- * since two index values are reserved, fonts with 65280-65024 glyphs may be
- * unrepresentable, if there are no full blocks. This is unlikely. The primary
- * aim of this structure is to make lookup fast. */
+/**
+ * The available Unicode codepoints represented by a font. The entire Unicode
+ * range (U+0000 - U+10FFFF) may be covered by the font, but only codepoints
+ * in the Basic Multilingual Plane (i.e. U+0000 - U+FFFF) can be represented
+ * without the need for extension structures.
+ *
+ * Fonts which provide glyphs for astral characters will set the extension
+ * bit in the structure metadata field. If set, this indicates that an additional
+ * character set structure follows immediately after this one. The plane id
+ * field in the structure metadata indicates which plane the structure relates
+ * to. Planes are specified in ascending order (as the most commonly used
+ * codepoints occur in earlier planes). Planes for which the font has no
+ * glyphs are omitted entirely.
+ *
+ * Each plane is subdivided into 256 codepoint blocks (each block representing
+ * 256 contiguous codepoints). Note, however, that two index values are
+ * reserved (to indicate full or empty blocks) so only 254 partial blocks may
+ * be represented. As of Unicode 13, all planes have at least two blocks
+ * unused (or, in the case of the surrogate ranges in the Basic Multilingual
+ * Plane, defined as containing no characters), so all valid codepoints should
+ * be representable using this scheme.
+ *
+ * The size of the structure is 4 + 256 + 32 * blocks. A typical 200 glyph
+ * font might represent codepoints in 10 blocks, using 580 bytes of storage.
+ * A plane with glyphs in every block (but no block fully populated) requires
+ * the maximum possible structure size of (4 + 256 + 32 * 254 =) 8388 bytes.
+ * The maximum storage required for (the unlikely scenario of) a font
+ * providing glyphs in every block in each of the 17 Unicode planes is
+ * 17 * 8388 = 142596 bytes.
+ *
+ * The primary aim of this structure is to make lookup fast.
+ */
struct rufl_character_set {
- /** Size of structure / bytes. */
- size_t size;
+ /** Structure metadata.
+ *
+ * This field contains metadata about the structure in the form:
+ *
+ * 3 2 1 0
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |E| PID | Reserved | Size |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * where:
+ *
+ * extension (E): 1 bit
+ * If set, another character set covering a different plane
+ * follows.
+ *
+ * plane id (PID): 5 bits
+ * The 0-based index of the Unicode plane this structure relates
+ * to. Valid values are in the range [0, 16], where 0 represents
+ * the Basic Multilingual Plane, and 16 represents the
+ * Supplementary Private Use Area - B.
+ *
+ * reserved: 10 bits
+ * These bits are currently unused and must be set to 0.
+ *
+ * size: 16 bits
+ * The total size of this structure, in bytes.
+ */
+ uint32_t metadata;
+# define EXTENSION_FOLLOWS(x) ((x) & (1u<<31))
+# define PLANE_ID(x) (((x) >> 26) & 0x1f)
+# define PLANE_SIZE(x) ((x) & 0xffff)
- /** Index table. Each entry represents a block of 256 characters, so
- * i[k] refers to characters [256*k, 256*(k+1)). The value is either
- * BLOCK_EMPTY, BLOCK_FULL, or an offset into the block table. */
+ /** Index table.
+ *
+ * Each entry represents a block of 256 codepoints, so i[k] refers
+ * to codepoints [256*k, 256*(k+1)). The value is either BLOCK_EMPTY,
+ * BLOCK_FULL, or an offset into the block table.
+ */
uint8_t index[256];
/** The block has no characters present. */
# define BLOCK_EMPTY 254
@@ -142,8 +200,8 @@ rufl_code rufl_find_font_family(const char *family, rufl_style font_style,
struct rufl_character_set **charset);
rufl_code rufl_find_font(unsigned int font, unsigned int font_size,
const char *encoding, font_f *fhandle);
-bool rufl_character_set_test(struct rufl_character_set *charset,
- unsigned int c);
+bool rufl_character_set_test(const struct rufl_character_set *charset,
+ uint32_t u);
#define rufl_utf8_read(s, l, u) \