diff options
author | Benito van der Zander <benito@benibela.de> | 2016-07-12 17:51:50 +0200 |
---|---|---|
committer | Steven G. Johnson <stevenj@mit.edu> | 2016-07-12 11:51:50 -0400 |
commit | eeebf70bcf68443b0b2e5b3d811227ed3f039ea4 (patch) | |
tree | a4815a783b88588a2aa30b6a396eebfe1320e250 /utf8proc.c | |
parent | 9a0b87b57ec0be5bdf8baa7d53a4dfeb940d07d8 (diff) | |
download | libutf8proc-eeebf70bcf68443b0b2e5b3d811227ed3f039ea4.tar.gz libutf8proc-eeebf70bcf68443b0b2e5b3d811227ed3f039ea4.tar.bz2 |
Smaller tables (#68)
* convert sequences to utf-16 (saves 25kb)
* store sequence length in properties instead of using -1 termination (saves 10kb)
* cache index for slightly faster data creation
* store lower/upper/title mapping in sequence array (saves 25kb). Add utf8proc_totitle, as title_mapping cannot be used to get the title codepoint anymore. Rename xxx_mapping to xxx_seqindex, so programs assuming a value with the old meaning fail at compile time
* change combination array data type to uint16 (saves 40kb)
* merge 1st and 2nd comb index (saves 50kb)
* kill empty prefix/suffix in combination array (saves 50kb)
* there was no need to have a separate combination start array; it can be merged into a single array
* some fixes
* mark the table as const again
* and regen
Diffstat (limited to 'utf8proc.c')
-rw-r--r-- | utf8proc.c | 107 |
1 files changed, 70 insertions, 37 deletions
@@ -316,16 +316,58 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
                         state);
 }
 
+static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
+{
+  utf8proc_int32_t entry_cp = **entry;
+  if ((entry_cp & 0xF800) == 0xD800) {
+    *entry = *entry + 1;
+    entry_cp = ((entry_cp & 0x03FF) << 10) | (**entry & 0x03FF);
+    entry_cp += 0x10000;
+  }
+  return entry_cp;
+}
+
+static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
+{
+  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
+  return seqindex_decode_entry(&entry);
+}
+
+static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
+  utf8proc_ssize_t written = 0;
+  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
+  int len = seqindex >> 13;
+  if (len >= 7) {
+    len = *entry;
+    entry++;
+  }
+  for (; len >= 0; entry++, len--) {
+    utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
+
+    written += utf8proc_decompose_char(entry_cp, dst+written,
+      (bufsize > written) ? (bufsize - written) : 0, options,
+    last_boundclass);
+    if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
+  }
+  return written;
+}
+
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
 {
-  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping;
-  return cl >= 0 ? cl : c;
+  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_seqindex;
+  return cl != UINT16_MAX ? seqindex_decode_index(cl) : c;
 }
 
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
 {
-  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping;
-  return cu >= 0 ? cu : c;
+  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_seqindex;
+  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
+}
+
+UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
+{
+  utf8proc_int32_t cu = utf8proc_get_property(c)->titlecase_seqindex;
+  return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
 }
 
 /* return a character width analogous to wcwidth (except portable and
@@ -343,6 +385,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
   return s[utf8proc_category(c)];
 }
 
+
+
 #define utf8proc_decompose_lump(replacement_uc) \
   return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
   options & ~UTF8PROC_LUMP, last_boundclass)
@@ -408,32 +452,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
         category == UTF8PROC_CATEGORY_ME) return 0;
   }
   if (options & UTF8PROC_CASEFOLD) {
-    if (property->casefold_mapping != UINT16_MAX) {
-      const utf8proc_int32_t *casefold_entry;
-      utf8proc_ssize_t written = 0;
-      for (casefold_entry = &utf8proc_sequences[property->casefold_mapping];
-          *casefold_entry >= 0; casefold_entry++) {
-        written += utf8proc_decompose_char(*casefold_entry, dst+written,
-          (bufsize > written) ? (bufsize - written) : 0, options,
-          last_boundclass);
-        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
-      }
-      return written;
+    if (property->casefold_seqindex != UINT16_MAX) {
+      return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
     }
   }
   if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
-    if (property->decomp_mapping != UINT16_MAX &&
+    if (property->decomp_seqindex != UINT16_MAX &&
         (!property->decomp_type || (options & UTF8PROC_COMPAT))) {
-      const utf8proc_int32_t *decomp_entry;
-      utf8proc_ssize_t written = 0;
-      for (decomp_entry = &utf8proc_sequences[property->decomp_mapping];
-          *decomp_entry >= 0; decomp_entry++) {
-        written += utf8proc_decompose_char(*decomp_entry, dst+written,
-          (bufsize > written) ? (bufsize - written) : 0, options,
-          last_boundclass);
-        if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
-      }
-      return written;
+      return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
     }
   }
   if (options & UTF8PROC_CHARBOUND) {
@@ -588,17 +614,24 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
         if (!starter_property) {
           starter_property = unsafe_get_property(*starter);
         }
-        if (starter_property->comb1st_index >= 0 &&
-            current_property->comb2nd_index >= 0) {
-          composition = utf8proc_combinations[
-            starter_property->comb1st_index +
-            current_property->comb2nd_index
-          ];
-          if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
-              !(unsafe_get_property(composition)->comp_exclusion))) {
-            *starter = composition;
-            starter_property = NULL;
-            continue;
+        if (starter_property->comb_index < 0x8000 &&
+            current_property->comb_index != UINT16_MAX &&
+            current_property->comb_index >= 0x8000) {
+          int sidx = starter_property->comb_index;
+          int idx = (current_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
+          if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1] ) {
+            idx += sidx + 2;
+            if (current_property->comb_index & 0x4000) {
+              composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
+            } else
+              composition = utf8proc_combinations[idx];
+
+            if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
+                !(unsafe_get_property(composition)->comp_exclusion))) {
+              *starter = composition;
+              starter_property = NULL;
+              continue;
+            }
           }
         }
       }