diff options
author | Keno Fischer <kfischer+github@college.harvard.edu> | 2016-06-28 16:04:25 -0400 |
---|---|---|
committer | Steven G. Johnson <stevenj@mit.edu> | 2016-06-28 16:04:25 -0400 |
commit | 41c6b23aab330d019789bf1fbb870c7e74e703bf (patch) | |
tree | 15c109853d5d5dba78b0002897368501b94d2fc0 /data/data_generator.rb | |
parent | 3d0576a9b9669a6e9fd170ffba3d3838d46986df (diff) | |
download | libutf8proc-41c6b23aab330d019789bf1fbb870c7e74e703bf.tar.gz libutf8proc-41c6b23aab330d019789bf1fbb870c7e74e703bf.tar.bz2 |
Unicode 9 updates (#70)
* Updates for Unicode 9.0.0 TR29 Changes
- New rules GB10 and GB12/GB13 are used, respectively, to combine emoji-zwj
sequences and to force grapheme breaks every two RI (regional indicator)
codepoints. Unfortunately this breaks statelessness of grapheme-boundary
determination. Deal with this by ignoring the problem in
utf8proc_grapheme_break, and by hacking in a special case in decompose.
- ZWJ moved to its own boundclass, update what is now GB9 accordingly.
- Add comments to indicate which rule a given case implements
- The number of bound classes now exceeds what fits in 4 bits; expand the
boundclass field to 8 bits and reorganize fields
* Import Unicode 9 data
* Update Grapheme break API to expose state override
* Bump MAJOR version
Diffstat (limited to 'data/data_generator.rb')
-rw-r--r-- | data/data_generator.rb | 6 |
1 files changed, 3 insertions, 3 deletions
diff --git a/data/data_generator.rb b/data/data_generator.rb index c90c1f7..e488436 100644 --- a/data/data_generator.rb +++ b/data/data_generator.rb @@ -182,8 +182,8 @@ class UnicodeChar "#{$exclusions.include?(code) or $excl_version.include?(code)}, " << "#{$ignorable.include?(code)}, " << "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " << - "#{$grapheme_boundclass[code]}, " << - "#{$charwidth[code]}},\n" + "#{$charwidth[code]}, 0, " << + "#{$grapheme_boundclass[code]}},\n" end end @@ -306,7 +306,7 @@ end $stdout << "};\n\n" $stdout << "const utf8proc_property_t utf8proc_properties[] = {\n" -$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false, UTF8PROC_BOUNDCLASS_OTHER, 0},\n" +$stdout << " {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, -1, -1, -1, -1, -1, false,false,false,false,0,0,UTF8PROC_BOUNDCLASS_OTHER},\n" properties.each { |line| $stdout << line } |