From d4a58cfec5345bbb2bb0db1e85172a8cff278da7 Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson"
Date: Tue, 24 Jul 2018 13:18:48 -0400
Subject: update data and algorithms for Unicode 11 (#140)

---
 data/Makefile          | 11 +++++++----
 data/data_generator.rb | 13 +++++++++++++
 2 files changed, 20 insertions(+), 4 deletions(-)

(limited to 'data')

diff --git a/data/Makefile b/data/Makefile
index 1bf49e1..1b24728 100644
--- a/data/Makefile
+++ b/data/Makefile
@@ -16,11 +16,11 @@ CURLFLAGS = --retry 5 --location
 
 .DELETE_ON_ERROR:
 
-utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt
+utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
 	$(RUBY) data_generator.rb < UnicodeData.txt > $@
 
 # GNU Unifont version for font metric calculations:
-UNIFONT_VERSION=10.0.07
+UNIFONT_VERSION=11.0.01
 
 unifont.ttf:
 	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
@@ -35,7 +35,7 @@ CharWidths.txt: charwidths.jl unifont.sfd unifont_upper.sfd EastAsianWidth.txt
 	$(JULIA) charwidths.jl > $@
 
 # Unicode data version
-UNICODE_VERSION=10.0.0
+UNICODE_VERSION=11.0.0
 
 UnicodeData.txt:
 	$(CURL) $(CURLFLAGS) -o $@ -O http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
@@ -61,6 +61,9 @@ NormalizationTest.txt:
 GraphemeBreakTest.txt:
 	$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
 
+emoji-data.txt:
+	$(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://unicode.org/Public/emoji/`echo $(UNICODE_VERSION) | cut -d. -f1-2`/emoji-data.txt
+
 clean:
-	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
+	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd emoji-data.txt
 	rm -f utf8proc_data.c.new
diff --git a/data/data_generator.rb b/data/data_generator.rb
index 795652c..fe549f8 100644
--- a/data/data_generator.rb
+++ b/data/data_generator.rb
@@ -85,6 +85,19 @@ $grapheme_boundclass_list.each_line do |entry|
   end
 end
 
+$emoji_data_list = File.read("emoji-data.txt")
+$emoji_data_list.each_line do |entry|
+  if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
+    $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" }
+  elsif entry =~ /^([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
+    $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC"
+  elsif entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
+    $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTEND" }
+  elsif entry =~ /^([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
+    $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTEND"
+  end
+end
+
 $charwidth_list = File.read("CharWidths.txt")
 $charwidth = Hash.new(0)
 $charwidth_list.each_line do |entry|
-- 
cgit v1.2.3