summary refs log tree commit diff
path: root/data
diff options
context:
space:
mode:
author Steven G. Johnson <stevenj@mit.edu> 2018-07-24 13:25:51 -0400
committer Steven G. Johnson <stevenj@mit.edu> 2018-07-24 13:25:51 -0400
commit e0295be467d15e7abec2af275bcca30dc816bc9e (patch)
tree 370dcfc22d38a3ed302c1beeafa2ee5bf1b52db7 /data
parent 98e5529a0a6cd4dd09a8885029253f26c677c85f (diff)
parent d4a58cfec5345bbb2bb0db1e85172a8cff278da7 (diff)
download libutf8proc-e0295be467d15e7abec2af275bcca30dc816bc9e.tar.gz
libutf8proc-e0295be467d15e7abec2af275bcca30dc816bc9e.tar.bz2
Merge branch 'master' of https://github.com/JuliaLang/utf8proc
Diffstat (limited to 'data')
-rw-r--r-- data/Makefile 11
-rw-r--r-- data/data_generator.rb 13
2 files changed, 20 insertions, 4 deletions
diff --git a/data/Makefile b/data/Makefile
index 1bf49e1..1b24728 100644
--- a/data/Makefile
+++ b/data/Makefile
@@ -16,11 +16,11 @@ CURLFLAGS = --retry 5 --location
.DELETE_ON_ERROR:
-utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt
+utf8proc_data.c.new: data_generator.rb UnicodeData.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt CharWidths.txt emoji-data.txt
$(RUBY) data_generator.rb < UnicodeData.txt > $@
# GNU Unifont version for font metric calculations:
-UNIFONT_VERSION=10.0.07
+UNIFONT_VERSION=11.0.01
unifont.ttf:
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://mirrors.kernel.org/gnu/unifont/unifont-$(UNIFONT_VERSION)/unifont-$(UNIFONT_VERSION).ttf
@@ -35,7 +35,7 @@ CharWidths.txt: charwidths.jl unifont.sfd unifont_upper.sfd EastAsianWidth.txt
$(JULIA) charwidths.jl > $@
# Unicode data version
-UNICODE_VERSION=10.0.0
+UNICODE_VERSION=11.0.0
UnicodeData.txt:
$(CURL) $(CURLFLAGS) -o $@ -O http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/UnicodeData.txt
@@ -61,6 +61,9 @@ NormalizationTest.txt:
GraphemeBreakTest.txt:
$(CURL) $(CURLFLAGS) $(URLCACHE)http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
+emoji-data.txt:
+ $(CURL) $(CURLFLAGS) -o $@ -O $(URLCACHE)http://unicode.org/Public/emoji/`echo $(UNICODE_VERSION) | cut -d. -f1-2`/emoji-data.txt
+
clean:
- rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd
+ rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt unifont*.ttf unifont*.sfd emoji-data.txt
rm -f utf8proc_data.c.new
diff --git a/data/data_generator.rb b/data/data_generator.rb
index 9932f74..8bd87e8 100644
--- a/data/data_generator.rb
+++ b/data/data_generator.rb
@@ -87,6 +87,19 @@ $grapheme_boundclass_list.each_line do |entry|
end
end
+$emoji_data_list = File.read("emoji-data.txt")
+$emoji_data_list.each_line do |entry|
+ if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
+ $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC" }
+ elsif entry =~ /^([0-9A-F]+)\s*;\s*Extended_Pictographic\W/
+ $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC"
+ elsif entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
+ $1.hex.upto($2.hex) { |e2| $grapheme_boundclass[e2] = "UTF8PROC_BOUNDCLASS_EXTEND" }
+ elsif entry =~ /^([0-9A-F]+)\s*;\s*Emoji_Modifier\W/
+ $grapheme_boundclass[$1.hex] = "UTF8PROC_BOUNDCLASS_EXTEND"
+ end
+end
+
$charwidth_list = File.read("CharWidths.txt")
$charwidth = Hash.new(0)
$charwidth_list.each_line do |entry|