summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJiahao Chen <jiahao@mit.edu>2014-07-17 15:55:47 -0700
committerJiahao Chen <jiahao@mit.edu>2014-07-18 10:46:11 -0400
commit5404ef8dc7a72a402312381c8c72f80363a7d8c0 (patch)
tree1ee50396275060767b9968c93765d58d146f9993
parentaa9823f5402299bf0f04634f8307cc07cb2e8d4e (diff)
downloadlibutf8proc-5404ef8dc7a72a402312381c8c72f80363a7d8c0.tar.gz
libutf8proc-5404ef8dc7a72a402312381c8c72f80363a7d8c0.tar.bz2
Mark composition exclusion characters
-rw-r--r--data_generator.rb89
1 files changed, 8 insertions, 81 deletions
diff --git a/data_generator.rb b/data_generator.rb
index f8d9cc3..7683339 100644
--- a/data_generator.rb
+++ b/data_generator.rb
@@ -99,91 +99,18 @@ $grapheme_extend_list.each do |entry|
end
$exclusions = <<END_OF_LIST
-0958 # DEVANAGARI LETTER QA
-0959 # DEVANAGARI LETTER KHHA
-095A # DEVANAGARI LETTER GHHA
-095B # DEVANAGARI LETTER ZA
-095C # DEVANAGARI LETTER DDDHA
-095D # DEVANAGARI LETTER RHA
-095E # DEVANAGARI LETTER FA
-095F # DEVANAGARI LETTER YYA
-09DC # BENGALI LETTER RRA
-09DD # BENGALI LETTER RHA
-09DF # BENGALI LETTER YYA
-0A33 # GURMUKHI LETTER LLA
-0A36 # GURMUKHI LETTER SHA
-0A59 # GURMUKHI LETTER KHHA
-0A5A # GURMUKHI LETTER GHHA
-0A5B # GURMUKHI LETTER ZA
-0A5E # GURMUKHI LETTER FA
-0B5C # ORIYA LETTER RRA
-0B5D # ORIYA LETTER RHA
-0F43 # TIBETAN LETTER GHA
-0F4D # TIBETAN LETTER DDHA
-0F52 # TIBETAN LETTER DHA
-0F57 # TIBETAN LETTER BHA
-0F5C # TIBETAN LETTER DZHA
-0F69 # TIBETAN LETTER KSSA
-0F76 # TIBETAN VOWEL SIGN VOCALIC R
-0F78 # TIBETAN VOWEL SIGN VOCALIC L
-0F93 # TIBETAN SUBJOINED LETTER GHA
-0F9D # TIBETAN SUBJOINED LETTER DDHA
-0FA2 # TIBETAN SUBJOINED LETTER DHA
-0FA7 # TIBETAN SUBJOINED LETTER BHA
-0FAC # TIBETAN SUBJOINED LETTER DZHA
-0FB9 # TIBETAN SUBJOINED LETTER KSSA
-FB1D # HEBREW LETTER YOD WITH HIRIQ
-FB1F # HEBREW LIGATURE YIDDISH YOD YOD PATAH
-FB2A # HEBREW LETTER SHIN WITH SHIN DOT
-FB2B # HEBREW LETTER SHIN WITH SIN DOT
-FB2C # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
-FB2D # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
-FB2E # HEBREW LETTER ALEF WITH PATAH
-FB2F # HEBREW LETTER ALEF WITH QAMATS
-FB30 # HEBREW LETTER ALEF WITH MAPIQ
-FB31 # HEBREW LETTER BET WITH DAGESH
-FB32 # HEBREW LETTER GIMEL WITH DAGESH
-FB33 # HEBREW LETTER DALET WITH DAGESH
-FB34 # HEBREW LETTER HE WITH MAPIQ
-FB35 # HEBREW LETTER VAV WITH DAGESH
-FB36 # HEBREW LETTER ZAYIN WITH DAGESH
-FB38 # HEBREW LETTER TET WITH DAGESH
-FB39 # HEBREW LETTER YOD WITH DAGESH
-FB3A # HEBREW LETTER FINAL KAF WITH DAGESH
-FB3B # HEBREW LETTER KAF WITH DAGESH
-FB3C # HEBREW LETTER LAMED WITH DAGESH
-FB3E # HEBREW LETTER MEM WITH DAGESH
-FB40 # HEBREW LETTER NUN WITH DAGESH
-FB41 # HEBREW LETTER SAMEKH WITH DAGESH
-FB43 # HEBREW LETTER FINAL PE WITH DAGESH
-FB44 # HEBREW LETTER PE WITH DAGESH
-FB46 # HEBREW LETTER TSADI WITH DAGESH
-FB47 # HEBREW LETTER QOF WITH DAGESH
-FB48 # HEBREW LETTER RESH WITH DAGESH
-FB49 # HEBREW LETTER SHIN WITH DAGESH
-FB4A # HEBREW LETTER TAV WITH DAGESH
-FB4B # HEBREW LETTER VAV WITH HOLAM
-FB4C # HEBREW LETTER BET WITH RAFE
-FB4D # HEBREW LETTER KAF WITH RAFE
-FB4E # HEBREW LETTER PE WITH RAFE
+#From:
+# http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
+#Section:
+# (1) Script Specifics
END_OF_LIST
$exclusions = $exclusions.chomp.split("\n").collect { |e| e.hex }
$excl_version = <<END_OF_LIST
-2ADC # FORKING
-1D15E # MUSICAL SYMBOL HALF NOTE
-1D15F # MUSICAL SYMBOL QUARTER NOTE
-1D160 # MUSICAL SYMBOL EIGHTH NOTE
-1D161 # MUSICAL SYMBOL SIXTEENTH NOTE
-1D162 # MUSICAL SYMBOL THIRTY-SECOND NOTE
-1D163 # MUSICAL SYMBOL SIXTY-FOURTH NOTE
-1D164 # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
-1D1BB # MUSICAL SYMBOL MINIMA
-1D1BC # MUSICAL SYMBOL MINIMA BLACK
-1D1BD # MUSICAL SYMBOL SEMIMINIMA WHITE
-1D1BE # MUSICAL SYMBOL SEMIMINIMA BLACK
-1D1BF # MUSICAL SYMBOL FUSA WHITE
-1D1C0 # MUSICAL SYMBOL FUSA BLACK
+#From:
+# http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
+#Section:
+# (2) Post Composition Version precomposed characters
END_OF_LIST
$excl_version = $excl_version.chomp.split("\n").collect { |e| e.hex }