summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jiahao Chen <jiahao@mit.edu> 2014-07-17 15:47:52 -0700
committer: Jiahao Chen <jiahao@mit.edu> 2014-07-18 10:46:11 -0400
commit: aa9823f5402299bf0f04634f8307cc07cb2e8d4e (patch)
tree: f346b8e206df9e175e86f308080ba41062daff5b
parent: 7633bd03b6ec1bd776a7a78d2650fa0834399584 (diff)
download: libutf8proc-aa9823f5402299bf0f04634f8307cc07cb2e8d4e.tar.gz
libutf8proc-aa9823f5402299bf0f04634f8307cc07cb2e8d4e.tar.bz2
Mark Grapheme_Extend data
-rw-r--r-- data_generator.rb 155
1 files changed, 4 insertions, 151 deletions
diff --git a/data_generator.rb b/data_generator.rb
index 5581ce1..f8d9cc3 100644
--- a/data_generator.rb
+++ b/data_generator.rb
@@ -83,157 +83,10 @@ $ignorable_list.each do |entry|
end
$grapheme_extend_list = <<END_OF_LIST
-0300..036F ; Grapheme_Extend # Mn [112] COMBINING GRAVE ACCENT..COMBINING LATIN SMALL LETTER X
-0483..0486 ; Grapheme_Extend # Mn [4] COMBINING CYRILLIC TITLO..COMBINING CYRILLIC PSILI PNEUMATA
-0488..0489 ; Grapheme_Extend # Me [2] COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..COMBINING CYRILLIC MILLIONS SIGN
-0591..05BD ; Grapheme_Extend # Mn [45] HEBREW ACCENT ETNAHTA..HEBREW POINT METEG
-05BF ; Grapheme_Extend # Mn HEBREW POINT RAFE
-05C1..05C2 ; Grapheme_Extend # Mn [2] HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT
-05C4..05C5 ; Grapheme_Extend # Mn [2] HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT
-05C7 ; Grapheme_Extend # Mn HEBREW POINT QAMATS QATAN
-0610..0615 ; Grapheme_Extend # Mn [6] ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..ARABIC SMALL HIGH TAH
-064B..065E ; Grapheme_Extend # Mn [20] ARABIC FATHATAN..ARABIC FATHA WITH TWO DOTS
-0670 ; Grapheme_Extend # Mn ARABIC LETTER SUPERSCRIPT ALEF
-06D6..06DC ; Grapheme_Extend # Mn [7] ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA..ARABIC SMALL HIGH SEEN
-06DE ; Grapheme_Extend # Me ARABIC START OF RUB EL HIZB
-06DF..06E4 ; Grapheme_Extend # Mn [6] ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL HIGH MADDA
-06E7..06E8 ; Grapheme_Extend # Mn [2] ARABIC SMALL HIGH YEH..ARABIC SMALL HIGH NOON
-06EA..06ED ; Grapheme_Extend # Mn [4] ARABIC EMPTY CENTRE LOW STOP..ARABIC SMALL LOW MEEM
-0711 ; Grapheme_Extend # Mn SYRIAC LETTER SUPERSCRIPT ALAPH
-0730..074A ; Grapheme_Extend # Mn [27] SYRIAC PTHAHA ABOVE..SYRIAC BARREKH
-07A6..07B0 ; Grapheme_Extend # Mn [11] THAANA ABAFILI..THAANA SUKUN
-07EB..07F3 ; Grapheme_Extend # Mn [9] NKO COMBINING SHORT HIGH TONE..NKO COMBINING DOUBLE DOT ABOVE
-0901..0902 ; Grapheme_Extend # Mn [2] DEVANAGARI SIGN CANDRABINDU..DEVANAGARI SIGN ANUSVARA
-093C ; Grapheme_Extend # Mn DEVANAGARI SIGN NUKTA
-0941..0948 ; Grapheme_Extend # Mn [8] DEVANAGARI VOWEL SIGN U..DEVANAGARI VOWEL SIGN AI
-094D ; Grapheme_Extend # Mn DEVANAGARI SIGN VIRAMA
-0951..0954 ; Grapheme_Extend # Mn [4] DEVANAGARI STRESS SIGN UDATTA..DEVANAGARI ACUTE ACCENT
-0962..0963 ; Grapheme_Extend # Mn [2] DEVANAGARI VOWEL SIGN VOCALIC L..DEVANAGARI VOWEL SIGN VOCALIC LL
-0981 ; Grapheme_Extend # Mn BENGALI SIGN CANDRABINDU
-09BC ; Grapheme_Extend # Mn BENGALI SIGN NUKTA
-09BE ; Grapheme_Extend # Mc BENGALI VOWEL SIGN AA
-09C1..09C4 ; Grapheme_Extend # Mn [4] BENGALI VOWEL SIGN U..BENGALI VOWEL SIGN VOCALIC RR
-09CD ; Grapheme_Extend # Mn BENGALI SIGN VIRAMA
-09D7 ; Grapheme_Extend # Mc BENGALI AU LENGTH MARK
-09E2..09E3 ; Grapheme_Extend # Mn [2] BENGALI VOWEL SIGN VOCALIC L..BENGALI VOWEL SIGN VOCALIC LL
-0A01..0A02 ; Grapheme_Extend # Mn [2] GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN BINDI
-0A3C ; Grapheme_Extend # Mn GURMUKHI SIGN NUKTA
-0A41..0A42 ; Grapheme_Extend # Mn [2] GURMUKHI VOWEL SIGN U..GURMUKHI VOWEL SIGN UU
-0A47..0A48 ; Grapheme_Extend # Mn [2] GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN AI
-0A4B..0A4D ; Grapheme_Extend # Mn [3] GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA
-0A70..0A71 ; Grapheme_Extend # Mn [2] GURMUKHI TIPPI..GURMUKHI ADDAK
-0A81..0A82 ; Grapheme_Extend # Mn [2] GUJARATI SIGN CANDRABINDU..GUJARATI SIGN ANUSVARA
-0ABC ; Grapheme_Extend # Mn GUJARATI SIGN NUKTA
-0AC1..0AC5 ; Grapheme_Extend # Mn [5] GUJARATI VOWEL SIGN U..GUJARATI VOWEL SIGN CANDRA E
-0AC7..0AC8 ; Grapheme_Extend # Mn [2] GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN AI
-0ACD ; Grapheme_Extend # Mn GUJARATI SIGN VIRAMA
-0AE2..0AE3 ; Grapheme_Extend # Mn [2] GUJARATI VOWEL SIGN VOCALIC L..GUJARATI VOWEL SIGN VOCALIC LL
-0B01 ; Grapheme_Extend # Mn ORIYA SIGN CANDRABINDU
-0B3C ; Grapheme_Extend # Mn ORIYA SIGN NUKTA
-0B3E ; Grapheme_Extend # Mc ORIYA VOWEL SIGN AA
-0B3F ; Grapheme_Extend # Mn ORIYA VOWEL SIGN I
-0B41..0B43 ; Grapheme_Extend # Mn [3] ORIYA VOWEL SIGN U..ORIYA VOWEL SIGN VOCALIC R
-0B4D ; Grapheme_Extend # Mn ORIYA SIGN VIRAMA
-0B56 ; Grapheme_Extend # Mn ORIYA AI LENGTH MARK
-0B57 ; Grapheme_Extend # Mc ORIYA AU LENGTH MARK
-0B82 ; Grapheme_Extend # Mn TAMIL SIGN ANUSVARA
-0BBE ; Grapheme_Extend # Mc TAMIL VOWEL SIGN AA
-0BC0 ; Grapheme_Extend # Mn TAMIL VOWEL SIGN II
-0BCD ; Grapheme_Extend # Mn TAMIL SIGN VIRAMA
-0BD7 ; Grapheme_Extend # Mc TAMIL AU LENGTH MARK
-0C3E..0C40 ; Grapheme_Extend # Mn [3] TELUGU VOWEL SIGN AA..TELUGU VOWEL SIGN II
-0C46..0C48 ; Grapheme_Extend # Mn [3] TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
-0C4A..0C4D ; Grapheme_Extend # Mn [4] TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
-0C55..0C56 ; Grapheme_Extend # Mn [2] TELUGU LENGTH MARK..TELUGU AI LENGTH MARK
-0CBC ; Grapheme_Extend # Mn KANNADA SIGN NUKTA
-0CBF ; Grapheme_Extend # Mn KANNADA VOWEL SIGN I
-0CC2 ; Grapheme_Extend # Mc KANNADA VOWEL SIGN UU
-0CC6 ; Grapheme_Extend # Mn KANNADA VOWEL SIGN E
-0CCC..0CCD ; Grapheme_Extend # Mn [2] KANNADA VOWEL SIGN AU..KANNADA SIGN VIRAMA
-0CD5..0CD6 ; Grapheme_Extend # Mc [2] KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
-0CE2..0CE3 ; Grapheme_Extend # Mn [2] KANNADA VOWEL SIGN VOCALIC L..KANNADA VOWEL SIGN VOCALIC LL
-0D3E ; Grapheme_Extend # Mc MALAYALAM VOWEL SIGN AA
-0D41..0D43 ; Grapheme_Extend # Mn [3] MALAYALAM VOWEL SIGN U..MALAYALAM VOWEL SIGN VOCALIC R
-0D4D ; Grapheme_Extend # Mn MALAYALAM SIGN VIRAMA
-0D57 ; Grapheme_Extend # Mc MALAYALAM AU LENGTH MARK
-0DCA ; Grapheme_Extend # Mn SINHALA SIGN AL-LAKUNA
-0DCF ; Grapheme_Extend # Mc SINHALA VOWEL SIGN AELA-PILLA
-0DD2..0DD4 ; Grapheme_Extend # Mn [3] SINHALA VOWEL SIGN KETTI IS-PILLA..SINHALA VOWEL SIGN KETTI PAA-PILLA
-0DD6 ; Grapheme_Extend # Mn SINHALA VOWEL SIGN DIGA PAA-PILLA
-0DDF ; Grapheme_Extend # Mc SINHALA VOWEL SIGN GAYANUKITTA
-0E31 ; Grapheme_Extend # Mn THAI CHARACTER MAI HAN-AKAT
-0E34..0E3A ; Grapheme_Extend # Mn [7] THAI CHARACTER SARA I..THAI CHARACTER PHINTHU
-0E47..0E4E ; Grapheme_Extend # Mn [8] THAI CHARACTER MAITAIKHU..THAI CHARACTER YAMAKKAN
-0EB1 ; Grapheme_Extend # Mn LAO VOWEL SIGN MAI KAN
-0EB4..0EB9 ; Grapheme_Extend # Mn [6] LAO VOWEL SIGN I..LAO VOWEL SIGN UU
-0EBB..0EBC ; Grapheme_Extend # Mn [2] LAO VOWEL SIGN MAI KON..LAO SEMIVOWEL SIGN LO
-0EC8..0ECD ; Grapheme_Extend # Mn [6] LAO TONE MAI EK..LAO NIGGAHITA
-0F18..0F19 ; Grapheme_Extend # Mn [2] TIBETAN ASTROLOGICAL SIGN -KHYUD PA..TIBETAN ASTROLOGICAL SIGN SDONG TSHUGS
-0F35 ; Grapheme_Extend # Mn TIBETAN MARK NGAS BZUNG NYI ZLA
-0F37 ; Grapheme_Extend # Mn TIBETAN MARK NGAS BZUNG SGOR RTAGS
-0F39 ; Grapheme_Extend # Mn TIBETAN MARK TSA -PHRU
-0F71..0F7E ; Grapheme_Extend # Mn [14] TIBETAN VOWEL SIGN AA..TIBETAN SIGN RJES SU NGA RO
-0F80..0F84 ; Grapheme_Extend # Mn [5] TIBETAN VOWEL SIGN REVERSED I..TIBETAN MARK HALANTA
-0F86..0F87 ; Grapheme_Extend # Mn [2] TIBETAN SIGN LCI RTAGS..TIBETAN SIGN YANG RTAGS
-0F90..0F97 ; Grapheme_Extend # Mn [8] TIBETAN SUBJOINED LETTER KA..TIBETAN SUBJOINED LETTER JA
-0F99..0FBC ; Grapheme_Extend # Mn [36] TIBETAN SUBJOINED LETTER NYA..TIBETAN SUBJOINED LETTER FIXED-FORM RA
-0FC6 ; Grapheme_Extend # Mn TIBETAN SYMBOL PADMA GDAN
-102D..1030 ; Grapheme_Extend # Mn [4] MYANMAR VOWEL SIGN I..MYANMAR VOWEL SIGN UU
-1032 ; Grapheme_Extend # Mn MYANMAR VOWEL SIGN AI
-1036..1037 ; Grapheme_Extend # Mn [2] MYANMAR SIGN ANUSVARA..MYANMAR SIGN DOT BELOW
-1039 ; Grapheme_Extend # Mn MYANMAR SIGN VIRAMA
-1058..1059 ; Grapheme_Extend # Mn [2] MYANMAR VOWEL SIGN VOCALIC L..MYANMAR VOWEL SIGN VOCALIC LL
-135F ; Grapheme_Extend # Mn ETHIOPIC COMBINING GEMINATION MARK
-1712..1714 ; Grapheme_Extend # Mn [3] TAGALOG VOWEL SIGN I..TAGALOG SIGN VIRAMA
-1732..1734 ; Grapheme_Extend # Mn [3] HANUNOO VOWEL SIGN I..HANUNOO SIGN PAMUDPOD
-1752..1753 ; Grapheme_Extend # Mn [2] BUHID VOWEL SIGN I..BUHID VOWEL SIGN U
-1772..1773 ; Grapheme_Extend # Mn [2] TAGBANWA VOWEL SIGN I..TAGBANWA VOWEL SIGN U
-17B7..17BD ; Grapheme_Extend # Mn [7] KHMER VOWEL SIGN I..KHMER VOWEL SIGN UA
-17C6 ; Grapheme_Extend # Mn KHMER SIGN NIKAHIT
-17C9..17D3 ; Grapheme_Extend # Mn [11] KHMER SIGN MUUSIKATOAN..KHMER SIGN BATHAMASAT
-17DD ; Grapheme_Extend # Mn KHMER SIGN ATTHACAN
-180B..180D ; Grapheme_Extend # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE
-18A9 ; Grapheme_Extend # Mn MONGOLIAN LETTER ALI GALI DAGALGA
-1920..1922 ; Grapheme_Extend # Mn [3] LIMBU VOWEL SIGN A..LIMBU VOWEL SIGN U
-1927..1928 ; Grapheme_Extend # Mn [2] LIMBU VOWEL SIGN E..LIMBU VOWEL SIGN O
-1932 ; Grapheme_Extend # Mn LIMBU SMALL LETTER ANUSVARA
-1939..193B ; Grapheme_Extend # Mn [3] LIMBU SIGN MUKPHRENG..LIMBU SIGN SA-I
-1A17..1A18 ; Grapheme_Extend # Mn [2] BUGINESE VOWEL SIGN I..BUGINESE VOWEL SIGN U
-1B00..1B03 ; Grapheme_Extend # Mn [4] BALINESE SIGN ULU RICEM..BALINESE SIGN SURANG
-1B34 ; Grapheme_Extend # Mn BALINESE SIGN REREKAN
-1B36..1B3A ; Grapheme_Extend # Mn [5] BALINESE VOWEL SIGN ULU..BALINESE VOWEL SIGN RA REPA
-1B3C ; Grapheme_Extend # Mn BALINESE VOWEL SIGN LA LENGA
-1B42 ; Grapheme_Extend # Mn BALINESE VOWEL SIGN PEPET
-1B6B..1B73 ; Grapheme_Extend # Mn [9] BALINESE MUSICAL SYMBOL COMBINING TEGEH..BALINESE MUSICAL SYMBOL COMBINING GONG
-1DC0..1DCA ; Grapheme_Extend # Mn [11] COMBINING DOTTED GRAVE ACCENT..COMBINING LATIN SMALL LETTER R BELOW
-1DFE..1DFF ; Grapheme_Extend # Mn [2] COMBINING LEFT ARROWHEAD ABOVE..COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
-200C..200D ; Grapheme_Extend # Cf [2] ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
-20D0..20DC ; Grapheme_Extend # Mn [13] COMBINING LEFT HARPOON ABOVE..COMBINING FOUR DOTS ABOVE
-20DD..20E0 ; Grapheme_Extend # Me [4] COMBINING ENCLOSING CIRCLE..COMBINING ENCLOSING CIRCLE BACKSLASH
-20E1 ; Grapheme_Extend # Mn COMBINING LEFT RIGHT ARROW ABOVE
-20E2..20E4 ; Grapheme_Extend # Me [3] COMBINING ENCLOSING SCREEN..COMBINING ENCLOSING UPWARD POINTING TRIANGLE
-20E5..20EF ; Grapheme_Extend # Mn [11] COMBINING REVERSE SOLIDUS OVERLAY..COMBINING RIGHT ARROW BELOW
-302A..302F ; Grapheme_Extend # Mn [6] IDEOGRAPHIC LEVEL TONE MARK..HANGUL DOUBLE DOT TONE MARK
-3099..309A ; Grapheme_Extend # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
-A806 ; Grapheme_Extend # Mn SYLOTI NAGRI SIGN HASANTA
-A80B ; Grapheme_Extend # Mn SYLOTI NAGRI SIGN ANUSVARA
-A825..A826 ; Grapheme_Extend # Mn [2] SYLOTI NAGRI VOWEL SIGN U..SYLOTI NAGRI VOWEL SIGN E
-FB1E ; Grapheme_Extend # Mn HEBREW POINT JUDEO-SPANISH VARIKA
-FE00..FE0F ; Grapheme_Extend # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16
-FE20..FE23 ; Grapheme_Extend # Mn [4] COMBINING LIGATURE LEFT HALF..COMBINING DOUBLE TILDE RIGHT HALF
-10A01..10A03 ; Grapheme_Extend # Mn [3] KHAROSHTHI VOWEL SIGN I..KHAROSHTHI VOWEL SIGN VOCALIC R
-10A05..10A06 ; Grapheme_Extend # Mn [2] KHAROSHTHI VOWEL SIGN E..KHAROSHTHI VOWEL SIGN O
-10A0C..10A0F ; Grapheme_Extend # Mn [4] KHAROSHTHI VOWEL LENGTH MARK..KHAROSHTHI SIGN VISARGA
-10A38..10A3A ; Grapheme_Extend # Mn [3] KHAROSHTHI SIGN BAR ABOVE..KHAROSHTHI SIGN DOT BELOW
-10A3F ; Grapheme_Extend # Mn KHAROSHTHI VIRAMA
-1D165 ; Grapheme_Extend # Mc MUSICAL SYMBOL COMBINING STEM
-1D167..1D169 ; Grapheme_Extend # Mn [3] MUSICAL SYMBOL COMBINING TREMOLO-1..MUSICAL SYMBOL COMBINING TREMOLO-3
-1D16E..1D172 ; Grapheme_Extend # Mc [5] MUSICAL SYMBOL COMBINING FLAG-1..MUSICAL SYMBOL COMBINING FLAG-5
-1D17B..1D182 ; Grapheme_Extend # Mn [8] MUSICAL SYMBOL COMBINING ACCENT..MUSICAL SYMBOL COMBINING LOURE
-1D185..1D18B ; Grapheme_Extend # Mn [7] MUSICAL SYMBOL COMBINING DOIT..MUSICAL SYMBOL COMBINING TRIPLE TONGUE
-1D1AA..1D1AD ; Grapheme_Extend # Mn [4] MUSICAL SYMBOL COMBINING DOWN BOW..MUSICAL SYMBOL COMBINING SNAP PIZZICATO
-1D242..1D244 ; Grapheme_Extend # Mn [3] COMBINING GREEK MUSICAL TRISEME..COMBINING GREEK MUSICAL PENTASEME
-E0100..E01EF ; Grapheme_Extend # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256
+#From:
+# http://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt
+#Section:
+# Derived Property: Grapheme_Extend_List
END_OF_LIST
$grapheme_extend = []