#!/usr/pkg/bin/ruby # This file was used to generate the 'unicode_data.c' file by parsing the # Unicode data file 'UnicodeData.txt' of the Unicode Character Database. # It is included for informational purposes only and not intended for # production use. # Copyright (c) 2009 Public Software Group e. V., Berlin, Germany # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. # This file contains derived data from a modified version of the # Unicode data files. The following license applies to that data: # # COPYRIGHT AND PERMISSION NOTICE # # Copyright (c) 1991-2007 Unicode, Inc. All rights reserved. Distributed # under the Terms of Use in http://www.unicode.org/copyright.html. # # Permission is hereby granted, free of charge, to any person obtaining a # copy of the Unicode data files and any associated documentation (the "Data # Files") or Unicode software and any associated documentation (the # "Software") to deal in the Data Files or Software without restriction, # including without limitation the rights to use, copy, modify, merge, # publish, distribute, and/or sell copies of the Data Files or Software, and # to permit persons to whom the Data Files or Software are furnished to do # so, provided that (a) the above copyright notice(s) and this permission # notice appear with all copies of the Data Files or Software, (b) both the # above copyright notice(s) and this permission notice appear in associated # documentation, and (c) there is clear notice in each modified Data File or # in the Software as well as in the documentation associated with the Data # File(s) or Software that the data or software has been modified. # # THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY # KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF # THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS # INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR # CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF # USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR # PERFORMANCE OF THE DATA FILES OR SOFTWARE. # # Except as contained in this notice, the name of a copyright holder shall # not be used in advertising or otherwise to promote the sale, use or other # dealings in these Data Files or Software without prior written # authorization of the copyright holder. $ignorable_list = <.. 000E..001F ; Default_Ignorable_Code_Point # Cc [18] .. 007F..0084 ; Default_Ignorable_Code_Point # Cc [6] .. 0086..009F ; Default_Ignorable_Code_Point # Cc [26] .. 00AD ; Default_Ignorable_Code_Point # Cf SOFT HYPHEN 034F ; Default_Ignorable_Code_Point # Mn COMBINING GRAPHEME JOINER 0600..0603 ; Default_Ignorable_Code_Point # Cf [4] ARABIC NUMBER SIGN..ARABIC SIGN SAFHA 06DD ; Default_Ignorable_Code_Point # Cf ARABIC END OF AYAH 070F ; Default_Ignorable_Code_Point # Cf SYRIAC ABBREVIATION MARK 115F..1160 ; Default_Ignorable_Code_Point # Lo [2] HANGUL CHOSEONG FILLER..HANGUL JUNGSEONG FILLER 17B4..17B5 ; Default_Ignorable_Code_Point # Cf [2] KHMER VOWEL INHERENT AQ..KHMER VOWEL INHERENT AA 180B..180D ; Default_Ignorable_Code_Point # Mn [3] MONGOLIAN FREE VARIATION SELECTOR ONE..MONGOLIAN FREE VARIATION SELECTOR THREE 200B..200F ; Default_Ignorable_Code_Point # Cf [5] ZERO WIDTH SPACE..RIGHT-TO-LEFT MARK 202A..202E ; Default_Ignorable_Code_Point # Cf [5] LEFT-TO-RIGHT EMBEDDING..RIGHT-TO-LEFT OVERRIDE 2060..2063 ; Default_Ignorable_Code_Point # Cf [4] WORD JOINER..INVISIBLE SEPARATOR 2064..2069 ; Default_Ignorable_Code_Point # Cn [6] .. 206A..206F ; Default_Ignorable_Code_Point # Cf [6] INHIBIT SYMMETRIC SWAPPING..NOMINAL DIGIT SHAPES 3164 ; Default_Ignorable_Code_Point # Lo HANGUL FILLER D800..DFFF ; Default_Ignorable_Code_Point # Cs [2048] .. FE00..FE0F ; Default_Ignorable_Code_Point # Mn [16] VARIATION SELECTOR-1..VARIATION SELECTOR-16 FEFF ; Default_Ignorable_Code_Point # Cf ZERO WIDTH NO-BREAK SPACE FFA0 ; Default_Ignorable_Code_Point # Lo HALFWIDTH HANGUL FILLER FFF0..FFF8 ; Default_Ignorable_Code_Point # Cn [9] .. 1D173..1D17A ; Default_Ignorable_Code_Point # Cf [8] MUSICAL SYMBOL BEGIN BEAM..MUSICAL SYMBOL END PHRASE E0001 ; Default_Ignorable_Code_Point # Cf LANGUAGE TAG E0002..E001F ; Default_Ignorable_Code_Point # Cn [30] .. E0020..E007F ; Default_Ignorable_Code_Point # Cf [96] TAG SPACE..CANCEL TAG E0080..E00FF ; Default_Ignorable_Code_Point # Cn [128] .. E0100..E01EF ; Default_Ignorable_Code_Point # Mn [240] VARIATION SELECTOR-17..VARIATION SELECTOR-256 E01F0..E0FFF ; Default_Ignorable_Code_Point # Cn [3600] .. END_OF_LIST $ignorable = [] $ignorable_list.each do |entry| if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/ $1.hex.upto($2.hex) { |e2| $ignorable << e2 } elsif entry =~ /^[0-9A-F]+/ $ignorable << $&.hex end end $grapheme_extend_list = <)? # decomposition type ((\ ?[0-9A-F]+)*); # decompomposition mapping ([0-9]*); # decimal digit ([0-9]*); # digit ([^;]*); # numeric ([YN]*); # bidi mirrored ([^;]*); # unicode 1.0 name ([^;]*); # iso comment ([0-9A-F]*); # simple uppercase mapping ([0-9A-F]*); # simple lowercase mapping ([0-9A-F]*)$/ix # simple titlecase mapping @code = $1.hex @name = $2 @category = $3 @combining_class = Integer($4) @bidi_class = $5 @decomp_type = $7 @decomp_mapping = ($8=='') ? nil : $8.split.collect { |element| element.hex } @bidi_mirrored = ($13=='Y') ? true : false @uppercase_mapping = ($16=='') ? nil : $16.hex @lowercase_mapping = ($17=='') ? nil : $17.hex @titlecase_mapping = ($18=='') ? nil : $18.hex end def case_folding $case_folding[code] end def c_entry(comb1_indicies, comb2_indicies) " " << "{#{str2c category, 'CATEGORY'}, #{combining_class}, " << "#{str2c bidi_class, 'BIDI_CLASS'}, " << "#{str2c decomp_type, 'DECOMP_TYPE'}, " << "#{ary2c decomp_mapping}, " << "#{bidi_mirrored}, " << "#{uppercase_mapping or -1}, " << "#{lowercase_mapping or -1}, " << "#{titlecase_mapping or -1}, " << "#{comb1_indicies[code] ? (comb1_indicies[code]*comb2_indicies.keys.length) : -1 }, #{comb2_indicies[code] or -1}, " << "#{$exclusions.include?(code) or $excl_version.include?(code)}, " << "#{$ignorable.include?(code)}, " << "#{%W[Zl Zp Cc Cf].include?(category) and not [0x200C, 0x200D].include?(category)}, " << "#{$grapheme_extend.include?(code)}, " << "#{ary2c case_folding}},\n" end end chars = [] char_hash = {} while gets if $_ =~ /^([0-9A-F]+);<[^;>,]+, First>;/i first = $1.hex gets char = UnicodeChar.new($_) raise "No last character of sequence found." unless $_ =~ /^([0-9A-F]+);<([^;>,]+), Last>;/i last = $1.hex name = "<#{$2}>" for i in first..last char_clone = char.clone char_clone.code = i char_clone.name = name char_hash[char_clone.code] = char_clone chars << char_clone end else char = UnicodeChar.new($_) char_hash[char.code] = char chars << char end end comb1st_indicies = {} comb2nd_indicies = {} comb_array = [] chars.each do |char| if char.decomp_type.nil? and char.decomp_mapping and char.decomp_mapping.length == 2 and char_hash[char.decomp_mapping[0]].combining_class == 0 and not $exclusions.include?(char.code) unless comb1st_indicies[char.decomp_mapping[0]] comb1st_indicies[char.decomp_mapping[0]] = comb1st_indicies.keys.length end unless comb2nd_indicies[char.decomp_mapping[1]] comb2nd_indicies[char.decomp_mapping[1]] = comb2nd_indicies.keys.length end comb_array[comb1st_indicies[char.decomp_mapping[0]]] ||= [] raise "Duplicate canonical mapping" if comb_array[comb1st_indicies[char.decomp_mapping[0]]][ comb2nd_indicies[char.decomp_mapping[1]]] comb_array[comb1st_indicies[char.decomp_mapping[0]]][ comb2nd_indicies[char.decomp_mapping[1]]] = char.code end end properties_indicies = {} properties = [] chars.each do |char| c_entry = char.c_entry(comb1st_indicies, comb2nd_indicies) unless properties_indicies[c_entry] properties_indicies[c_entry] = properties.length properties << c_entry end end stage1 = [] stage2 = [] for code in 0...0x110000 next unless code % 0x100 == 0 stage2_entry = [] for code2 in code...(code+0x100) if char_hash[code2] stage2_entry << (properties_indicies[char_hash[code2].c_entry( comb1st_indicies, comb2nd_indicies)] + 1) else stage2_entry << 0 end end old_index = stage2.index(stage2_entry) if old_index stage1 << (old_index * 0x100) else stage1 << (stage2.length * 0x100) stage2 << stage2_entry end end $stdout << "const int32_t utf8proc_sequences[] = {\n " i = 0 $int_array.each do |entry| i += 1 if i == 8 i = 0 $stdout << "\n " end $stdout << entry << ", " end $stdout << "};\n\n" $stdout << "const uint16_t utf8proc_stage1table[] = {\n " i = 0 stage1.each do |entry| i += 1 if i == 8 i = 0 $stdout << "\n " end $stdout << entry << ", " end $stdout << "};\n\n" $stdout << "const uint16_t utf8proc_stage2table[] = {\n " i = 0 stage2.flatten.each do |entry| i += 1 if i == 8 i = 0 $stdout << "\n " end $stdout << entry << ", " end $stdout << "};\n\n" $stdout << "const utf8proc_property_t utf8proc_properties[] = {\n" $stdout << " {0, 0, 0, 0, NULL, false, -1, -1, -1, -1, -1, false},\n" properties.each { |line| $stdout << line } $stdout << "};\n\n" $stdout << "const int32_t utf8proc_combinations[] = {\n " i = 0 comb1st_indicies.keys.each_index do |a| comb2nd_indicies.keys.each_index do |b| i += 1 if i == 8 i = 0 $stdout << "\n " end $stdout << ( comb_array[a][b] or -1 ) << ", " end end $stdout << "};\n\n"