summary | refs | log | tree | commit | diff
path: root/data
diff options
context:
space:
mode:
author Steven G. Johnson <stevenj@mit.edu> 2018-05-02 14:18:26 -0400
committer GitHub <noreply@github.com> 2018-05-02 14:18:26 -0400
commit d81308faba0cfb3fccf8c3b12446863c7b76ae32 (patch)
tree a7c9e0da3b030c9f1182633fb37f4facafd0b63d /data
parent 86394501342fc7174a069a2d52d53f31b7ee62da (diff)
download libutf8proc-d81308faba0cfb3fccf8c3b12446863c7b76ae32.tar.gz
libutf8proc-d81308faba0cfb3fccf8c3b12446863c7b76ae32.tar.bz2
uppercase mapping ß (U+00df) to ẞ (U+1E9E) (#134)
* uppercase(0x00df) = 0x1e9e
* tests for titlecase and u+00df uppercase
* NEWS, another test
Diffstat (limited to 'data')
-rw-r--r-- data/data_generator.rb 26
1 files changed, 13 insertions, 13 deletions
diff --git a/data/data_generator.rb b/data/data_generator.rb
index fa09617..972f542 100644
--- a/data/data_generator.rb
+++ b/data/data_generator.rb
@@ -137,13 +137,13 @@ def cpary2utf16encoded(array)
end
def cpary2c(array)
return "UINT16_MAX" if array.nil? || array.length == 0
- lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
+ lencode = array.length - 1 #no sequence has len 0, so we encode len 1 as 0, len 2 as 1, ...
array = cpary2utf16encoded(array)
if lencode >= 7 #we have only 3 bits for the length (which is already cutting it close. might need to change it to 2 bits in future Unicode versions)
- array = [lencode] + array
+ array = [lencode] + array
lencode = 7
- end
- idx = pushary(array)
+ end
+ idx = pushary(array)
raise "Array index out of bound" if idx > 0x1FFF
return "#{idx | (lencode << 13)}"
end
@@ -188,9 +188,10 @@ class UnicodeChar
@decomp_mapping = ($8=='') ? nil :
$8.split.collect { |element| element.hex }
@bidi_mirrored = ($13=='Y') ? true : false
- @uppercase_mapping = ($16=='') ? nil : $16.hex
+ # issue #130: use nonstandard uppercase ß -> ẞ
+ @uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : nil) : $16.hex
@lowercase_mapping = ($17=='') ? nil : $17.hex
- @titlecase_mapping = ($18=='') ? nil : $18.hex
+ @titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
end
def case_folding
$case_folding[code]
@@ -260,17 +261,17 @@ chars.each do |char|
end
unless comb2nd_indicies[dm1]
comb2nd_indicies_sorted_keys << dm1
- comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
+ comb2nd_indicies[dm1] = comb2nd_indicies.keys.length
end
comb_array[comb1st_indicies[dm0]] ||= []
raise "Duplicate canonical mapping: #{char.code} #{dm0} #{dm1}" if comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]]
comb_array[comb1st_indicies[dm0]][comb2nd_indicies[dm1]] = char.code
-
+
comb2nd_indicies_nonbasic[dm1] = true if char.code > 0xFFFF
end
char.c_decomp_mapping = cpary2c(char.decomp_mapping)
char.c_case_folding = cpary2c(char.case_folding)
-end
+end
comb_indicies = {}
cumoffset = 0
@@ -281,7 +282,7 @@ comb1st_indicies.each do |dm0, index|
last = nil
offset = 0
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
- if comb_array[index][b]
+ if comb_array[index][b]
first = offset unless first
last = offset
last += 1 if comb2nd_indicies_nonbasic[dm1]
@@ -391,7 +392,7 @@ comb1st_indicies.keys.each_index do |a|
offset = 0
$stdout << comb1st_indicies_firstoffsets[a] << ", " << comb1st_indicies_lastoffsets[a] << ", "
comb2nd_indicies_sorted_keys.each_with_index do |dm1, b|
- break if offset > comb1st_indicies_lastoffsets[a]
+ break if offset > comb1st_indicies_lastoffsets[a]
if offset >= comb1st_indicies_firstoffsets[a]
i += 1
if i == 8
@@ -403,9 +404,8 @@ comb1st_indicies.keys.each_index do |a|
$stdout << (v & 0xFFFF) << ", "
end
offset += 1
- offset += 1 if comb2nd_indicies_nonbasic[dm1]
+ offset += 1 if comb2nd_indicies_nonbasic[dm1]
end
$stdout << "\n"
end
$stdout << "};\n\n"
-