summaryrefslogtreecommitdiff
path: root/data/charwidths.jl
diff options
context:
space:
mode:
Diffstat (limited to 'data/charwidths.jl')
-rw-r--r--data/charwidths.jl29
1 files changed, 17 insertions, 12 deletions
diff --git a/data/charwidths.jl b/data/charwidths.jl
index ba54ccb..7b3d158 100644
--- a/data/charwidths.jl
+++ b/data/charwidths.jl
@@ -20,12 +20,12 @@ import Base.UTF8proc
#############################################################################
# Use a default width of 1 for all character categories that are
-# letter/symbol/number-like. This can be overriden by Unifont or UAX 11
+# letter/symbol/number-like, as well as for unassigned/private-use chars.
+# This can be overriden by Unifont or UAX 11
# below, but provides a useful nonzero fallback for new codepoints when
# a new Unicode version has been released but Unifont hasn't been updated yet.
zerowidth = Set{Int}() # categories that may contain zero-width chars
-push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CN)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MN)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_MC)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ME)
@@ -36,7 +36,6 @@ push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_ZP)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CC)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CF)
push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CS)
-push!(zerowidth, UTF8proc.UTF8PROC_CATEGORY_CO)
for c in 0x0000:0x110000
if catcode(c) ∉ zerowidth
CharWidths[c] = 1
@@ -102,7 +101,7 @@ for line in readlines(open("EastAsianWidth.txt"))
for c in charstart:charend
if width=="W" || width=="F" # wide or full
CharWidths[c]=2
- elseif width=="Na"|| width=="H" # narrow or half
+ elseif width=="Na"|| width=="H"
CharWidths[c]=1
end
end
@@ -115,9 +114,11 @@ end
for c in keys(CharWidths)
cat = catcode(c)
- # make sure format control character (category Cf) have width 0,
- # except for the Arabic characters 0x06xx (see unicode std 6.2, sec. 8.2)
- if cat==UTF8proc.UTF8PROC_CATEGORY_CF && c ∉ [0x0601,0x0602,0x0603,0x06dd]
+ # make sure format control character (category Cf) have width 0
+ # (some of these, like U+0601, can have a width in some cases
+ # but normally act like prepended combining marks. U+fff9 etc
+ # are also odd, but have zero width in typical terminal contexts)
+ if cat==UTF8proc.UTF8PROC_CATEGORY_CF
CharWidths[c]=0
end
@@ -128,11 +129,12 @@ for c in keys(CharWidths)
CharWidths[c]=0
end
- # We also assign width of zero to unassigned and private-use
+ # We also assign width of one to unassigned and private-use
# codepoints (Unifont includes ConScript Unicode Registry PUA fonts,
- # but since these are nonstandard it seems questionable to recognize them).
+ # but since these are nonstandard it seems questionable to use Unifont metrics;
+ # if they are printed as the replacement character U+FFFD they will have width 1).
if cat==UTF8proc.UTF8PROC_CATEGORY_CO || cat==UTF8proc.UTF8PROC_CATEGORY_CN
- CharWidths[c]=0
+ CharWidths[c]=1
end
# for some reason, Unifont has width-2 glyphs for ASCII control chars
@@ -141,6 +143,9 @@ for c in keys(CharWidths)
end
end
+#Soft hyphen is typically printed as a hyphen (-) in terminals.
+CharWidths[0x00ad]=1
+
#By definition, should have zero width (on the same line)
#0x002028 '
' category: Zl name: LINE SEPARATOR/
#0x002029 '
' category: Zp name: PARAGRAPH SEPARATOR/
@@ -158,8 +163,8 @@ CharWidths[0x2001]=2
CharWidths[0x2003]=2
#############################################################################
-# Output (to a file or pipe) for processing by data_generator.rb
-# ... don't bother to output zero widths since that will be the default.
+# Output (to a file or pipe) for processing by data_generator.rb,
+# encoded as a sequence of intervals.
firstc = 0x000000
lastv = 0