lib/unicoder/builders/display_width.rb in unicoder-0.1.0 vs lib/unicoder/builders/display_width.rb in unicoder-1.0.0
- old
+ new
@@ -4,11 +4,27 @@
include Builder
include MultiDimensionalArrayBuilder
# General categories whose data-file lines parse! skips entirely
# (Unicode General_Category codes: Cs = Surrogate, Co = Private Use, Cn = Unassigned).
IGNORE_CATEGORIES = %w[Cs Co Cn].freeze
# General categories that determine_width maps to width 0, except for a
# codepoint matched by its Arabic-script format-character check
# (Mn = Nonspacing Mark, Me = Enclosing Mark, Cf = Format).
ZERO_WIDTH_CATEGORIES = %w[Mn Me Cf].freeze
- ZERO_WIDTH_CODEPOINTS = [*0x1160..0x11FF].freeze
+
+ ZERO_WIDTH_RANGES = [ # ranges are splatted, so this is a flat list of single codepoints; parse! assigns each one width 0
+ *0x1160..0x11FF, # HANGUL JUNGSEONG and JONGSEONG (Hangul Jamo medial vowels and final consonants)
+ *0xD7B0..0xD7FF, # HANGUL JUNGSEONG and JONGSEONG (Hangul Jamo Extended-B)
+ *0x2060..0x206F, # Ignorables (invisible format characters at the end of General Punctuation)
+ *0xFFF0..0xFFF8, # Ignorables (start of the Specials block)
+ *0xE0000..0xE0FFF, # Ignorables (Tags / Variation Selectors Supplement region of plane 14)
+ ].freeze
+
+ WIDE_RANGES = [ # ranges are splatted into single codepoints; parse! assigns each one width 2 (presumably mirroring UAX #11's default-Wide ranges, which include unassigned codepoints)
+ *0x3400..0x4DBF, # CJK Unified Ideographs Extension A
+ *0x4E00..0x9FFF, # CJK Unified Ideographs
+ *0xF900..0xFAFF, # CJK Compatibility Ideographs
+ *0x20000..0x2FFFD, # Plane 2 (Supplementary Ideographic Plane)
+ *0x30000..0x3FFFD, # Plane 3 (Tertiary Ideographic Plane)
+ ].freeze
+
SPECIAL_WIDTHS = {
0x0 => 0, # \0 NULL
0x5 => 0, # ENQUIRY
0x7 => 0, # \a BELL
0x8 => -1, # \b BACKSPACE
@@ -16,21 +32,21 @@
0xB => 0, # \v LINE TABULATION
0xC => 0, # \f FORM FEED
0xD => 0, # \r CARRIAGE RETURN
0xE => 0, # SHIFT OUT
0xF => 0, # SHIFT IN
- 0x00AD => 1, # SOFT HYPHEN
+ 0x00AD => nil, # SOFT HYPHEN
0x2E3A => 2, # TWO-EM DASH
0x2E3B => 3, # THREE-EM DASH
}.freeze
# Points @index at a brand-new, empty Array.
# Each call allocates a new Array, so any previously held index is dropped.
def initialize_index
  @index = Array.new
end
def parse!
- parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?);(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
+ parse_file :east_asian_width, :line, regex: /^(?<codepoints>\S+?)\s*;\s*(?<width>\S+)\s+#\s(?<category>\S+).*$/ do |line|
next if IGNORE_CATEGORIES.include?(line["category"])
if line["codepoints"]['..']
codepoints = Range.new(*line["codepoints"].split('..').map{ |codepoint|
codepoint.to_i(16)
@@ -42,22 +58,27 @@
codepoints.each{ |codepoint|
assign_codepoint codepoint, determine_width(codepoint, line["category"], line["width"])
}
end
+ ZERO_WIDTH_RANGES.each{ |codepoint|
+ assign_codepoint codepoint, 0
+ }
+
+ WIDE_RANGES.each{ |codepoint|
+ assign_codepoint codepoint, 2
+ }
+
SPECIAL_WIDTHS.each{ |codepoint, value|
assign_codepoint codepoint, value
}
4.times{ compress! }
-
- p @index
end
def determine_width(codepoint, category, east_asian_width)
if ( ZERO_WIDTH_CATEGORIES.include?(category) &&
- [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ ) ||
- ZERO_WIDTH_CODEPOINTS.include?(codepoint)
+ [codepoint].pack('U') !~ /\p{Cf}(?<=\p{Arabic})/ )
0
elsif east_asian_width == "F" || east_asian_width == "W"
2
elsif east_asian_width == "A"
:A