lib/unicoder/builders/name.rb in unicoder-1.0.0 vs lib/unicoder/builders/name.rb in unicoder-1.1.0

- old
+ new

@@ -1,28 +1,44 @@ module Unicoder module Builder class Name + include Builder + include ReplaceCommonWords JAMO_INITIAL = 4352 JAMO_MEDIAL = 4449 JAMO_FINAL = 4520 JAMO_END = 4697 + CJK = "CJK UNIFIED IDEOGRAPH-" + TANGUT = "TANGUT IDEOGRAPH-" + + REPLACE_COUNT = 500 + REPLACE_BASE = ?[.ord + def initialize_index @index = { NAMES: {}, ALIASES: {}, - CJK: [], - HANGUL: [], + # HANGUL: [], + CP_RANGES: { + CJK => [], # filled while parsing + TANGUT => [], # filled while parsing + "EGYPTIAN HIEROGLYPH-" => [[0x13460, 0x143FA]], + "KHITAN SMALL SCRIPT CHARACTER-" => [[0x18B00, 0x18CFF]], + "NUSHU CHARACTER-" => [[0x1B170, 0x1B2FB]], + "CJK COMPATIBILITY IDEOGRAPH-" => [[0x2F800, 0x2FA1D]], + }, # see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_Syllables_Area JAMO: { INITIAL: [], MEDIAL: [], FINAL: [""], }, } + @words = [] @range_start = nil end def parse! if option =~ /charkeys/ @@ -34,24 +50,34 @@ parse_file :unicode_data, :line, regex: /^(?<codepoint>.+?);(?<name>.+?);.*$/ do |line| if line["name"][0] == "<" && line["name"][-1] == ">" if line["name"] =~ /First/ @range_start = line["codepoint"].to_i(16) elsif line["name"] =~ /Last/ && @range_start - if line["name"] =~ /Hangul/ - @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)] - elsif line["name"] =~ /CJK/ - @index[:CJK] << [@range_start, line["codepoint"].to_i(16)] + case line["name"] + when /Hangul/ + # currently not necessary + # @index[:HANGUL] << [@range_start, line["codepoint"].to_i(16)] + when /CJK/ + @index[:CP_RANGES][CJK] << [@range_start, line["codepoint"].to_i(16)] + when /Tangut/ + @index[:CP_RANGES][TANGUT] << [@range_start, line["codepoint"].to_i(16)] else # no name + warn "ignoring range: #{line["name"]}" end @range_start = nil elsif line["name"] != "<control>" raise ArgumentError, "inconsistent range found in data, don't know what to do" end + elsif line["name"] =~ Regexp.union(@index[:CP_RANGES].keys) + # ignore else assign :NAMES, line["codepoint"].to_i(16), line["name"] + @words += line["name"].split end end + + replace_common_words! :NAMES, @words, REPLACE_COUNT, REPLACE_BASE parse_file :name_aliases, :line, regex: /^(?<codepoint>.+?);(?<alias>.+?);(?<type>.*)$/ do |line| @index[:ALIASES][get_key[line["codepoint"].to_i(16)]] ||= {} @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] ||= [] @index[:ALIASES][get_key[line["codepoint"].to_i(16)]][line["type"].to_sym] << line["alias"]