# frozen_string_literal: true

# Set is only autoloaded on newer Rubies; requiring it is idempotent.
require "set"

module UnihanLang
  # Classifies characters against three sets held by the instance:
  # +zh_tw+, +zh_cn+ and +common+. The sets start empty in #initialize and are
  # then filled by #load_chinese_characters, which reads
  # data/Unihan_Variants.txt.
  class ChineseProcessor
    attr_reader :zh_tw, :zh_cn, :common

    # Creates the three empty sets and immediately loads the character data.
    def initialize
      @zh_tw = Set.new
      @zh_cn = Set.new
      @common = Set.new
      load_chinese_characters
    end

    # @param char [String] character to test
    # @return [Boolean] true when +char+ is in the zh_tw set or the common set
    def zh_tw?(char)
      @zh_tw.include?(char) || @common.include?(char)
    end

    # @param char [String] character to test
    # @return [Boolean] true when +char+ is in the zh_cn set or the common set
    def zh_cn?(char)
      @zh_cn.include?(char) || @common.include?(char)
    end

    # @param char [String] character to test
    # @return [Boolean] true when +char+ is in the zh_tw set and not in the
    #   common set
    def only_zh_tw?(char)
      @zh_tw.include?(char) && !@common.include?(char)
    end

    # Mirrors #only_zh_tw?: a character that is also in the common set is not
    # "only" zh_cn.
    #
    # @param char [String] character to test
    # @return [Boolean] true when +char+ is in the zh_cn set and not in the
    #   common set
    def only_zh_cn?(char)
      @zh_cn.include?(char) && !@common.include?(char)
    end

    # @param char [String] character to test
    # @return [Boolean] true when +char+ is in any of the loaded sets or its
    #   code point falls in the range checked by #cjk?
    def chinese?(char)
      zh_tw?(char) || zh_cn?(char) || cjk?(char)
    end

    # Same check as #chinese?, under a longer name.
    #
    # @param char [String] character to test
    # @return [Boolean]
    def chinese_character?(char)
      chinese?(char)
    end

    private

    # Range check on the code point returned by +char.ord+ (for a String this
    # is the code point of its first character).
    #
    # @param char [String, nil] character to test
    # @return [Boolean] true when the code point is within U+4E00..U+9FFF;
    #   false for nil or an empty string
    def cjk?(char)
      # "".ord raises ArgumentError and nil has no #ord; neither is a
      # character in the range, so answer false instead of raising.
      return false if char.nil? || char == ""

      char.ord.between?(0x4E00, 0x9FFF)
    end

    # Loads the variant data, then runs #process_character_sets.
    def load_chinese_characters
      load_unihan_variants
      process_character_sets
    end

    # Streams data/Unihan_Variants.txt (resolved relative to this file) line by
    # line. Lines starting with "#" and blank lines are skipped; every other
    # line is split on tabs and handed to #process_unihan_fields when it has at
    # least three fields.
    def load_unihan_variants
      file_path = File.join(File.dirname(__FILE__), "..", "..", "data", "Unihan_Variants.txt")
      File.foreach(file_path, encoding: "UTF-8") do |line|
        stripped = line.strip
        next if line.start_with?("#") || stripped.empty?

        fields = stripped.split("\t")
        process_unihan_fields(fields) if fields.size >= 3
      end
    end

    # Handles one tab-split record from the variants file.
    #
    # @param fields [Array<String>] the split record; fields[0] is a "U+XXXX"
    #   code point string
    def process_unihan_fields(fields)
      # Strip the "U+" prefix, parse the hex digits and encode that code point
      # as a UTF-8 string.
      from = [fields[0].gsub(/^U\+/, "").hex].pack("U")
      # Remove dictionary name.
      # Example: U+348B kSemanticVariant U+5EDD
      # NOTE(review): the reviewed source ends here. The remainder of this
      # method and the definition of #process_character_sets were not visible
      # and must be restored from the full file.
    end
  end
end