Sha256: 3c19191c9b16044737aa0a232a1aad13422ed20f34accef6ee3ef39e3e396692

Contents?: true

Size: 1.74 KB

Versions: 2

Compression:

Stored size: 1.74 KB

Contents

# frozen_string_literal: true

# Utilities for manipulating Unihan data
module Unihan
  # Matches a Unicode codepoint label such as "U+4E00"; the hexadecimal digits
  # are captured in the named group +hex+.
  CODEPOINT_PATTERN = /U\+(?<hex>[A-F0-9]+)/.freeze

  class << self
    # Builds a pattern matching every codepoint whose +kUnihanCore2020+ value
    # contains all of the given tags. Codepoints without that field are never
    # selected unless no tags are given.
    #
    # @param dict_data [Hash<Integer,Hash{String => String}>]
    #   codepoint => { field name => value }, as returned by {parse_file}
    # @param tags [Array<String>] substrings that must all occur in the value
    # @return [Regexp] a pattern that never matches when nothing is selected
    def gen_unihan_core_pattern(dict_data, *tags)
      codepoints = dict_data.select do |_, data|
        core = data['kUnihanCore2020']
        tags.all? { |t| core&.include?(t) }
      end.keys

      gen_pattern(codepoints)
    end

    # Builds a single character class matching any of the given codepoints.
    # Runs of consecutive codepoints are collapsed into ranges.
    #
    # @param codepoints [Array<Integer>]
    # @return [Regexp] a pattern that never matches when +codepoints+ is empty
    def gen_pattern(codepoints)
      alts = group(codepoints).map do |first, last|
        if first == last
          format('\u{%x}', first)
        else
          format('\u{%<first>x}-\u{%<last>x}', first: first, last: last)
        end
      end

      # An empty character class ("[]") is a regexp syntax error, so an empty
      # set of codepoints is expressed as a pattern that can never match.
      return /(?!)/ if alts.empty?

      /[#{alts.join}]/
    end

    # Collapses a list of codepoints into inclusive [first, last] runs. A new
    # run starts whenever a codepoint is not exactly one greater than the
    # element before it, so the input is expected to be in ascending order.
    #
    # @param codepoints [Array<Integer>]
    # @return [Array<Array(Integer,Integer)>] empty when +codepoints+ is empty
    def group(codepoints)
      codepoints
        .slice_when { |prev, cp| cp - 1 != prev }
        .map { |run| [run.first, run.last] }
    end

    # Parses a tab-separated data file whose lines have the form
    # "U+XXXX<TAB>field<TAB>value". Comment lines (starting with "#"), blank
    # lines and lines without a codepoint label or field name are skipped.
    #
    # @param data_file [String] path of the file to read
    # @return [Hash<Integer,Hash{String => String}>]
    #   codepoint => { field name => value }
    def parse_file(data_file)
      result = {}

      File.foreach(data_file) do |raw|
        line = raw.chomp
        # Lines keep their newline when read, so blankness has to be tested
        # after stripping whitespace rather than on the raw line.
        next if line.start_with?('#') || line.strip.empty?

        codepoint, field, data = line.split("\t")
        match = CODEPOINT_PATTERN.match(codepoint)
        # Skip malformed lines instead of filing them under a nil key.
        next unless match && field

        (result[Integer(match['hex'], 16)] ||= {})[field] = data
      end

      result
    end
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
script_detector_2-0.1.1 tasks/unihan.rb
script_detector_2-0.1.0 tasks/unihan.rb