lib/unicoder/builders/sequence_name.rb in unicoder-1.0.0 vs lib/unicoder/builders/sequence_name.rb in unicoder-1.1.0

- old
+ new

@@ -1,14 +1,21 @@ module Unicoder module Builder class SequenceName include Builder + include ReplaceCommonWords + REPLACE_COUNT = 100 + REPLACE_BASE = ?{.ord + REPLACE_MIN_WORD_LENGTH = 3 + def initialize_index @index = { SEQUENCES: {}, + SEQUENCES_NOT_QUALIFIED: {}, } + @words = [] end def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false) if option =~ /charkeys/ key = codepoints.pack("U*") @@ -23,10 +30,12 @@ # ignore new one end else idx[key] = value end + + @words += value.split end def parse! parse_file :named_sequences, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line| assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"] @@ -59,13 +68,31 @@ next if line["type"] == "Basic_Emoji" name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name end - parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line| + parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line| name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase - assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name + codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) } + assign_codepoint codepoints, name + if codepoints.include?(0xFE0F) + # Build all combinations of VS16 present and missing + codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur| + if cur.include? 0xFE0F + acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] } + else + acc.map{|prev| prev + cur} + end + }. + select {|sub_codepoints| sub_codepoints != codepoints }. + each { |sub_codepoints| + assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED] + } + end end + + replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH + replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH end end end end