lib/unicoder/builders/sequence_name.rb in unicoder-1.0.0 vs lib/unicoder/builders/sequence_name.rb in unicoder-1.1.0
- old
+ new
@@ -1,14 +1,21 @@
module Unicoder
module Builder
class SequenceName
include Builder
+ include ReplaceCommonWords
# The three constants below are passed, in this order, to replace_common_words!
# at the end of parse!.
# NOTE(review): ReplaceCommonWords is defined outside this view; the meaning of
# each value is inferred from its name only - confirm against that module.
# Presumably the number of most frequent words that get replaced - TODO confirm.
+ REPLACE_COUNT = 100
# Codepoint of "{" (123).
+ REPLACE_BASE = ?{.ord
# Presumably the shortest word length eligible for replacement - TODO confirm.
+ REPLACE_MIN_WORD_LENGTH = 3
+
# Sets up the builder state: @index gets one empty hash per sequence table
# (:SEQUENCES and :SEQUENCES_NOT_QUALIFIED) and @words starts as an empty array.
def initialize_index
@index = {
SEQUENCES: {},
+ SEQUENCES_NOT_QUALIFIED: {},
}
# Collects the words of the names handed to assign_codepoint.
+ @words = []
end
# Records value (a sequence name) for the given codepoints in idx, which
# defaults to @index[:SEQUENCES], and appends the name's words to @words.
# NOTE(review): this view is a diff; the lines between the two hunks (key
# construction without the charkeys option, handling of an already existing
# key, use of `combine`) are not visible here.
def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
if option =~ /charkeys/
# With the charkeys option the key is the codepoints encoded as a UTF-8 string.
key = codepoints.pack("U*")
@@ -23,10 +30,12 @@
# ignore new one
end
else
idx[key] = value
end
+
# Keep every whitespace-separated word of the name; parse! later passes
# @words to replace_common_words!.
+ @words += value.split
end
# Reads the sequence data files via parse_file, registers every name through
# assign_codepoint and finally calls replace_common_words! for both tables.
# NOTE(review): this view is a diff; the lines between the two hunks (end of
# the named_sequences block and the start of the following parse_file call)
# are not visible here.
def parse!
# The regex skips lines starting with "#" and captures "name;codepoints",
# where codepoints is a space-separated list of hex numbers.
parse_file :named_sequences, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"]
@@ -59,13 +68,31 @@
next if line["type"] == "Basic_Emoji"
# Turn literal \x{HEX} escapes in the name into the character they denote,
# then upcase the whole name.
name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
end
# The new regex adds (?!#) so that lines starting with "#" no longer match.
- parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
+ parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
- assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
+ codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
+ assign_codepoint codepoints, name
+ if codepoints.include?(0xFE0F)
+ # Build all combinations of VS16 present and missing
# slice_after cuts the sequence into chunks that each end right after a
# U+FE0F (a trailing chunk may contain none). The reduce then grows a list
# of variants chunk by chunk; the select drops the variant equal to the
# original sequence, and the rest are stored under :SEQUENCES_NOT_QUALIFIED.
+ codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
# acc: every variant built so far, cur: the next chunk.
+ if cur.include? 0xFE0F
# Fork each variant: one copy without the chunk's U+FE0F, one with it.
+ acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
+ else
+ acc.map{|prev| prev + cur}
+ end
+ }.
+ select {|sub_codepoints| sub_codepoints != codepoints }.
+ each { |sub_codepoints|
+ assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
+ }
+ end
end
+
# Both tables are processed with the same @words list and the same settings.
# NOTE(review): replace_common_words! is not defined in this view (presumably
# it comes from ReplaceCommonWords) - confirm what it does to the index.
+ replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
+ replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
end
end
end
end