sequence_name.rb in unicoder-1.1.0

- old
+ new

@@ -1,14 +1,21 @@
 module Unicoder
   module Builder
     class SequenceName
       include Builder
+      include ReplaceCommonWords
 
+      REPLACE_COUNT = 100 # forwarded to replace_common_words!; presumably the number of common words to replace (confirm in ReplaceCommonWords)
+      REPLACE_BASE = ?{.ord # codepoint of "{" (123); presumably where the replacement codepoints start (confirm in ReplaceCommonWords)
+      REPLACE_MIN_WORD_LENGTH = 3 # forwarded to replace_common_words!; presumably shorter words are left untouched (confirm in ReplaceCommonWords)
+
       def initialize_index # Sets up the empty index tables and the empty word list used by this builder
         @index = {
           SEQUENCES: {}, # default target table of assign_codepoint
+          SEQUENCES_NOT_QUALIFIED: {}, # filled in parse! with ZWJ sequence variants that omit one or more U+FE0F (VS16)
         }
+        @words = [] # assign_codepoint appends the words of each value here; parse! passes the list to replace_common_words!
       end
 
       def assign_codepoint(codepoints, value, idx = @index[:SEQUENCES], combine: false)
         if option =~ /charkeys/
           key = codepoints.pack("U*")
@@ -23,10 +30,12 @@
             # ignore new one
           end
         else
           idx[key] = value
         end
+
+        @words += value.split
       end
 
       def parse!
         parse_file :named_sequences, :line, regex: /^(?!#)(?<name>.+?);(?<codepoints>.+?)$/ do |line|
           assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, line["name"]
@@ -59,13 +68,31 @@
           next if line["type"] == "Basic_Emoji"
           name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
           assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
         end
 
-        parse_file :emoji_zwj_sequences, :line, regex: /^(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
+        parse_file :emoji_zwj_sequences, :line, regex: /^(?!#)(?<codepoints>.+?)\s*;.*?; (?<name>.+?)\s*#/ do |line|
           name = line["name"].gsub(/\\x{(\h+)}/){ [$1.to_i(16)].pack("U") }.upcase
-          assign_codepoint line["codepoints"].split.map{|cp| cp.to_i(16) }, name
+          codepoints = line["codepoints"].split.map{|cp| cp.to_i(16) }
+          assign_codepoint codepoints, name
+          if codepoints.include?(0xFE0F)
+            # Build all combinations of VS16 present and missing
+            codepoints.slice_after(0xFE0F).reduce([[]]){|acc,cur|
+              if cur.include? 0xFE0F
+                acc.flat_map{|prev| [prev + (cur - [0xFE0F]), prev + cur] }
+              else
+                acc.map{|prev| prev + cur}
+              end
+            }.
+            select {|sub_codepoints| sub_codepoints != codepoints }.
+            each { |sub_codepoints|
+              assign_codepoint (sub_codepoints), name, @index[:SEQUENCES_NOT_QUALIFIED]
+            }
+          end
         end
+
+        replace_common_words! :SEQUENCES, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
+        replace_common_words! :SEQUENCES_NOT_QUALIFIED, @words, REPLACE_COUNT, REPLACE_BASE, REPLACE_MIN_WORD_LENGTH
       end
     end
   end
 end