lib/unicode/emoji.rb in unicode-emoji-1.1.0 vs lib/unicode/emoji.rb in unicode-emoji-2.0.0

- old
+ new

@@ -16,12 +16,14 @@ EMOJI_VARIATION_SELECTOR = 0xFE0F TEXT_VARIATION_SELECTOR = 0xFE0E EMOJI_TAG_BASE_FLAG = 0x1F3F4 CANCEL_TAG = 0xE007F + TAGS = [*0xE0020..0xE007E] EMOJI_KEYCAP_SUFFIX = 0x20E3 ZWJ = 0x200D + REGIONAL_INDICATORS = [*0x1F1E6..0x1F1FF] EMOJI_CHAR = INDEX[:PROPERTIES].select{ |ord, props| props.include?(:E) }.keys.freeze EMOJI_PRESENTATION = INDEX[:PROPERTIES].select{ |ord, props| props.include?(:P) }.keys.freeze TEXT_PRESENTATION = INDEX[:PROPERTIES].select{ |ord, props| props.include?(:E) && !props.include?(:P) }.keys.freeze EMOJI_COMPONENT = INDEX[:PROPERTIES].select{ |ord, props| props.include?(:C) }.keys.freeze @@ -34,10 +36,14 @@ VALID_SUBDIVISIONS = INDEX[:SD].freeze RECOMMENDED_SUBDIVISION_FLAGS = INDEX[:TAGS].freeze RECOMMENDED_ZWJ_SEQUENCES = INDEX[:ZWJ].freeze LIST = INDEX[:LIST].freeze.each_value(&:freeze) + LIST_REMOVED_KEYS = [ + "Smileys & People", + "Component", + ] pack = ->(ord){ Regexp.escape(Array(ord).pack("U*")) } join = -> (*strings){ "(?:" + strings.join("|") + ")" } pack_and_join = ->(ords){ join[*ords.map{ |ord| pack[ord] }] } @@ -59,10 +65,13 @@ join[ pack_and_join[TEXT_PRESENTATION] + pack[EMOJI_VARIATION_SELECTOR], emoji_presentation + "(?!" + pack[TEXT_VARIATION_SELECTOR] + ")" + pack[EMOJI_VARIATION_SELECTOR] + "?", ] + non_component_emoji_presentation_sequence = \ + "(?!" + emoji_component + ")" + emoji_presentation_sequence + text_presentation_sequence = \ join[ pack_and_join[TEXT_PRESENTATION]+ "(?!" 
+ join[emoji_modifier, pack[EMOJI_VARIATION_SELECTOR]] + ")" + pack[TEXT_VARIATION_SELECTOR] + "?", emoji_presentation + pack[TEXT_VARIATION_SELECTOR] ] @@ -71,50 +80,102 @@ emoji_modifier_base + emoji_modifier emoji_keycap_sequence = \ pack_and_join[EMOJI_KEYCAPS] + pack[[EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]] - emoji_valid_region_sequence = \ + emoji_valid_flag_sequence = \ pack_and_join[VALID_REGION_FLAGS] + emoji_well_formed_flag_sequence = \ + "(?:" + + pack_and_join[REGIONAL_INDICATORS] + + pack_and_join[REGIONAL_INDICATORS] + + ")" + + emoji_valid_core_sequence = \ + join[ + # emoji_character, + emoji_keycap_sequence, + emoji_modifier_sequence, + non_component_emoji_presentation_sequence, + emoji_valid_flag_sequence, + ] + + emoji_well_formed_core_sequence = \ + join[ + # emoji_character, + emoji_keycap_sequence, + emoji_modifier_sequence, + non_component_emoji_presentation_sequence, + emoji_well_formed_flag_sequence, + ] + + emoji_rgi_tag_sequence = \ + pack_and_join[RECOMMENDED_SUBDIVISION_FLAGS] + emoji_valid_tag_sequence = \ "(?:" + pack[EMOJI_TAG_BASE_FLAG] + "(?:" + VALID_SUBDIVISIONS.map{ |sd| Regexp.escape(sd.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}"))}.join("|") + ")" + pack[CANCEL_TAG] + ")" - emoji_zwj_element = \ + emoji_well_formed_tag_sequence = \ + "(?:" + + join[ + non_component_emoji_presentation_sequence, + emoji_modifier_sequence, + ] + + pack_and_join[TAGS] + "+" + + pack[CANCEL_TAG] + + ")" + + emoji_rgi_zwj_sequence = \ + pack_and_join[RECOMMENDED_ZWJ_SEQUENCES] + + emoji_valid_zwj_element = \ join[ emoji_modifier_sequence, emoji_presentation_sequence, emoji_character, ] - # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences - REGEX = Regexp.compile( - pack_and_join[RECOMMENDED_ZWJ_SEQUENCES] + - ?| + pack_and_join[RECOMMENDED_SUBDIVISION_FLAGS] + - ?| + emoji_modifier_sequence + - ?| + "(?!" 
+ emoji_component + ")" + emoji_presentation_sequence + - ?| + emoji_keycap_sequence + - ?| + emoji_valid_region_sequence + - "" - ) + emoji_valid_zwj_sequence = \ + "(?:" + + "(?:" + emoji_valid_zwj_element + pack[ZWJ] + ")+" + emoji_valid_zwj_element + + ")" + emoji_rgi_sequence = \ + join[ + emoji_rgi_zwj_sequence, + emoji_rgi_tag_sequence, + emoji_valid_core_sequence, + ] + + emoji_valid_sequence = \ + join[ + emoji_valid_zwj_sequence, + emoji_valid_tag_sequence, + emoji_valid_core_sequence, + ] + + emoji_well_formed_sequence = \ + join[ + emoji_valid_zwj_sequence, + emoji_well_formed_tag_sequence, + emoji_well_formed_core_sequence, + ] + + # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi) + REGEX = Regexp.compile(emoji_rgi_sequence) + # Matches basic singleton emoji and all kind of valid sequences - REGEX_VALID = Regexp.compile( - # EMOJI_TAGS.map{ |base, spec| "(?:" + pack[base] + "[" + pack[spec] + "]+" + pack[CANCEL_TAG] + ")" }.join("|") + - emoji_valid_tag_sequence + - ?| + "(?:" + "(?:" + emoji_zwj_element + pack[ZWJ] + "){1,3}" + emoji_zwj_element + ")" + - ?| + emoji_modifier_sequence + - ?| + "(?!" + emoji_component + ")" + emoji_presentation_sequence + - ?| + emoji_keycap_sequence + - ?| + emoji_valid_region_sequence + - "" - ) + REGEX_VALID = Regexp.compile(emoji_valid_sequence) + # Matches basic singleton emoji and all kind of sequences + REGEX_WELL_FORMED = Regexp.compile(emoji_well_formed_sequence) + # Matches only basic single, non-textual emoji # Ignores "components" like modifiers or simple digits REGEX_BASIC = Regexp.compile( "(?!" + emoji_component + ")" + emoji_presentation_sequence ) @@ -123,15 +184,20 @@ # Ignores "components" like modifiers or simple digits REGEX_TEXT = Regexp.compile( "(?!" 
+ emoji_component + ")" + text_presentation_sequence ) - # Matches any emoji-related codepoint + # Matches any emoji-related codepoint - Use with caution (returns partial matches) REGEX_ANY = Regexp.compile( emoji_character ) + # Combined REGEXes which also match for TEXTUAL emoji + REGEX_INCLUDE_TEXT = Regexp.union(REGEX, REGEX_TEXT) + REGEX_VALID_INCLUDE_TEXT = Regexp.union(REGEX_VALID, REGEX_TEXT) + REGEX_WELL_FORMED_INCLUDE_TEXT = Regexp.union(REGEX_WELL_FORMED, REGEX_TEXT) + def self.properties(char) ord = get_codepoint_value(char) props = INDEX[:PROPERTIES][ord] if props @@ -141,9 +207,12 @@ end end def self.list(key = nil, sub_key = nil) return LIST unless key || sub_key + if LIST_REMOVED_KEYS.include?(key) + $stderr.puts "Warning(unicode-emoji): The category of #{key} does not exist anymore" + end LIST.dig(*[key, sub_key].compact) end def self.get_codepoint_value(char) ord = nil