lib/unicode/emoji.rb in unicode-emoji-1.1.0 vs lib/unicode/emoji.rb in unicode-emoji-2.0.0
- old
+ new
@@ -16,12 +16,14 @@
EMOJI_VARIATION_SELECTOR = 0xFE0F
TEXT_VARIATION_SELECTOR = 0xFE0E
EMOJI_TAG_BASE_FLAG = 0x1F3F4
CANCEL_TAG = 0xE007F
+ TAGS = [*0xE0020..0xE007E]
EMOJI_KEYCAP_SUFFIX = 0x20E3
ZWJ = 0x200D
+ REGIONAL_INDICATORS = [*0x1F1E6..0x1F1FF]
EMOJI_CHAR = INDEX[:PROPERTIES].select{ |ord, props| props.include?(:E) }.keys.freeze
EMOJI_PRESENTATION = INDEX[:PROPERTIES].select{ |ord, props| props.include?(:P) }.keys.freeze
TEXT_PRESENTATION = INDEX[:PROPERTIES].select{ |ord, props| props.include?(:E) && !props.include?(:P) }.keys.freeze
EMOJI_COMPONENT = INDEX[:PROPERTIES].select{ |ord, props| props.include?(:C) }.keys.freeze
@@ -34,10 +36,14 @@
VALID_SUBDIVISIONS = INDEX[:SD].freeze
RECOMMENDED_SUBDIVISION_FLAGS = INDEX[:TAGS].freeze
RECOMMENDED_ZWJ_SEQUENCES = INDEX[:ZWJ].freeze
LIST = INDEX[:LIST].freeze.each_value(&:freeze)
+ LIST_REMOVED_KEYS = [
+ "Smileys & People",
+ "Component",
+ ]
pack = ->(ord){ Regexp.escape(Array(ord).pack("U*")) }
join = -> (*strings){ "(?:" + strings.join("|") + ")" }
pack_and_join = ->(ords){ join[*ords.map{ |ord| pack[ord] }] }
@@ -59,10 +65,13 @@
join[
pack_and_join[TEXT_PRESENTATION] + pack[EMOJI_VARIATION_SELECTOR],
emoji_presentation + "(?!" + pack[TEXT_VARIATION_SELECTOR] + ")" + pack[EMOJI_VARIATION_SELECTOR] + "?",
]
+ non_component_emoji_presentation_sequence = \
+ "(?!" + emoji_component + ")" + emoji_presentation_sequence
+
text_presentation_sequence = \
join[
pack_and_join[TEXT_PRESENTATION]+ "(?!" + join[emoji_modifier, pack[EMOJI_VARIATION_SELECTOR]] + ")" + pack[TEXT_VARIATION_SELECTOR] + "?",
emoji_presentation + pack[TEXT_VARIATION_SELECTOR]
]
@@ -71,50 +80,102 @@
emoji_modifier_base + emoji_modifier
emoji_keycap_sequence = \
pack_and_join[EMOJI_KEYCAPS] + pack[[EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]]
- emoji_valid_region_sequence = \
+ emoji_valid_flag_sequence = \
pack_and_join[VALID_REGION_FLAGS]
+ emoji_well_formed_flag_sequence = \
+ "(?:" +
+ pack_and_join[REGIONAL_INDICATORS] +
+ pack_and_join[REGIONAL_INDICATORS] +
+ ")"
+
+ emoji_valid_core_sequence = \
+ join[
+ # emoji_character,
+ emoji_keycap_sequence,
+ emoji_modifier_sequence,
+ non_component_emoji_presentation_sequence,
+ emoji_valid_flag_sequence,
+ ]
+
+ emoji_well_formed_core_sequence = \
+ join[
+ # emoji_character,
+ emoji_keycap_sequence,
+ emoji_modifier_sequence,
+ non_component_emoji_presentation_sequence,
+ emoji_well_formed_flag_sequence,
+ ]
+
+ emoji_rgi_tag_sequence = \
+ pack_and_join[RECOMMENDED_SUBDIVISION_FLAGS]
+
emoji_valid_tag_sequence = \
"(?:" +
pack[EMOJI_TAG_BASE_FLAG] +
"(?:" + VALID_SUBDIVISIONS.map{ |sd| Regexp.escape(sd.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}"))}.join("|") + ")" +
pack[CANCEL_TAG] +
")"
- emoji_zwj_element = \
+ emoji_well_formed_tag_sequence = \
+ "(?:" +
+ join[
+ non_component_emoji_presentation_sequence,
+ emoji_modifier_sequence,
+ ] +
+ pack_and_join[TAGS] + "+" +
+ pack[CANCEL_TAG] +
+ ")"
+
+ emoji_rgi_zwj_sequence = \
+ pack_and_join[RECOMMENDED_ZWJ_SEQUENCES]
+
+ emoji_valid_zwj_element = \
join[
emoji_modifier_sequence,
emoji_presentation_sequence,
emoji_character,
]
- # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences
- REGEX = Regexp.compile(
- pack_and_join[RECOMMENDED_ZWJ_SEQUENCES] +
- ?| + pack_and_join[RECOMMENDED_SUBDIVISION_FLAGS] +
- ?| + emoji_modifier_sequence +
- ?| + "(?!" + emoji_component + ")" + emoji_presentation_sequence +
- ?| + emoji_keycap_sequence +
- ?| + emoji_valid_region_sequence +
- ""
- )
+ emoji_valid_zwj_sequence = \
+ "(?:" +
+ "(?:" + emoji_valid_zwj_element + pack[ZWJ] + ")+" + emoji_valid_zwj_element +
+ ")"
+ emoji_rgi_sequence = \
+ join[
+ emoji_rgi_zwj_sequence,
+ emoji_rgi_tag_sequence,
+ emoji_valid_core_sequence,
+ ]
+
+ emoji_valid_sequence = \
+ join[
+ emoji_valid_zwj_sequence,
+ emoji_valid_tag_sequence,
+ emoji_valid_core_sequence,
+ ]
+
+ emoji_well_formed_sequence = \
+ join[
+ emoji_valid_zwj_sequence,
+ emoji_well_formed_tag_sequence,
+ emoji_well_formed_core_sequence,
+ ]
+
+ # Matches basic singleton emoji and all kinds of sequences, but restricts ZWJ and tag sequences to known (RGI) sequences
+ REGEX = Regexp.compile(emoji_rgi_sequence)
+
# Matches basic singleton emoji and all kinds of valid sequences
- REGEX_VALID = Regexp.compile(
- # EMOJI_TAGS.map{ |base, spec| "(?:" + pack[base] + "[" + pack[spec] + "]+" + pack[CANCEL_TAG] + ")" }.join("|") +
- emoji_valid_tag_sequence +
- ?| + "(?:" + "(?:" + emoji_zwj_element + pack[ZWJ] + "){1,3}" + emoji_zwj_element + ")" +
- ?| + emoji_modifier_sequence +
- ?| + "(?!" + emoji_component + ")" + emoji_presentation_sequence +
- ?| + emoji_keycap_sequence +
- ?| + emoji_valid_region_sequence +
- ""
- )
+ REGEX_VALID = Regexp.compile(emoji_valid_sequence)
+ # Matches basic singleton emoji and all kinds of well-formed sequences
+ REGEX_WELL_FORMED = Regexp.compile(emoji_well_formed_sequence)
+
# Matches only basic single, non-textual emoji
# Ignores "components" like modifiers or simple digits
REGEX_BASIC = Regexp.compile(
"(?!" + emoji_component + ")" + emoji_presentation_sequence
)
@@ -123,15 +184,20 @@
# Ignores "components" like modifiers or simple digits
REGEX_TEXT = Regexp.compile(
"(?!" + emoji_component + ")" + text_presentation_sequence
)
- # Matches any emoji-related codepoint
+ # Matches any emoji-related codepoint - use with caution (returns partial matches)
REGEX_ANY = Regexp.compile(
emoji_character
)
+ # Combined regexes which also match textual emoji
+ REGEX_INCLUDE_TEXT = Regexp.union(REGEX, REGEX_TEXT)
+ REGEX_VALID_INCLUDE_TEXT = Regexp.union(REGEX_VALID, REGEX_TEXT)
+ REGEX_WELL_FORMED_INCLUDE_TEXT = Regexp.union(REGEX_WELL_FORMED, REGEX_TEXT)
+
def self.properties(char)
ord = get_codepoint_value(char)
props = INDEX[:PROPERTIES][ord]
if props
@@ -141,9 +207,12 @@
end
end
# Looks up entries in LIST.
#
# With no arguments (both +key+ and +sub_key+ nil/false) the whole LIST
# structure is returned. Otherwise the non-nil arguments are passed, in
# order, to LIST.dig, so +key+ selects the first level and +sub_key+ the
# level below it.
#
# If +key+ is one of LIST_REMOVED_KEYS, a warning is written to $stderr;
# the lookup is still performed afterwards and its result returned.
def self.list(key = nil, sub_key = nil)
return LIST unless key || sub_key
+ if LIST_REMOVED_KEYS.include?(key)
+ $stderr.puts "Warning(unicode-emoji): The category of #{key} does not exist anymore"
+ end
LIST.dig(*[key, sub_key].compact)
end
def self.get_codepoint_value(char)
ord = nil