data/generate_constants.rb in unicode-emoji-3.7.0 vs data/generate_constants.rb in unicode-emoji-3.8.0
- old
+ new
@@ -66,29 +66,35 @@
else
character_class(rangify(ords))
end
end
-def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:)
+def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:)
emoji_presentation_sequence = \
join(
- pack_and_join(TEXT_PRESENTATION) + pack(EMOJI_VARIATION_SELECTOR),
+ text_presentation + pack(EMOJI_VARIATION_SELECTOR),
emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?",
)
non_component_emoji_presentation_sequence = \
"(?!" + emoji_component + ")" + emoji_presentation_sequence
text_keycap_sequence = \
- join(EMOJI_KEYCAPS.map{|keycap| pack([keycap, EMOJI_KEYCAP_SUFFIX]) })
+ pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX)
text_presentation_sequence = \
join(
- pack_and_join(TEXT_PRESENTATION)+ "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
+ text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?",
emoji_presentation + pack(TEXT_VARIATION_SELECTOR),
)
+ text_emoji = \
+ join(
+ "(?!" + emoji_component + ")" + text_presentation_sequence,
+ text_keycap_sequence,
+ )
+
emoji_modifier_sequence = \
emoji_modifier_base + emoji_modifier
emoji_keycap_sequence = \
pack_and_join(EMOJI_KEYCAPS) + pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX])
@@ -97,28 +103,17 @@
pack_and_join(VALID_REGION_FLAGS)
emoji_well_formed_flag_sequence = \
'\p{RI}{2}'
- emoji_valid_core_sequence = \
+ emoji_core_sequence = \
join(
- # emoji_character,
emoji_keycap_sequence,
emoji_modifier_sequence,
non_component_emoji_presentation_sequence,
- emoji_valid_flag_sequence,
)
- emoji_well_formed_core_sequence = \
- join(
- # emoji_character,
- emoji_keycap_sequence,
- emoji_modifier_sequence,
- non_component_emoji_presentation_sequence,
- emoji_well_formed_flag_sequence,
- )
-
# Sort to make sure complex sequences match first
emoji_rgi_tag_sequence = \
pack_and_join(RECOMMENDED_SUBDIVISION_FLAGS.sort_by(&:length).reverse)
emoji_valid_tag_sequence = \
@@ -142,10 +137,22 @@
# Sort to make sure complex sequences match first
emoji_rgi_zwj_sequence = \
pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse)
+ # FQE+MQE: Make VS16 optional after ZWJ has appeared
+ emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
+ /#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/,
+ pack(EMOJI_VARIATION_SELECTOR) + "?"
+ )
+
+ # FQE+MQE+UQE: Make all VS16 optional
+ emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub(
+ pack(EMOJI_VARIATION_SELECTOR),
+ pack(EMOJI_VARIATION_SELECTOR) + "?",
+ )
+
emoji_valid_zwj_element = \
join(
emoji_modifier_sequence,
emoji_presentation_sequence,
emoji_character,
@@ -158,27 +165,74 @@
emoji_rgi_sequence = \
join(
emoji_rgi_zwj_sequence,
emoji_rgi_tag_sequence,
- emoji_valid_core_sequence,
+ emoji_valid_flag_sequence,
+ emoji_core_sequence,
)
+ emoji_rgi_sequence_include_text = \
+ join(
+ emoji_rgi_zwj_sequence,
+ emoji_rgi_tag_sequence,
+ emoji_valid_flag_sequence,
+ emoji_core_sequence,
+ text_emoji,
+ )
+
+ emoji_rgi_include_mqe_sequence = \
+ join(
+ emoji_rgi_include_mqe_zwj_sequence,
+ emoji_rgi_tag_sequence,
+ emoji_valid_flag_sequence,
+ emoji_core_sequence,
+ )
+
+ emoji_rgi_include_mqe_uqe_sequence = \
+ join(
+ emoji_rgi_include_mqe_uqe_zwj_sequence,
+ text_emoji, # also uqe
+ emoji_rgi_tag_sequence,
+ emoji_valid_flag_sequence,
+ emoji_core_sequence,
+ )
+
emoji_valid_sequence = \
join(
emoji_valid_zwj_sequence,
emoji_valid_tag_sequence,
- emoji_valid_core_sequence,
+ emoji_valid_flag_sequence,
+ emoji_core_sequence,
)
+ emoji_valid_sequence_include_text = \
+ join(
+ emoji_valid_zwj_sequence,
+ emoji_valid_tag_sequence,
+ emoji_valid_flag_sequence,
+ emoji_core_sequence,
+ text_emoji,
+ )
+
emoji_well_formed_sequence = \
join(
emoji_valid_zwj_sequence,
emoji_well_formed_tag_sequence,
- emoji_well_formed_core_sequence,
+ emoji_well_formed_flag_sequence,
+ emoji_core_sequence,
)
+ emoji_well_formed_sequence_include_text = \
+ join(
+ emoji_valid_zwj_sequence,
+ emoji_well_formed_tag_sequence,
+ emoji_well_formed_flag_sequence,
+ emoji_core_sequence,
+ text_emoji,
+ )
+
emoji_possible_modification = \
join(
emoji_modifier,
pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]) + "?",
"[-]+" # raw tags
@@ -196,45 +250,46 @@
regexes = {}
# Matches basic singleton emoji and all kinds of sequences, but restricts zwj and tag sequences to known sequences (rgi)
regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence)
+ # rgi + singleton text
+ regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text)
+
+ # Matches basic singleton emoji and all kinds of sequences, but restricts zwj and tag sequences to known sequences (rgi)
+ # Also makes VS16 optional if not at first emoji character
+ regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence)
+
+ # Matches basic singleton emoji and all kinds of sequences, but restricts zwj and tag sequences to known sequences (rgi)
+ # Also makes VS16 optional even at first emoji character
+ regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence)
+
# Matches basic singleton emoji and all kinds of valid sequences
regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence)
+ # valid + singleton text
+ regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text)
+
# Matches basic singleton emoji and all kinds of sequences
regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence)
+
+ # well-formed + singleton text
+ regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text)
# Quick test which might lead to false positives
# See https://www.unicode.org/reports/tr51/#EBNF_and_Regex
regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible)
- # Matches only basic single, non-textual emoji
- # Ignores "components" like modifiers or simple digits
- regexes[:REGEX_BASIC] = Regexp.compile(
- "(?!" + emoji_component + ")" + emoji_presentation_sequence
- )
+ # Matches only basic single, non-textual emoji, ignores "components" like modifiers or simple digits
+ regexes[:REGEX_BASIC] = Regexp.compile(non_component_emoji_presentation_sequence)
- # Matches only basic single, textual emoji
- # Ignores "components" like modifiers or simple digits
- regexes[:REGEX_TEXT] = Regexp.compile(
- join(
- "(?!" + emoji_component + ")" + text_presentation_sequence,
- text_keycap_sequence,
- )
- )
+ # Matches only basic single, textual emoji, ignores "components" like modifiers or simple digits
+ regexes[:REGEX_TEXT] = Regexp.compile(text_emoji)
- # Matches any emoji-related codepoint - Use with caution (returns partial matches)
+ # Same as \p{Emoji} - to be removed or renamed
regexes[:REGEX_ANY] = Regexp.compile(emoji_character)
- # Combined REGEXes which also match for TEXTUAL emoji
- regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT])
-
- regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT])
-
- regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT])
-
regexes[:REGEX_PICTO] = Regexp.compile(picto)
regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji)
regexes
@@ -244,10 +299,11 @@
emoji_character: pack_and_join(EMOJI_CHAR),
emoji_modifier: pack_and_join(EMOJI_MODIFIERS),
emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES),
emoji_component: pack_and_join(EMOJI_COMPONENT),
emoji_presentation: pack_and_join(EMOJI_PRESENTATION),
+ text_presentation: pack_and_join(TEXT_PRESENTATION),
picto: pack_and_join(EXTENDED_PICTOGRAPHIC),
picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI)
)
write_regexes(regexes, File.expand_path("../lib/unicode/emoji/generated", __dir__))
@@ -255,9 +311,10 @@
emoji_character: "\\p{Emoji}",
emoji_modifier: "\\p{EMod}",
emoji_modifier_base: "\\p{EBase}",
emoji_component: "\\p{EComp}",
emoji_presentation: "\\p{EPres}",
+ text_presentation: "\\p{Emoji}(?<!\\p{EPres})",
picto: "\\p{ExtPict}",
picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})"
)
write_regexes(native_regexes, File.expand_path("../lib/unicode/emoji/generated_native", __dir__))