data/generate_constants.rb in unicode-emoji-3.7.0 vs data/generate_constants.rb in unicode-emoji-3.8.0

- old
+ new

@@ -66,29 +66,35 @@ else character_class(rangify(ords)) end end -def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:) +def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, text_presentation:, picto:, picto_no_emoji:) emoji_presentation_sequence = \ join( - pack_and_join(TEXT_PRESENTATION) + pack(EMOJI_VARIATION_SELECTOR), + text_presentation + pack(EMOJI_VARIATION_SELECTOR), emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?", ) non_component_emoji_presentation_sequence = \ "(?!" + emoji_component + ")" + emoji_presentation_sequence text_keycap_sequence = \ - join(EMOJI_KEYCAPS.map{|keycap| pack([keycap, EMOJI_KEYCAP_SUFFIX]) }) + pack_and_join(EMOJI_KEYCAPS) + pack(EMOJI_KEYCAP_SUFFIX) text_presentation_sequence = \ join( - pack_and_join(TEXT_PRESENTATION)+ "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?", + text_presentation + "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?", emoji_presentation + pack(TEXT_VARIATION_SELECTOR), ) + text_emoji = \ + join( + "(?!" 
+ emoji_component + ")" + text_presentation_sequence, + text_keycap_sequence, + ) + emoji_modifier_sequence = \ emoji_modifier_base + emoji_modifier emoji_keycap_sequence = \ pack_and_join(EMOJI_KEYCAPS) + pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]) @@ -97,28 +103,17 @@ pack_and_join(VALID_REGION_FLAGS) emoji_well_formed_flag_sequence = \ '\p{RI}{2}' - emoji_valid_core_sequence = \ + emoji_core_sequence = \ join( - # emoji_character, emoji_keycap_sequence, emoji_modifier_sequence, non_component_emoji_presentation_sequence, - emoji_valid_flag_sequence, ) - emoji_well_formed_core_sequence = \ - join( - # emoji_character, - emoji_keycap_sequence, - emoji_modifier_sequence, - non_component_emoji_presentation_sequence, - emoji_well_formed_flag_sequence, - ) - # Sort to make sure complex sequences match first emoji_rgi_tag_sequence = \ pack_and_join(RECOMMENDED_SUBDIVISION_FLAGS.sort_by(&:length).reverse) emoji_valid_tag_sequence = \ @@ -142,10 +137,22 @@ # Sort to make sure complex sequences match first emoji_rgi_zwj_sequence = \ pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse) + # FQE+MQE: Make VS16 optional after ZWJ has appeared + emoji_rgi_include_mqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub( + /#{ pack(ZWJ) }[^|]+?\K#{ pack(EMOJI_VARIATION_SELECTOR) }/, + pack(EMOJI_VARIATION_SELECTOR) + "?" 
+ ) + + # FQE+MQE+UQE: Make all VS16 optional + emoji_rgi_include_mqe_uqe_zwj_sequence = emoji_rgi_zwj_sequence.gsub( + pack(EMOJI_VARIATION_SELECTOR), + pack(EMOJI_VARIATION_SELECTOR) + "?", + ) + emoji_valid_zwj_element = \ join( emoji_modifier_sequence, emoji_presentation_sequence, emoji_character, @@ -158,27 +165,74 @@ emoji_rgi_sequence = \ join( emoji_rgi_zwj_sequence, emoji_rgi_tag_sequence, - emoji_valid_core_sequence, + emoji_valid_flag_sequence, + emoji_core_sequence, ) + emoji_rgi_sequence_include_text = \ + join( + emoji_rgi_zwj_sequence, + emoji_rgi_tag_sequence, + emoji_valid_flag_sequence, + emoji_core_sequence, + text_emoji, + ) + + emoji_rgi_include_mqe_sequence = \ + join( + emoji_rgi_include_mqe_zwj_sequence, + emoji_rgi_tag_sequence, + emoji_valid_flag_sequence, + emoji_core_sequence, + ) + + emoji_rgi_include_mqe_uqe_sequence = \ + join( + emoji_rgi_include_mqe_uqe_zwj_sequence, + text_emoji, # also uqe + emoji_rgi_tag_sequence, + emoji_valid_flag_sequence, + emoji_core_sequence, + ) + emoji_valid_sequence = \ join( emoji_valid_zwj_sequence, emoji_valid_tag_sequence, - emoji_valid_core_sequence, + emoji_valid_flag_sequence, + emoji_core_sequence, ) + emoji_valid_sequence_include_text = \ + join( + emoji_valid_zwj_sequence, + emoji_valid_tag_sequence, + emoji_valid_flag_sequence, + emoji_core_sequence, + text_emoji, + ) + emoji_well_formed_sequence = \ join( emoji_valid_zwj_sequence, emoji_well_formed_tag_sequence, - emoji_well_formed_core_sequence, + emoji_well_formed_flag_sequence, + emoji_core_sequence, ) + emoji_well_formed_sequence_include_text = \ + join( + emoji_valid_zwj_sequence, + emoji_well_formed_tag_sequence, + emoji_well_formed_flag_sequence, + emoji_core_sequence, + text_emoji, + ) + emoji_possible_modification = \ join( emoji_modifier, pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]) + "?", "[󠀠-󠁾]+󠁿" # raw tags @@ -196,45 +250,46 @@ regexes = {} # Matches basic singleton emoji and all kind of sequences, but restrict zwj and 
tag sequences to known sequences (rgi) regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence) + # rgi + singleton text + regexes[:REGEX_INCLUDE_TEXT] = Regexp.compile(emoji_rgi_sequence_include_text) + + # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi) + # Also make VS16 optional if not at first emoji character + regexes[:REGEX_INCLUDE_MQE] = Regexp.compile(emoji_rgi_include_mqe_sequence) + + # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi) + # Also make VS16 optional even at first emoji character + regexes[:REGEX_INCLUDE_MQE_UQE] = Regexp.compile(emoji_rgi_include_mqe_uqe_sequence) + # Matches basic singleton emoji and all kind of valid sequences regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence) + # valid + singleton text + regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.compile(emoji_valid_sequence_include_text) + # Matches basic singleton emoji and all kind of sequences regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence) + + # well-formed + singleton text + regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.compile(emoji_well_formed_sequence_include_text) # Quick test which might lead to false positives # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible) - # Matches only basic single, non-textual emoji - # Ignores "components" like modifiers or simple digits - regexes[:REGEX_BASIC] = Regexp.compile( - "(?!" 
+ emoji_component + ")" + text_presentation_sequence, - text_keycap_sequence, - ) - ) + # Matches only basic single, textual emoji, ignores "components" like modifiers or simple digits + regexes[:REGEX_TEXT] = Regexp.compile(text_emoji) - # Matches any emoji-related codepoint - Use with caution (returns partial matches) + # Same as \p{Emoji} - to be removed or renamed regexes[:REGEX_ANY] = Regexp.compile(emoji_character) - # Combined REGEXes which also match for TEXTUAL emoji - regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT]) - - regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT]) - - regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT]) - regexes[:REGEX_PICTO] = Regexp.compile(picto) regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji) regexes @@ -244,10 +299,11 @@ emoji_character: pack_and_join(EMOJI_CHAR), emoji_modifier: pack_and_join(EMOJI_MODIFIERS), emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES), emoji_component: pack_and_join(EMOJI_COMPONENT), emoji_presentation: pack_and_join(EMOJI_PRESENTATION), + text_presentation: pack_and_join(TEXT_PRESENTATION), picto: pack_and_join(EXTENDED_PICTOGRAPHIC), picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI) ) write_regexes(regexes, File.expand_path("../lib/unicode/emoji/generated", __dir__)) @@ -255,9 +311,10 @@ emoji_character: "\\p{Emoji}", emoji_modifier: "\\p{EMod}", emoji_modifier_base: "\\p{EBase}", emoji_component: "\\p{EComp}", emoji_presentation: "\\p{EPres}", + text_presentation: "\\p{Emoji}(?<!\\p{EPres})", picto: "\\p{ExtPict}", picto_no_emoji: "\\p{ExtPict}(?<!\\p{Emoji})" ) write_regexes(native_regexes, File.expand_path("../lib/unicode/emoji/generated_native", __dir__))