data/generate_constants.rb in unicode-emoji-3.6.0 vs data/generate_constants.rb in unicode-emoji-3.7.0

- old
+ new

@@ -95,14 +95,11 @@ emoji_valid_flag_sequence = \ pack_and_join(VALID_REGION_FLAGS) emoji_well_formed_flag_sequence = \ - "(?:" + - pack_and_join(REGIONAL_INDICATORS) + - pack_and_join(REGIONAL_INDICATORS) + - ")" + '\p{RI}{2}' emoji_valid_core_sequence = \ join( # emoji_character, emoji_keycap_sequence, @@ -126,22 +123,22 @@ emoji_valid_tag_sequence = \ "(?:" + pack(EMOJI_TAG_BASE_FLAG) + "(?:" + VALID_SUBDIVISIONS.sort_by(&:length).reverse.map{ |sd| - Regexp.escape(sd.tr("\u{20}-\u{7E}", "\u{E0020}-\u{E007E}")) + sd.tr("\u{30}-\u{39}\u{61}-\u{7A}", "\u{E0030}-\u{E0039}\u{E0061}-\u{E007A}") }.join("|") + ")" + pack(CANCEL_TAG) + ")" emoji_well_formed_tag_sequence = \ "(?:" + join( non_component_emoji_presentation_sequence, emoji_modifier_sequence, ) + - pack_and_join(TAGS) + "+" + + pack_and_join(SPEC_TAGS) + "{1,30}" + pack(CANCEL_TAG) + ")" # Sort to make sure complex sequences match first emoji_rgi_zwj_sequence = \ @@ -178,19 +175,39 @@ emoji_valid_zwj_sequence, emoji_well_formed_tag_sequence, emoji_well_formed_core_sequence, ) + emoji_possible_modification = \ + join( + emoji_modifier, + pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]) + "?", + "[󠀠-󠁾]+󠁿" # raw tags + ) + + emoji_possible_zwj_element = \ + join( + emoji_well_formed_flag_sequence, + emoji_character + emoji_possible_modification + "?" 
+ ) + + emoji_possible = \ + emoji_possible_zwj_element + "(?:" + pack(ZWJ) + emoji_possible_zwj_element + ")*" + regexes = {} # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi) regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence) # Matches basic singleton emoji and all kind of valid sequences regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence) # Matches basic singleton emoji and all kind of sequences regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence) + + # Quick test which might lead to false positives + # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex + regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible) # Matches only basic single, non-textual emoji # Ignores "components" like modifiers or simple digits regexes[:REGEX_BASIC] = Regexp.compile( "(?!" + emoji_component + ")" + emoji_presentation_sequence