require_relative '../lib/unicode/emoji/constants' require_relative '../lib/unicode/emoji/index' require_relative '../lib/unicode/emoji/lazy_constants' include Unicode::Emoji def write_regexes(regexes, dirpath) regexes.each do |const_name, regex| write_regex(const_name, regex, dirpath) end end def write_regex(const_name, regex, dirpath) filename = const_name.to_s.downcase filepath = File.join(dirpath, "#{filename}.rb") File.write(filepath, <<~CONTENT) # This file was generated by a script, please do not edit it by hand. # See `$ rake generate_constants` and data/generate_constants.rb for more info. module Unicode module Emoji #{const_name} = #{regex.inspect} end end CONTENT puts "#{const_name} written to #{filepath}" end # Converts [1, 2, 3, 5, 6, 20, 21, 22, 23, 100] (it does not need to be sorted) to [[1, 2, 3], [5, 6], [20, 21, 22, 23], [100]] def groupify(arr) arr = arr.sort prev = nil arr.slice_before do |el| (prev.nil? || el != prev + 1).tap { prev = el } end end # Converts [1, 2, 3, 5, 6, 20, 21, 22, 23, 100] (it does not need to be sorted) to [1..3, 5, 6, 20..23, 100] def rangify(arr) groupify(arr).map do |group| group.size < 3 ? group : Range.new(group.first, group.last) end.flatten end def pack(ord) Regexp.escape(Array(ord).pack("U*")) end def join(*strings) "(?:" + strings.join("|") + ")" end def character_class(ords_with_ranges) "[" + ords_with_ranges.map{ |ord_or_range| ord_or_range.is_a?(Range) ? pack(ord_or_range.first) + "-" + pack(ord_or_range.last) : pack(ord_or_range) }.join + "]" end def pack_and_join(ords) if ords.any? { |e| e.is_a?(Array) } join(*ords.map { |ord| pack(ord) }) else character_class(rangify(ords)) end end def compile(emoji_character:, emoji_modifier:, emoji_modifier_base:, emoji_component:, emoji_presentation:, picto:, picto_no_emoji:) emoji_presentation_sequence = \ join( pack_and_join(TEXT_PRESENTATION) + pack(EMOJI_VARIATION_SELECTOR), emoji_presentation + "(?!" + pack(TEXT_VARIATION_SELECTOR) + ")" + pack(EMOJI_VARIATION_SELECTOR) + "?", ) non_component_emoji_presentation_sequence = \ "(?!" + emoji_component + ")" + emoji_presentation_sequence text_keycap_sequence = \ join(EMOJI_KEYCAPS.map{|keycap| pack([keycap, EMOJI_KEYCAP_SUFFIX]) }) text_presentation_sequence = \ join( pack_and_join(TEXT_PRESENTATION)+ "(?!" + join(emoji_modifier, pack(EMOJI_VARIATION_SELECTOR)) + ")" + pack(TEXT_VARIATION_SELECTOR) + "?", emoji_presentation + pack(TEXT_VARIATION_SELECTOR), ) emoji_modifier_sequence = \ emoji_modifier_base + emoji_modifier emoji_keycap_sequence = \ pack_and_join(EMOJI_KEYCAPS) + pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]) emoji_valid_flag_sequence = \ pack_and_join(VALID_REGION_FLAGS) emoji_well_formed_flag_sequence = \ '\p{RI}{2}' emoji_valid_core_sequence = \ join( # emoji_character, emoji_keycap_sequence, emoji_modifier_sequence, non_component_emoji_presentation_sequence, emoji_valid_flag_sequence, ) emoji_well_formed_core_sequence = \ join( # emoji_character, emoji_keycap_sequence, emoji_modifier_sequence, non_component_emoji_presentation_sequence, emoji_well_formed_flag_sequence, ) # Sort to make sure complex sequences match first emoji_rgi_tag_sequence = \ pack_and_join(RECOMMENDED_SUBDIVISION_FLAGS.sort_by(&:length).reverse) emoji_valid_tag_sequence = \ "(?:" + pack(EMOJI_TAG_BASE_FLAG) + "(?:" + VALID_SUBDIVISIONS.sort_by(&:length).reverse.map{ |sd| sd.tr("\u{30}-\u{39}\u{61}-\u{7A}", "\u{E0030}-\u{E0039}\u{E0061}-\u{E007A}") }.join("|") + ")" + pack(CANCEL_TAG) + ")" emoji_well_formed_tag_sequence = \ "(?:" + join( non_component_emoji_presentation_sequence, emoji_modifier_sequence, ) + pack_and_join(SPEC_TAGS) + "{1,30}" + pack(CANCEL_TAG) + ")" # Sort to make sure complex sequences match first emoji_rgi_zwj_sequence = \ pack_and_join(RECOMMENDED_ZWJ_SEQUENCES.sort_by(&:length).reverse) emoji_valid_zwj_element = \ join( emoji_modifier_sequence, emoji_presentation_sequence, emoji_character, ) emoji_valid_zwj_sequence = \ "(?:" + "(?:" + emoji_valid_zwj_element + pack(ZWJ) + ")+" + emoji_valid_zwj_element + ")" emoji_rgi_sequence = \ join( emoji_rgi_zwj_sequence, emoji_rgi_tag_sequence, emoji_valid_core_sequence, ) emoji_valid_sequence = \ join( emoji_valid_zwj_sequence, emoji_valid_tag_sequence, emoji_valid_core_sequence, ) emoji_well_formed_sequence = \ join( emoji_valid_zwj_sequence, emoji_well_formed_tag_sequence, emoji_well_formed_core_sequence, ) emoji_possible_modification = \ join( emoji_modifier, pack([EMOJI_VARIATION_SELECTOR, EMOJI_KEYCAP_SUFFIX]) + "?", "[󠀠-󠁾]+󠁿" # raw tags ) emoji_possible_zwj_element = \ join( emoji_well_formed_flag_sequence, emoji_character + emoji_possible_modification + "?" ) emoji_possible = \ emoji_possible_zwj_element + "(?:" + pack(ZWJ) + emoji_possible_zwj_element + ")*" regexes = {} # Matches basic singleton emoji and all kind of sequences, but restrict zwj and tag sequences to known sequences (rgi) regexes[:REGEX] = Regexp.compile(emoji_rgi_sequence) # Matches basic singleton emoji and all kind of valid sequences regexes[:REGEX_VALID] = Regexp.compile(emoji_valid_sequence) # Matches basic singleton emoji and all kind of sequences regexes[:REGEX_WELL_FORMED] = Regexp.compile(emoji_well_formed_sequence) # Quick test which might lead to false positves # See https://www.unicode.org/reports/tr51/#EBNF_and_Regex regexes[:REGEX_POSSIBLE] = Regexp.compile(emoji_possible) # Matches only basic single, non-textual emoji # Ignores "components" like modifiers or simple digits regexes[:REGEX_BASIC] = Regexp.compile( "(?!" + emoji_component + ")" + emoji_presentation_sequence ) # Matches only basic single, textual emoji # Ignores "components" like modifiers or simple digits regexes[:REGEX_TEXT] = Regexp.compile( join( "(?!" + emoji_component + ")" + text_presentation_sequence, text_keycap_sequence, ) ) # Matches any emoji-related codepoint - Use with caution (returns partial matches) regexes[:REGEX_ANY] = Regexp.compile(emoji_character) # Combined REGEXes which also match for TEXTUAL emoji regexes[:REGEX_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX], regexes[:REGEX_TEXT]) regexes[:REGEX_VALID_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_VALID], regexes[:REGEX_TEXT]) regexes[:REGEX_WELL_FORMED_INCLUDE_TEXT] = Regexp.union(regexes[:REGEX_WELL_FORMED], regexes[:REGEX_TEXT]) regexes[:REGEX_PICTO] = Regexp.compile(picto) regexes[:REGEX_PICTO_NO_EMOJI] = Regexp.compile(picto_no_emoji) regexes end regexes = compile( emoji_character: pack_and_join(EMOJI_CHAR), emoji_modifier: pack_and_join(EMOJI_MODIFIERS), emoji_modifier_base: pack_and_join(EMOJI_MODIFIER_BASES), emoji_component: pack_and_join(EMOJI_COMPONENT), emoji_presentation: pack_and_join(EMOJI_PRESENTATION), picto: pack_and_join(EXTENDED_PICTOGRAPHIC), picto_no_emoji: pack_and_join(EXTENDED_PICTOGRAPHIC_NO_EMOJI) ) write_regexes(regexes, File.expand_path("../lib/unicode/emoji/generated", __dir__)) native_regexes = compile( emoji_character: "\\p{Emoji}", emoji_modifier: "\\p{EMod}", emoji_modifier_base: "\\p{EBase}", emoji_component: "\\p{EComp}", emoji_presentation: "\\p{EPres}", picto: "\\p{ExtPict}", picto_no_emoji: "\\p{ExtPict}(?