module PragmaticTokenizer
  # Splits a whitespace-delimited text into tokens and then refines those
  # tokens in several passes: leading commas / trailing single quotes,
  # placeholder symbols, full stops, ending punctuation, slashes, question
  # marks, plus signs, leading colons, emoji and remaining periods.
  #
  # The full-stop and ending-punctuation passes are delegated to
  # FullStopSeparator and EndingPunctuationSeparator, which are defined
  # elsewhere in the project.
  class PostProcessor
    # Symbols that convert_sym_to_punct looks up among the values of
    # Languages::Common::PUNCTUATION_MAP. The character class is kept exactly
    # as originally written (including the separating spaces) because the
    # constant is publicly visible.
    REGEX_SYMBOL = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/.freeze

    # One or more commas (or low-9 quotation marks) at the start of a line.
    REGEXP_COMMAS = /^(,|‚)+/.freeze

    # At least one character followed by a single-quote-like character at the
    # end of a line.
    REGEXP_SINGLE_QUOTES = /(.+)(’|'|‘|`)$/.freeze

    # A slash in a token that does not start with "http:", "https:" or "www.".
    REGEXP_SLASH = /^(?!(https?:|www\.))(.*)\/(.*)/.freeze

    # A question mark in a token that does not start with "http:", "https:"
    # or "www.".
    REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)(.*)/.freeze

    # A plus sign with at least one character on each side.
    REGEXP_PLUS_SIGN = /(.+)\+(.+)/.freeze

    # A leading colon followed by two or more non-whitespace characters.
    REGEXP_COLON = /^(\:)(\S{2,})/.freeze

    # U+2744 followed by a variation selector (U+FE0E or U+FE0F).
    # The class previously read [\u{FE0E}|\u{FE0F}]; inside a character class
    # "|" is a literal pipe, so "\u{2744}|" was wrongly matched as well.
    REGEXP_EMOJI = /(\u{2744}[\u{FE0E}\u{FE0F}])/.freeze

    # Applied to every token by the unified1 pass.
    REGEX_UNIFIED1 = Regexp.union(
      REGEXP_SLASH,
      REGEXP_QUESTION_MARK,
      REGEXP_PLUS_SIGN,
      REGEXP_COLON,
      REGEXP_EMOJI,
      PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
      PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX
    ).freeze

    # Applied to every whitespace-delimited token before any other pass.
    REGEX_UNIFIED2 = Regexp.union(
      REGEXP_SINGLE_QUOTES,
      REGEXP_COMMAS
    ).freeze

    attr_reader :text, :abbreviations, :downcase

    # @param text          [String] the text to split into tokens
    # @param abbreviations [#include?] collection queried with the part of a
    #   token that precedes its first period
    # @param downcase      truthy when the abbreviation candidate can be used
    #   as-is; otherwise it is passed through Unicode.downcase before lookup
    def initialize(text:, abbreviations:, downcase:)
      @text          = text
      @abbreviations = abbreviations
      @downcase      = downcase
    end

    # Runs every pass and returns the result of a final
    # EndingPunctuationSeparator pass over the refined tokens.
    def post_process
      EndingPunctuationSeparator.new(tokens: post_process_punctuation).separate
    end

    private

    # Runs the ending-punctuation separator over the full-stop-separated
    # tokens, then applies each splitting proc in order, flattening after
    # every step.
    def post_process_punctuation
      separated = EndingPunctuationSeparator.new(tokens: full_stop_separated_tokens).separate
      procs     = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
      procs.reduce(separated) { |tokens, splitter| tokens.flat_map(&splitter) }
    end

    # Proc that splits a token on REGEX_UNIFIED1.
    def unified1
      proc { |token| token.split(REGEX_UNIFIED1) }
    end

    # Tokens after the comma/quote/symbol pass, run through FullStopSeparator.
    def full_stop_separated_tokens
      FullStopSeparator.new(
        tokens:        split_and_convert_commas_and_quotes,
        abbreviations: abbreviations,
        downcase:      downcase
      ).separate
    end

    # Splits the text on whitespace, splits each piece on REGEX_UNIFIED2 and
    # converts symbols in every resulting token.
    def split_and_convert_commas_and_quotes
      text
        .split
        .flat_map { |token| token.split(REGEX_UNIFIED2) }
        .flat_map { |token| convert_sym_to_punct(token) }
    end

    # Proc that isolates U+2744 when it is followed by a character other than
    # a variation selector (U+FE0E / U+FE0F); other tokens pass through.
    # The negated class previously contained a literal "|" (see REGEXP_EMOJI),
    # which prevented "\u{2744}|..." from being split.
    def split_emoji
      proc do |token|
        if token =~ /(\A|\S)\u{2744}[^\u{FE0E}\u{FE0F}]/
          token.split(/(\u{2744})/)
        else
          token
        end
      end
    end

    # Proc that, for tokens accepted by unknown_period1?, splits around the
    # greedy capture "everything up to and including the last period".
    def split_unknown_period1
      proc { |token| unknown_period1?(token) ? token.split(/(.*\.)/) : token }
    end

    # Proc that, for tokens accepted by unknown_period2?, splits around every
    # period, keeping the periods as separate elements.
    def split_unknown_period2
      proc { |token| unknown_period2?(token) ? token.split(/(\.)/) : token }
    end

    # True when the token contains a period, is longer than one character,
    # does not match the URL / domain / e-mail patterns below, and the part
    # before its first period IS in the abbreviation collection.
    def unknown_period1?(token)
      token.include?(".") &&
        token !~ /(http|https|www)(\.|:)/ &&
        token.length > 1 &&
        token !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
        # The second alternative is FULLWIDTH COMMERCIAL AT (U+FF20). The two
        # alternatives were previously identical, which made one redundant.
        token !~ /\S+(@|\u{FF20})\S+/ &&
        abbreviations.include?(extract_abbreviation(token))
    end

    # True when the token contains exactly one period, is longer than two
    # characters, contains no digit, does not match the URL / TLD / e-mail /
    # single-letter-prefix patterns below, and the part before its period is
    # NOT in the abbreviation collection.
    def unknown_period2?(token)
      token.include?(".") &&
        token !~ /(http|https|www)(\.|:)/ &&
        token !~ /\.(com|net|org|edu|gov|mil|int)/ &&
        token !~ /\.[a-zA-Z]{2}(\s|\z)/ &&
        token.length > 2 &&
        token !~ /\A[a-zA-Z]{1}\./ &&
        token.count(".") == 1 &&
        token !~ /\d+/ &&
        !abbreviations.include?(extract_abbreviation(token)) &&
        # See unknown_period1? for the U+FF20 alternative.
        token !~ /\S+(@|\u{FF20})\S+/
    end

    # Returns the part of the token before its first period. When the
    # downcase option is falsy the candidate is lower-cased with
    # Unicode.downcase before being returned.
    def extract_abbreviation(token)
      candidate = token.split(/(\.)/)[0]
      downcase ? candidate : Unicode.downcase(candidate)
    end

    # Replaces every REGEX_SYMBOL character in the token with the
    # PUNCTUATION_MAP key whose value is that character.
    #
    # Previously only the first kind of symbol found was replaced, so a token
    # holding two different symbols kept the second one. A character with no
    # entry in the map (Hash#key returns nil) is now left untouched instead of
    # raising TypeError, and the block form avoids backslash interpretation
    # of the replacement. The receiver is no longer mutated.
    def convert_sym_to_punct(token)
      punctuation_map = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
      token.gsub(REGEX_SYMBOL) { |symbol| punctuation_map.key(symbol) || symbol }
    end
  end
end