# -*- encoding : utf-8 -*-
require 'pragmatic_tokenizer/languages'
require 'pragmatic_tokenizer/pre_processor'
require 'pragmatic_tokenizer/post_processor'
require 'pragmatic_tokenizer/full_stop_separator'
require 'pragmatic_tokenizer/ending_punctuation_separator'
require 'unicode'

module PragmaticTokenizer
  class Tokenizer

    # Values accepted by the :punctuation, :numbers and :mentions options
    # (checked in #initialize). The misspelt name PUNCTIATION_OPTIONS is kept
    # because #initialize refers to it.
    PUNCTIATION_OPTIONS       = Set.new([:all, :semi, :none, :only]).freeze
    NUMBERS_OPTIONS           = Set.new([:all, :semi, :none, :only]).freeze
    MENTIONS_OPTIONS          = Set.new([:keep_original, :keep_and_clean, :remove]).freeze
    # Tokens longer than this are dropped when the :clean option is enabled.
    MAX_TOKEN_LENGTH          = 50
    EMPTY_STRING              = ''.freeze
    DOT_STRING                = '.'.freeze
    SPACE_STRING              = ' '.freeze
    # Patterns behind the :remove_domains and :remove_urls options.
    REGEX_DOMAIN              = /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix
    REGEX_URL                 = /(http|https)(\.|:)/
    # Separators used when splitting long words and hyphenated hashtags.
    REGEX_HYPHEN              = /\-/
    REGEX_UNDERSCORE          = /\_/
    # Apostrophe look-alikes; replaced by a plain ' before a contraction lookup.
    REGEX_CONTRACTIONS        = /[‘’‚‛‹›＇´`]/
    # Possessive suffix removed by the :classic_filter option.
    REGEX_APOSTROPHE_S        = /['’`́]s$/
    # Pattern behind the :remove_emails option.
    REGEX_EMAIL               = /\S+(＠|@)\S+\.\S+/
    # Any @ or # sign, ASCII or full-width. The characters sit in a character
    # class, so no `|` separator is needed: a `|` in here is a literal pipe and
    # would make every token containing one count as a hashtag/mention.
    REGEX_HASHTAG_OR_MENTION  = /[＠@#＃]/
    # Leading/trailing underscores and runs of asterisks; the :clean option
    # splits tokens on these.
    REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
    REGEX_UNDERSCORE_AT_END   = /\_+(?=\z)/
    REGEX_ASTERISK            = /\*+/
    REGEX_UNIFIED1            = Regexp.union(REGEX_UNDERSCORE_AT_START,
                                             REGEX_UNDERSCORE_AT_END,
                                             REGEX_ASTERISK)
    # https://en.wikipedia.org/wiki/Control_character
    # Control characters; for ASCII input that is 00 through 1F plus 7F.
    # NOTE(review): POSIX bracket classes are Unicode-aware in Ruby, so this
    # may also match non-ASCII control characters - confirm against the Regexp docs.
    # Rubular: http://rubular.com/r/E83fpBoDjI
    REGEXP_CONTROL                  = /[[:cntrl:]]/
    REGEXP_ENDING_COLON             = /\:(?=\z)/
    REGEXP_EXCLAMATION_AT_START     = /(?<=\A)!+(?=.+)/
    # A trailing run that starts with `!` and continues with any mix of `!`
    # and `1` (as in "wow!!1!"). This matches exactly the same text as the
    # earlier `!+(1*!*)*` form, but a single character class has no nested
    # quantifier and therefore cannot backtrack exponentially on a long run
    # of `!` that is not at the end of the token.
    REGEXP_EXCLAMATION_AT_END       = /!+[1!]*(?=\z)/
    REGEXP_HYPHEN_AT_START          = /\A(-|–|\u{00AD})/
    REGEXP_SPECIAL_SYMBOL           = /[®©]/
    REGEXP_PERCENT_AT_START         = /\A\%/
    # https://codepoints.net/enclosed_alphanumeric_supplement
    REGEXP_ALPHANUMERIC_SUPPLEMENT  = /[\u{1F100}-\u{1F1FF}]/
    # Everything the :clean option deletes from inside a token.
    REGEX_UNIFIED2                  = Regexp.union(REGEXP_CONTROL,
                                                   REGEXP_ENDING_COLON,
                                                   REGEXP_EXCLAMATION_AT_START,
                                                   REGEXP_EXCLAMATION_AT_END,
                                                   REGEXP_HYPHEN_AT_START,
                                                   REGEXP_SPECIAL_SYMBOL,
                                                   REGEXP_PERCENT_AT_START,
                                                   REGEXP_ALPHANUMERIC_SUPPLEMENT)
    # Trailing 1s that directly follow a non-digit (a mistyped "!"); clean!
    # strips them from tokens that contain no @ or # sign.
    REGEXP_ONE_AS_EXCLAMATION  = /(?<=\D)1+(?=\z)/
    # Leading # / @ sign (ASCII or full-width) marking a hashtag / mention.
    REGEXP_HASHTAG_AT_START    = /(?<=\A)(#|＃)/
    REGEXP_AT_SIGN_AT_START    = /(?<=\A)(@|＠)/
    # A hashtag that contains a hyphen; hashtags! splits such tokens at the
    # hyphens when the :hashtags option is :keep_and_clean.
    REGEXP_HYPHEN_HASTAG       = /\A(#|＃)\S+-/
    # Snowflake (U+2744), optionally followed by a variation selector.
    # NOTE(review): the `|` inside the character class is a literal pipe, not
    # an alternation. It looks harmless because this pattern is only used to
    # test whether a token matches, but confirm before relying on match text.
    REGEXP_EMOJI_SNOWFLAKE     = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
    # Pattern behind the :remove_emoji option.
    REGEX_EMOJI_UNIFIED        = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
                                            PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
    # Token made up of punctuation characters only.
    REGEXP_PUNCTUATION_ONLY    = /\A[[:punct:]]+\z/
    # Token made up of digits only.
    REGEXP_NUMBER_ONLY         = /\A\d+\z/
    # Token that contains no digit at all.
    REGEXP_NO_NUMBERS          = /\A\D+\z/
    # Unanchored, so it matches any token containing at least one digit.
    REGEXP_NUMBER              = /\D*\d+\d*/
    # Token made up of two or more dots and nothing else.
    REGEXP_CONSECUTIVE_DOTS    = /\A\.{2,}\z/
    # Cuts the input into chunks of at most 10,000 characters for #tokenize.
    # First alternative: the longest chunk that ends right before whitespace
    # or at the end of the text, so words are not cut in half.
    # Second alternative: a hard cut, used only when the next 10,000
    # characters contain no such boundary. Without it the first alternative
    # cannot match at the start of a long whitespace-free run (e.g. text in
    # a language written without spaces) and String#scan silently skips that
    # text one character at a time.
    REGEXP_CHUNK_STRING        = /.{,10000}(?=\s|\z)|.{1,10000}/m

    # @param [Hash] opts optional arguments

    # @option opts [Array] :filter_languages - user-supplied array of languages from which that language's stop words, abbreviations and contractions should be used when calculating the resulting tokens - array elements should be of the String class or can be symbols
    # @option opts [String] :language - two character ISO 639-1 code - can be a String or symbol (i.e. :en or 'en')
    # @option opts [Boolean] :expand_contractions - (default: false)
    # @option opts [Boolean] :remove_stop_words - (default: false)
    # @option opts [Array] :abbreviations - user-supplied array of abbreviations (each element should be downcased with final period removed) - array elements should be of the String class
    # @option opts [Array] :stop_words - user-supplied array of stop words - array elements should be of the String class
    # @option opts [Hash]  :contractions - user-supplied hash of contractions (key is the contracted form; value is the expanded form - both the key and value should be downcased)
    # @option opts [String] :punctuation - see description below - can be a String or symbol (i.e. :none or 'none')
      # Punctuation 'all': Does not remove any punctuation from the result
      # Punctuation 'semi': Removes common punctuation (such as full stops)
      # and does not remove less common punctuation (such as question marks)
      # This is useful for text alignment as less common punctuation can help
      # identify a sentence (like a fingerprint) while common punctuation
      # (like stop words) should be removed.
      # Punctuation 'none': Removes all punctuation from the result
      # Punctuation 'only': Removes everything except punctuation. The
      # returned result is an array of only the punctuation.
    # @option opts [String] :numbers - see description below - can be a String or symbol (i.e. :none or 'none')
      # Numbers 'all': Does not remove any numbers from the result
      # Numbers 'semi': Removes tokens that include only digits
      # Numbers 'none': Removes all tokens that include a number from the result (including Roman numerals)
      # Numbers 'only': Removes everything except tokens that include a number
    # @option opts [Integer] :minimum_length - minimum length of the token in characters
    # @option opts [Integer] :long_word_split - the specified length to split long words at any hyphen or underscore.
    # @option opts [String] :mentions - :remove (removes the whole token), :keep_and_clean (keeps the token but strips the leading @) or :keep_original (leaves the token untouched) - can be a String or symbol (i.e. :keep_and_clean or 'keep_and_clean')
    # @option opts [String] :hashtags - :remove (removes the whole token), :keep_and_clean (keeps the token but strips the leading # and splits hyphenated hashtags at the hyphens) or :keep_original (leaves the token untouched) - can be a String or symbol (i.e. :keep_and_clean or 'keep_and_clean')
    # @option opts [Boolean] :downcase - (default: true)
    # @option opts [Boolean] :clean - (default: false)
    # @option opts [Boolean] :classic_filter - removes dots from acronyms and 's from the end of tokens - (default: false)
    # @option opts [Boolean] :remove_emoji - (default: false)
    # @option opts [Boolean] :remove_emails - (default: false)
    # @option opts [Boolean] :remove_urls - (default: false)
    # @option opts [Boolean] :remove_domains - (default: false)

    # Stores the options listed above, loads contractions, abbreviations and
    # stop words from the language modules, and validates the option values.
    #
    # @param opts [Hash] optional arguments (see the @option list above)
    # @raise [RuntimeError] when :punctuation, :numbers or :mentions holds an
    #   unsupported value, or when :minimum_length / :long_word_split is not
    #   an Integer
    def initialize(opts={})
      @filter_languages    = opts[:filter_languages] || []
      @language_module     = Languages.get_language_by_code(opts[:language])
      @expand_contractions = opts[:expand_contractions]
      @remove_stop_words   = opts[:remove_stop_words]
      @punctuation         = opts[:punctuation] ? opts[:punctuation].to_sym : :all
      @numbers             = opts[:numbers] ? opts[:numbers].to_sym : :all
      @minimum_length      = opts[:minimum_length] || 0
      @long_word_split     = opts[:long_word_split]
      @mentions            = opts[:mentions] ? opts[:mentions].to_sym : :keep_original
      @hashtags            = opts[:hashtags] ? opts[:hashtags].to_sym : :keep_original
      @downcase            = opts[:downcase].nil? ? true : opts[:downcase]
      @clean               = opts[:clean]
      @classic_filter      = opts[:classic_filter]
      @remove_emoji        = opts[:remove_emoji]
      @remove_emails       = opts[:remove_emails]
      @remove_urls         = opts[:remove_urls]
      @remove_domains      = opts[:remove_domains]
      # Copy the user's hash: the merge! calls below must not write the
      # language contractions into an object owned by the caller.
      @contractions        = (opts[:contractions] || {}).dup
      @abbreviations       = Set.new(opts[:abbreviations])
      @stop_words          = Set.new(opts[:stop_words])

      # TODO: why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
      # User-supplied lists replace the defaults of the main language; the
      # defaults are only loaded when the user supplied nothing.
      @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
      @abbreviations       += @language_module::ABBREVIATIONS if @abbreviations.empty?
      @stop_words          += @language_module::STOP_WORDS if @stop_words.empty? && @filter_languages.empty?

      # Filter languages are always added on top of whatever is present.
      @filter_languages.each do |lang|
        language = Languages.get_language_by_code(lang)
        @contractions.merge!(language::CONTRACTIONS)
        @abbreviations += language::ABBREVIATIONS
        @stop_words    += language::STOP_WORDS
      end

      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTIATION_OPTIONS.include?(@punctuation)
      raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
      raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
      # Integer instead of Fixnum: Fixnum was deprecated in Ruby 2.4 and
      # removed in Ruby 3.2, where referencing it raises NameError.
      # @minimum_length is never nil here because it defaults to 0 above.
      raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless @minimum_length.is_a?(Integer)
      raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.nil? || @long_word_split.is_a?(Integer)
    end

    # @param [String] text to be tokenized

    # Unescapes HTML entities in the text, cuts it into chunks with
    # REGEXP_CHUNK_STRING and runs every chunk through pre_process and
    # post_process; the per-chunk token lists are concatenated.
    #
    # @return [Array] the tokens; an empty array when text is nil or false
    # @raise [RuntimeError] when text is neither nil/false nor a String
    def tokenize(text)
      return [] unless text
      # is_a? rather than an exact class comparison, so that String
      # subclasses are accepted as well.
      raise "In Pragmatic Tokenizer text must be a String" unless text.is_a?(String)
      CGI.unescapeHTML(text)
          .scan(REGEXP_CHUNK_STRING)
          .flat_map { |segment| post_process(pre_process(segment)) }
    end

    private

      # Mixes PragmaticTokenizer::PreProcessor into the given string and runs
      # its pre_process step for the configured language module.
      def pre_process(text)
        preprocessable = text.extend(PragmaticTokenizer::PreProcessor)
        preprocessable.pre_process(language: @language_module)
      end

      # Turns a pre-processed segment into tokens and applies the filters in a
      # fixed order. The first three always run (each consults its own
      # option); the others run only when their option is set.
      #
      # Returns the remaining tokens with empty strings removed.
      def post_process(text)
        @tokens = run_post_processor(text)

        filters = [
          [:remove_various!,      true],
          [:process_numbers!,     true],
          [:process_punctuation!, true],
          [:expand_contractions!, @expand_contractions],
          [:clean!,               @clean],
          [:classic_filter!,      @classic_filter],
          [:remove_short_tokens!, @minimum_length > 0],
          [:remove_stop_words!,   @remove_stop_words],
          [:mentions!,            @mentions],
          [:hashtags!,            @hashtags],
          [:split_long_words!,    @long_word_split]
        ]
        filters.each { |filter, enabled| send(filter) if enabled }

        @tokens.reject { |token| token.empty? }
      end

      # Hands the case-adjusted text, the abbreviation set and the downcase
      # flag to a PostProcessor and returns whatever its post_process yields.
      def run_post_processor(text)
        processor = PostProcessor.new(text: chosen_case(text),
                                      abbreviations: @abbreviations,
                                      downcase: @downcase)
        processor.post_process
      end

      # Replaces every token that is a known contraction by its expanded
      # words; other tokens are kept as they are.
      def expand_contractions!
        expanded = @tokens.map { |token| expand_token_contraction(token) }
        @tokens = expanded.flatten(1)
      end

      # Looks the token up in @contractions after replacing apostrophe
      # look-alikes with a plain ' and normalising the case via inverse_case.
      #
      # Returns the token itself when it is not a known contraction, otherwise
      # the array of expanded words; the first word is capitalised when
      # tokens are not being downcased.
      def expand_token_contraction(token)
        lookup_key = inverse_case(token.gsub(REGEX_CONTRACTIONS, "'".freeze))
        if @contractions.key?(lookup_key)
          words = @contractions[lookup_key].split(SPACE_STRING)
          words[0] = Unicode.capitalize(words[0]) unless @downcase
          words
        else
          token
        end
      end

      # Implements the :clean option in four passes:
      # 1. tokens without an @ or # sign are split on REGEX_UNIFIED1,
      # 2. the resulting pieces lose trailing 1s that stand in for "!",
      # 3. every piece has the REGEX_UNIFIED2 characters removed,
      # 4. pieces rejected by unclean_token? are dropped.
      def clean!
        pieces = @tokens.flat_map do |token|
          token =~ REGEX_HASHTAG_OR_MENTION ? token : token.split(REGEX_UNIFIED1)
        end
        without_ones = pieces.map do |piece|
          piece =~ REGEX_HASHTAG_OR_MENTION ? piece : piece.gsub(REGEXP_ONE_AS_EXCLAMATION, EMPTY_STRING)
        end
        scrubbed = without_ones.map { |piece| piece.gsub(REGEX_UNIFIED2, EMPTY_STRING) }
        @tokens = scrubbed.reject { |piece| unclean_token?(piece) }
      end

      # Truthy when the token should be discarded by clean!: it is a listed
      # special character, is longer than MAX_TOKEN_LENGTH, contains a
      # backslash, or consists only of two or more dots.
      def unclean_token?(token)
        PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(token) ||
          token.length > MAX_TOKEN_LENGTH ||
          token.include?('\\'.freeze) ||
          token =~ REGEXP_CONSECUTIVE_DOTS
      end

      # Implements the :classic_filter option: removes the dots from tokens
      # that are known abbreviations and strips a trailing 's. Tokens are
      # modified in place.
      def classic_filter!
        @tokens.map! do |token|
          # Abbreviations are expected to be downcased (see the :abbreviations
          # option), so the lookup key goes through inverse_case just like the
          # stop-word lookup does. Without it nothing matches when
          # downcase is false and the token is "U.S.A.".
          abbreviation = inverse_case(token.chomp(DOT_STRING))
          token.delete!(DOT_STRING) if @abbreviations.include?(abbreviation)
          token.sub!(REGEX_APOSTROPHE_S, EMPTY_STRING)
          token
        end
      end

      # Applies the :numbers option to @tokens in place.
      #   :semi - drops tokens made of digits only
      #   :none - drops tokens containing a digit, and Roman numerals
      #   :only - drops tokens that contain no digit
      # Any other value (i.e. :all) leaves the tokens untouched.
      def process_numbers!
        if @numbers == :semi
          @tokens.delete_if { |token| token =~ REGEXP_NUMBER_ONLY }
        elsif @numbers == :none
          roman_numerals = PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS
          @tokens.delete_if do |token|
            token =~ REGEXP_NUMBER || roman_numerals.include?(inverse_case(token))
          end
        elsif @numbers == :only
          @tokens.delete_if { |token| token =~ REGEXP_NO_NUMBERS }
        end
      end

      # Keeps only the tokens that are at least @minimum_length characters long.
      def remove_short_tokens!
        @tokens.keep_if { |token| token.length >= @minimum_length }
      end

      # Applies the :punctuation option to @tokens in place.
      #   :semi - drops tokens listed in SEMI_PUNCTUATION
      #   :none - drops tokens listed in PUNCTUATION or made of punctuation only
      #   :only - keeps nothing but tokens listed in PUNCTUATION
      # Any other value (i.e. :all) leaves the tokens untouched.
      def process_punctuation!
        common = PragmaticTokenizer::Languages::Common
        if @punctuation == :semi
          @tokens.delete_if { |token| common::SEMI_PUNCTUATION.include?(token) }
        elsif @punctuation == :none
          @tokens.delete_if do |token|
            common::PUNCTUATION.include?(token) || token =~ REGEXP_PUNCTUATION_ONLY
          end
        elsif @punctuation == :only
          @tokens.keep_if { |token| common::PUNCTUATION.include?(token) }
        end
      end

      # Drops every token whose case-normalised form is in @stop_words.
      def remove_stop_words!
        @tokens.delete_if do |token|
          normalized = inverse_case(token)
          @stop_words.include?(normalized)
        end
      end

      # Applies the :mentions option to @tokens in place.
      #   :remove         - drops tokens that start with an @ sign
      #   :keep_and_clean - strips the leading @ sign from such tokens
      # Any other value leaves the tokens untouched.
      def mentions!
        if @mentions == :remove
          @tokens.delete_if { |token| token =~ REGEXP_AT_SIGN_AT_START }
        elsif @mentions == :keep_and_clean
          @tokens.map! do |token|
            next token unless token =~ REGEXP_AT_SIGN_AT_START
            token.gsub!(REGEXP_AT_SIGN_AT_START, EMPTY_STRING)
          end
        end
      end

      # Applies the :hashtags option to @tokens.
      #   :remove         - drops tokens that start with a # sign
      #   :keep_and_clean - splits hyphenated hashtags at the hyphens, then
      #                     strips the leading # sign
      # Any other value leaves the tokens untouched.
      def hashtags!
        if @hashtags == :remove
          @tokens.delete_if { |token| token =~ REGEXP_HASHTAG_AT_START }
        elsif @hashtags == :keep_and_clean
          pieces = @tokens.flat_map do |token|
            token =~ REGEXP_HYPHEN_HASTAG ? token.split(REGEX_HYPHEN) : token
          end
          @tokens = pieces.map do |piece|
            next piece unless piece =~ REGEXP_HASHTAG_AT_START
            piece.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING)
          end
        end
      end

      # Drops every token matched by regex_various (emoji, e-mail addresses,
      # URLs and domains, depending on which remove_* options are set).
      def remove_various!
        @tokens.keep_if { |token| token !~ regex_various }
      end

      # Union of the patterns whose remove_* option is enabled, built once and
      # cached in @regex_various. With no option enabled the union of an empty
      # list is a pattern that never matches.
      def regex_various
        return @regex_various if @regex_various

        candidates = [
          [@remove_emoji,   REGEX_EMOJI_UNIFIED],
          [@remove_emails,  REGEX_EMAIL],
          [@remove_urls,    REGEX_URL],
          [@remove_domains, REGEX_DOMAIN]
        ]
        enabled = candidates.select { |flag, _pattern| flag }.map { |_flag, pattern| pattern }
        @regex_various = Regexp.union(enabled)
      end

      # Splits tokens longer than @long_word_split characters, first at
      # hyphens and then (for pieces that are still too long) at underscores.
      def split_long_words!
        @tokens = [REGEX_HYPHEN, REGEX_UNDERSCORE].reduce(@tokens) do |tokens, separator|
          tokens.flat_map do |token|
            token.length > @long_word_split ? token.split(separator) : token
          end
        end
      end

      # Returns the text in the case the tokenizer outputs: downcased when
      # the :downcase option is on, unchanged otherwise.
      def chosen_case(token)
        return token unless @downcase
        Unicode.downcase(token)
      end

      # Returns a downcased form of the token for dictionary lookups. When
      # the :downcase option is on the token is returned as-is; otherwise it
      # is downcased here.
      def inverse_case(token)
        return token if @downcase
        Unicode.downcase(token)
      end

  end
end