# -*- encoding : utf-8 -*-

module PragmaticSegmenter
  # This class searches for periods within an abbreviation and
  # replaces the periods.
  class AbbreviationReplacer

    SENTENCE_STARTERS = %w(A Being Did For He How However I In It Millions More She That The There They We What When Where Who Why)

    attr_reader :text
    def initialize(text:, language: )
      @text = Text.new(text)
      @language = language
    end

    def replace
      @text.apply(@language::PossessiveAbbreviationRule,
        @language::KommanditgesellschaftRule,
        @language::SingleLetterAbbreviationRules::All)

      @text = search_for_abbreviations_in_string(@text)
      @text = replace_multi_period_abbreviations(@text)
      @text.apply(@language::AmPmRules::All)
      replace_abbreviation_as_sentence_boundary(@text)
    end

    private

    def search_for_abbreviations_in_string(txt)
      original = txt.dup
      downcased = txt.downcase
      @language::Abbreviation::ABBREVIATIONS.each do |a|
        next unless downcased.include?(a.strip)
        abbrev_match = original.scan(/(?:^|\s|\r|\n)#{Regexp.escape(a.strip)}/i)
        next if abbrev_match.empty?
        next_word_start = /(?<=#{Regexp.escape(a.strip)} ).{1}/
        character_array = @text.scan(next_word_start)
        abbrev_match.each_with_index do |am, index|
          txt = scan_for_replacements(txt, am, index, character_array)
        end
      end
      txt
    end

    def scan_for_replacements(txt, am, index, character_array)
      character = character_array[index]
      prepositive = @language::Abbreviation::PREPOSITIVE_ABBREVIATIONS
      number_abbr = @language::Abbreviation::NUMBER_ABBREVIATIONS
      upper = /[[:upper:]]/.match(character.to_s)
      if upper.nil? || prepositive.include?(am.downcase.strip)
        if prepositive.include?(am.downcase.strip)
          txt = replace_prepositive_abbr(txt, am)
        elsif number_abbr.include?(am.downcase.strip)
          txt = replace_pre_number_abbr(txt, am)
        else
          txt = replace_period_of_abbr(txt, am)
        end
      end
      txt
    end

    def replace_abbreviation_as_sentence_boundary(txt)
      # As we are being conservative and keeping ambiguous
      # sentence boundaries as one sentence instead of
      # splitting into two, we can split at words that
      # we know for certain never follow these abbreviations.
      # Some might say that the set of words that follow an
      # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
      # the set of words that could start a sentence and
      # never follow U.S. However, we are being conservative
      # and not splitting by default, so we need to look for places
      # where we definitely can split. Obviously SENTENCE_STARTERS
      # will never cover all cases, but as the gem is named
      # 'Pragmatic Segmenter' we need to be pragmatic
      # and try to cover the words that most often start a
      # sentence but could never follow one of the abbreviations below.

      SENTENCE_STARTERS.each do |word|
        txt.gsub!(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
        txt.gsub!(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
        txt.gsub!(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
        txt.gsub!(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
        txt.gsub!(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
        txt.gsub!(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
        txt.gsub!(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
        txt.gsub!(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
        txt.gsub!(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
        txt.gsub!(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
        txt.gsub!(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
      end
      txt
    end

    def replace_multi_period_abbreviations(txt)
      mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
      return txt if mpa.empty?
      mpa.each do |r|
        txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
      end
      txt
    end

    def replace_pre_number_abbr(txt, abbr)
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
      txt
    end

    def replace_prepositive_abbr(txt, abbr)
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
      txt
    end

    def replace_period_of_abbr(txt, abbr)
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
      txt
    end

    def replace_possessive_abbreviations(txt)
      txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
      txt
    end
  end
end