# encoding: utf-8
require 'set'
require 'linguistics'

Linguistics.use(:en, classes: [String])

module Ebooks
  module NLP
    # We don't necessarily want to use all of this stuff all the time.
    # Only load it when it is needed.

    def self.stopwords
      @stopwords ||= File.read(File.join(DATA_PATH, 'stopwords.txt')).split
    end

    def self.nouns
      @nouns ||= File.read(File.join(DATA_PATH, 'nouns.txt')).split
    end

    def self.adjectives
      @adjectives ||= File.read(File.join(DATA_PATH, 'adjectives.txt')).split
    end

    def self.tokenizer
      # This tokenizer is used for dividing sentences into words.
      # It's too slow for finding sentences in paragraphs, hence tactful.
      require 'tokenizer'
      @tokenizer ||= Tokenizer::Tokenizer.new(:en)
    end

    def self.tactful
      require 'tactful_tokenizer'
      @tactful ||= TactfulTokenizer::Model.new
    end

    def self.tagger
      require 'engtagger'
      @tagger ||= EngTagger.new
    end

    def self.stemmer
      require 'lingua/stemmer'
      @stemmer ||= Lingua::Stemmer.new
    end

    def self.gingerice
      require 'gingerice'
      Gingerice::Parser.new # No caching for this one
    end

    def self.htmlentities
      require 'htmlentities'
      @htmlentities ||= HTMLEntities.new
    end

    ### Utility functions which wrap the above

    def self.sentences(text)
      tactful.tokenize_text(text)
    end

    def self.normalize(text)
      htmlentities.decode text.gsub('“', '"').gsub('”', '"').gsub('’', "'").gsub('…', '...')
    end

    def self.tokenize(sentence)
      # This is hacky, but an ad hoc approach seems to be
      # most reliable for now. Tokenization libraries have oddities
      # that are hard to correct.
      sentence.split(/\s/).map do |token|
        exceptions = [/^\w\)$/, /^@/, /^#/, /^:\w$/, /^http/]
        if exceptions.find { |r| r.match(token) }
          token
        else
          token.split(/(?<=^[#{PUNCTUATION}])(?=[a-zA-Z])|(?<=[a-zA-Z])(?=[#{PUNCTUATION}]+$)/)
        end
      end.flatten
    end

    def self.tokenset(sentence)
      tokens = sentence.is_a?(Array) ? sentence : tokenize(sentence)
      tokens.map(&:downcase)
            .reject { |token| stopwords.include?(token) }
            .to_set
    end

    def self.space_between?(token1, token2)
      p1 = self.punctuation?(token1)
      p2 = self.punctuation?(token2)

      if p1 && p2      # "foo?!"
        false
      elsif !p1 && p2  # "foo."
        false
      elsif p1 && !p2  # "foo. rah"
        true
      else             # "foo rah"
        true
      end
    end

    def self.reconstruct(tokens)
      # Put tokens back together into a nice-looking sentence
      text = ""
      last_token = nil
      tokens.each do |token|
        text += ' ' if last_token && space_between?(last_token, token)
        text += token
        last_token = token
      end
      text
    end

    # Deliberately limit our punctuation handling to stuff we can do consistently.
    # It'll just be a part of a token if we don't split it out, and that's fine.
    PUNCTUATION = ".?!,"

    def self.punctuation?(token)
      (token.chars.to_set - PUNCTUATION.chars.to_set).empty?
    end

    def self.unmatched_enclosers?(text)
      # Weird quotes are an instant giveaway. Let's do paren-matching.
      enclosers = ['**', '""', '()', '[]', '``']

      enclosers.each do |pair|
        starter = Regexp.new('(\W|^)' + Regexp.escape(pair[0]) + '\S')
        ender = Regexp.new('\S' + Regexp.escape(pair[1]) + '(\W|$)')

        opened = 0

        tokenize(text).each do |token|
          opened += 1 if token.match(starter)
          opened -= 1 if token.match(ender)

          return true if opened < 0 # Too many ends!
        end

        return true if opened != 0 # Mismatch somewhere.
      end

      false
    end
  end
end
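
# Illustrative usage sketch (not part of the library). Assumes the DATA_PATH
# constant defined elsewhere in the gem points at the bundled word lists, and
# that the lazily-required gems above (tactful_tokenizer, htmlentities, ...)
# are installed. The input string is made up for the example.
#
#   text     = Ebooks::NLP.normalize("He said “hello”… And then he left.")
#   sentence = Ebooks::NLP.sentences(text).first   # first detected sentence
#   tokens   = Ebooks::NLP.tokenize(sentence)      # words with trailing .?!, split off
#   Ebooks::NLP.tokenset(tokens)                   # lowercased Set minus stopwords
#   Ebooks::NLP.reconstruct(tokens)                # rejoins tokens with sensible spacing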