module Eps
  class BaseEstimator
    # Builds an estimator and, when data is supplied, trains it immediately.
    #
    # data    - training data handed to +train+; when nil/false nothing is trained
    # y       - optional target values, passed through to +train+
    # options - keyword options; a full copy is stored in @options, and every
    #           option except +:intercept+ is forwarded to +train+
    def initialize(data = nil, y = nil, **options)
      @options = options.dup
      @trained = false
      return unless data

      # TODO better pattern - don't pass most options to train
      train_options = options.reject { |name, _value| name == :intercept }
      train(data, y, **train_options)
    end

    # Predicts for a single row or for a collection of rows.
    #
    # A Hash is treated as one row and produces one prediction (the first
    # element of the evaluator's result); any other input is wrapped in an
    # Eps::DataFrame and produces the evaluator's full result.
    #
    # Raises ArgumentError when a column required by the evaluator is absent,
    # or when the detected type of a column conflicts with the feature type.
    def predict(data)
      single_row = data.is_a?(Hash)
      frame = Eps::DataFrame.new(single_row ? [data] : data)

      @evaluator.features.each do |name, expected|
        column = frame.columns[name]
        raise ArgumentError, "Missing column: #{name}" unless column

        actual = Utils.column_type(column.compact, name)

        # numeric features need numeric input; every other feature type has to
        # arrive as categorical. A nil detected type is accepted without a check.
        wanted = expected == "numeric" ? "numeric" : "categorical"
        unless actual.nil? || actual == wanted
          raise ArgumentError, "Bad type for column #{name}: Expected #{expected} but got #{actual}"
        end
        # TODO check for unknown values for categorical features
      end

      results = @evaluator.predict(frame)
      single_row ? results.first : results
    end

    # Scores the model's predictions on the given data.
    #
    # data   - rows to evaluate, prepared through +prep_data+
    # y      - optional target values
    # target - target column name; falls back to @target when not given
    # weight - optional weights (passed to +prep_data+)
    #
    # Returns whatever Eps.metrics returns for the labels, the predictions and
    # the weights of the prepared data.
    def evaluate(data, y = nil, target: nil, weight: nil)
      frame, _target = prep_data(data, y, target || @target, weight)

      actual = frame.label
      predicted = predict(frame)
      Eps.metrics(actual, predicted, weight: frame.weight)
    end

    # Returns the PMML for this model, generating it through PMML.generate on
    # first use and reusing the value stored in @pmml afterwards.
    def to_pmml
      return @pmml if @pmml

      @pmml = PMML.generate(self)
    end

    # Builds a model from PMML without training.
    #
    # pmml - the PMML input; handed to PMML.load to obtain the evaluator. If it
    #        responds to +to_xml+ the result of that call is cached as the
    #        model's PMML, otherwise the input itself is cached.
    #
    # Returns the new model instance.
    def self.load_pmml(pmml)
      new.tap do |model|
        model.instance_variable_set("@evaluator", PMML.load(pmml))

        # cache data
        source = pmml.respond_to?(:to_xml) ? pmml.to_xml : pmml
        model.instance_variable_set("@pmml", source)
      end
    end

    # Builds a human-readable summary of a trained model.
    #
    # When a validation set is present, the summary starts with one validation
    # metric: RMSE for a "numeric" target type, accuracy (as a percentage) for
    # anything else. The rest comes from +_summary+.
    #
    # extended - forwarded to +_summary+
    #
    # Raises RuntimeError when the model was not trained by this instance.
    def summary(extended: false)
      raise "Summary not available for loaded models" unless @trained

      report = String.new("")

      if @validation_set
        actual = @validation_set.label
        predicted = predict(@validation_set)
        weights = @validation_set.weight

        label, value =
          case @target_type
          when "numeric"
            rmse = Metrics.rmse(actual, predicted, weight: weights)
            rounded = rmse.round
            # large errors are shown as whole numbers, small ones with 3 significant digits
            ["RMSE", rounded >= 1000 ? rounded.to_s : "%.3g" % rmse]
          else
            accuracy = Metrics.accuracy(actual, predicted, weight: weights)
            ["accuracy", "%.1f%%" % (100 * accuracy).round(1)]
          end

        report << "Validation %s: %s\n\n" % [label, value]
      end

      report << _summary(extended: extended)
    end

    private

    # Prepares the data, optionally carves out a validation set, works out the
    # feature types (including text features), trains through +_train+ and
    # stores the resulting evaluator.
    #
    # data           - training data, converted by +prep_data+
    # y              - optional target values
    # target         - target column name (+prep_data+ falls back to "target")
    # weight         - optional weights, resolved by +prep_data+
    # split          - nil (split automatically when there are at least 30
    #                  rows), true, false, a Hash with any of +:validation_size+,
    #                  +:column+, +:value+, +:shuffle+, or any other value,
    #                  which is taken as the split column
    # validation_set - explicit validation data; disables splitting
    # verbose        - forwarded to +_train+
    # text_features  - text feature names, optionally mapped to per-feature
    #                  settings; nil means detect them automatically
    # early_stopping - forwarded to +_train+
    #
    # Sets @target, @target_type, @features, @text_features, @train_set,
    # @validation_set, @evaluator and @trained, and clears @pmml.
    #
    # Returns nil. Raises when the training or validation set ends up empty.
    def train(data, y = nil, target: nil, weight: nil, split: nil, validation_set: nil, verbose: nil, text_features: nil, early_stopping: nil)
      data, @target = prep_data(data, y, target, weight)
      @target_type = Utils.column_type(data.label, @target)

      # default: only hold out validation data when there are enough rows
      if split.nil?
        split = data.size >= 30
      end

      # cross validation
      # TODO adjust based on weight
      if split && !validation_set
        # normalize the split option to a Hash
        split = {} if split == true
        split = {column: split} unless split.is_a?(Hash)

        # fraction of rows intended for training (validation defaults to 25%)
        split_p = 1 - (split[:validation_size] || 0.25)
        if split[:column]
          # split on the values of a column: rows below the cut-off train,
          # the rest validate. The column is removed from the data here, so
          # it is not picked up as a feature below.
          split_column = split[:column].to_s
          times = data.columns.delete(split_column)
          check_missing(times, split_column)
          split_index = (times.size * split_p).round
          # NOTE(review): when split_index equals times.size (e.g.
          # validation_size of 0) the lookup below yields nil unless
          # split[:value] is given - confirm whether that needs a guard
          split_time = split[:value] || times.sort[split_index]
          train_idx, validation_idx = (0...data.size).to_a.partition { |i| times[i] < split_time }
        else
          if split[:shuffle] != false
            # each row goes to the training side with probability split_p;
            # the fixed seed keeps the assignment the same on every run
            rng = Random.new(0) # seed random number generator
            train_idx, validation_idx = (0...data.size).to_a.partition { rng.rand < split_p }
          else
            # no shuffling: the leading rows train, the trailing rows validate
            split_index = (data.size * split_p).round
            train_idx, validation_idx = (0...data.size).to_a.partition { |i| i < split_index }
          end
        end
      end

      # determine feature types
      # (nil values are ignored when detecting a column's type)
      @features = {}
      data.columns.each do |k, v|
        @features[k] = Utils.column_type(v.compact, k)
      end

      # determine text features if not specified
      if text_features.nil?
        text_features = []

        @features.each do |k, type|
          next if type != "categorical"

          values = data.columns[k].compact

          next unless values.first.is_a?(String) # not boolean

          values = values.reject(&:empty?)
          count = values.count

          # check if spaces
          # two spaces is rough approximation for 3 words
          # TODO make more performant
          # a column counts as text when more than half of its non-empty
          # values contain at least two spaces
          if values.count { |v| v.count(" ") >= 2 } > 0.5 * count
            text_features << k
          end
        end
      end

      # prep text features
      # (an Array of names yields nil for v, so only the defaults apply;
      # a Hash lets each feature override the defaults)
      @text_features = {}
      (text_features || {}).each do |k, v|
        # overrides the type detected above
        @features[k.to_s] = "text"

        # same output as scikit-learn CountVectorizer
        # except for max_features
        @text_features[k.to_s] = {
          tokenizer: /\W+/,
          min_length: 2,
          max_features: 100
        }.merge(v || {})
      end

      if split && !validation_set
        # use the row indexes computed above
        @train_set = data[train_idx]
        validation_set = data[validation_idx]
      else
        @train_set = data.dup
        if validation_set
          # an explicit validation set is prepared with the same target and
          # weight arguments as the training data
          raise "Target required for validation set" unless target
          raise "Weight required for validation set" if data.weight && !weight
          validation_set, _ = prep_data(validation_set, nil, @target, weight)
        end
      end

      raise "No data in training set" if @train_set.empty?
      raise "No data in validation set" if validation_set && validation_set.empty?

      @validation_set = validation_set
      # _train is not defined in this class - presumably supplied by each
      # concrete estimator; verify against subclasses
      @evaluator = _train(verbose: verbose, early_stopping: early_stopping)

      # reset pmml
      @pmml = nil

      @trained = true

      nil
    end

    # Wraps the input in an Eps::DataFrame and attaches the label and,
    # optionally, the weight.
    #
    # data   - anything Eps::DataFrame.new accepts
    # y      - target values; when nil they are removed from the column named
    #          by +target+
    # target - target column name, defaults to "target"
    # weight - nil for no weights, an object responding to +to_a+ holding the
    #          weights, or otherwise the name of the column that holds them
    #          (that column is removed from the data)
    #
    # Returns [data frame, target name as String]. Raises ArgumentError (via
    # +check_missing+) when the target or weight is missing or contains nil.
    def prep_data(data, y, target, weight)
      frame = Eps::DataFrame.new(data)

      # target
      target_name = (target || "target").to_s
      labels = y || frame.columns.delete(target_name)
      check_missing(labels, target_name)
      frame.label = labels.to_a

      # weight
      if weight
        weights = weight.respond_to?(:to_a) ? weight.to_a : frame.columns.delete(weight.to_s)
        check_missing(weights, "weight")
        frame.weight = weights.to_a
      end

      check_data(frame)
      [frame, target_name]
    end

    # Replaces every text column of +train_set+ with one 0/1 indicator column
    # per vocabulary word, keyed by [column name, word].
    #
    # For each entry of @text_features a fresh Eps::TextEncoder is fitted on the
    # column (which is removed from the set), stored in @text_encoders, and its
    # vocabulary is written back into the feature's settings.
    #
    # Raises RuntimeError when the set has no columns left afterwards.
    def prep_text_features(train_set)
      @text_encoders = {}

      @text_features.each do |name, settings|
        # reset vocabulary
        settings.delete(:vocabulary)

        # TODO determine max features automatically
        # start based on number of rows
        encoder = Eps::TextEncoder.new(**settings)
        row_counts = encoder.fit(train_set.columns.delete(name))

        # start every vocabulary word with an all-zero indicator column
        encoder.vocabulary.each do |word|
          train_set.columns[[name, word]] = Array.new(row_counts.size, 0)
        end

        # flag the rows in which a vocabulary word occurs; words outside the
        # vocabulary have no column and are skipped
        row_counts.each_with_index do |word_counts, row|
          word_counts.each do |word, _count|
            column_key = [name, word]
            next unless train_set.columns.key?(column_key)

            train_set.columns[column_key][row] = 1
          end
        end

        @text_encoders[name] = encoder

        # update vocabulary
        settings[:vocabulary] = encoder.vocabulary
      end

      raise "No features left" if train_set.columns.empty?
    end

    # Validates that the data is non-empty and that the label (and the weight,
    # when present) has one entry per data point.
    #
    # Raises RuntimeError describing the first violated condition.
    def check_data(data)
      raise "No data" if data.empty?

      rows = data.size
      raise "Number of data points differs from target" unless rows == data.label.size

      weights = data.weight
      raise "Number of data points differs from weight" if weights && rows != weights.size
    end

    # Ensures a column exists and holds no nil values.
    #
    # c    - the column values, or nil/false when the column was not found
    # name - column name used in the error message
    #
    # Raises ArgumentError when the column is absent or contains nil.
    def check_missing(c, name)
      raise ArgumentError, "Missing column: #{name}" unless c
      return unless c.any? { |value| value.nil? }

      raise ArgumentError, "Missing values in column #{name}"
    end

    # Runs +check_missing+ on every column of the data frame, so the first
    # column containing nil raises ArgumentError.
    def check_missing_value(df)
      df.columns.each { |name, values| check_missing(values, name) }
    end

    # Formats a field key for display.
    #
    # Non-Array keys are returned unchanged. Array keys are rendered as
    # "name(word)" when the first element names a text feature, and as the
    # elements joined with "=" otherwise.
    def display_field(k)
      return k unless k.is_a?(Array)
      return k.join("=") unless @features[k.first] == "text"

      "#{k.first}(#{k.last})"
    end
  end
end