module Eps
  class Regressor < BaseRegressor
    def initialize(data, y = nil, target: nil, gsl: nil)
      raise ArgumentError, "missing target" if !target && !y

      target = prep_target(target, data) if target

      # TODO more performant conversion
      if daru?(data)
        y ||= data[target].to_a
        x = data.dup.delete_vector(target)
      else
        x = data.map(&:dup)
        y ||= x.map { |v| v.delete(target) }
      end

      @x = x
      @y = prep_y(y.to_a)

      if @x.size != @y.size
        raise "Number of samples differs from target"
      end

      @target = target
      @gsl = gsl.nil? ? defined?(GSL) : gsl

      # fit immediately
      coefficients
    end

    def coefficients
      @coefficients ||= begin
        x, @coefficient_names = prep_x(@x)

        if x.size <= @coefficient_names.size
          raise "Number of samples must be at least two more than number of features"
        end

        v =
          if @gsl
            x = GSL::Matrix.alloc(*x)
            y = GSL::Vector.alloc(@y)
            c, @covariance, _, _ = GSL::MultiFit::linear(x, y)
            c.to_a
          else
            x = Matrix.rows(x)
            y = Matrix.column_vector(@y)
            removed = []

            # https://statsmaths.github.io/stat612/lectures/lec13/lecture13.pdf
            # unforutnately, this method is unstable
            # haven't found an efficient way to do QR-factorization in Ruby
            # the extendmatrix gem has householder and givens (givens has bug)
            # but methods are too slow
            begin
              @xtxi = (x.t * x).inverse
            rescue ExceptionForMatrix::ErrNotRegular
              constant = {}
              (1...x.column_count).each do |i|
                constant[i] = constant?(x.column(i))
              end

              # remove constant columns
              removed = constant.select { |_, v| v }.keys

              # remove non-independent columns
              constant.select { |_, v| !v }.keys.combination(2) do |c|
                if !x.column(c[0]).independent?(x.column(c[1]))
                  removed << c[1]
                end
              end

              vectors = x.column_vectors
              # delete in reverse of indexes stay the same
              removed.sort.reverse.each do |i|
                # @coefficient_names.delete_at(i)
                vectors.delete_at(i)
              end
              x = Matrix.columns(vectors)

              # try again
              begin
                @xtxi = (x.t * x).inverse
              rescue ExceptionForMatrix::ErrNotRegular
                raise "Multiple solutions - GSL is needed to select one"
              end
            end
            v2 = matrix_arr(@xtxi * x.t * y)

            # add back removed
            removed.sort.each do |i|
              v2.insert(i, 0)
            end
            @removed = removed

            v2
          end

        Hash[@coefficient_names.zip(v)]
      end
    end

    def evaluate(data, y = nil)
      actual = y

      actual ||=
        if daru?(data)
          data[@target].to_a
        else
          data.map { |v| v[@target] }
        end

      actual = prep_y(actual)

      estimated = predict(data)
      Eps.metrics(actual, estimated)
    end

    # https://people.richland.edu/james/ictcm/2004/multiple.html
    def summary(extended: false)
      @summary_str ||= begin
        str = String.new("")
        len = [coefficients.keys.map(&:size).max, 15].max
        if extended
          str += "%-#{len}s %12s %12s %12s %12s\n" % ["", "coef", "stderr", "t", "p"]
        else
          str += "%-#{len}s %12s %12s\n" % ["", "coef", "p"]
        end
        coefficients.each do |k, v|
          if extended
            str += "%-#{len}s %12.2f %12.2f %12.2f %12.3f\n" % [k, v, std_err[k], t_value[k], p_value[k]]
          else
            str += "%-#{len}s %12.2f %12.3f\n" % [k, v, p_value[k]]
          end
        end
        str += "\n"
        str += "r2: %.3f\n" % [r2] if extended
        str += "adjusted r2: %.3f\n" % [adjusted_r2]
        str
      end
    end

    def r2
      @r2 ||= (sst - sse) / sst
    end

    def adjusted_r2
      @adjusted_r2 ||= (mst - mse) / mst
    end

    private

    def constant?(arr)
      arr.all? { |x| x == arr[0] }
    end

    # determine if target is a string or symbol
    def prep_target(target, data)
      if daru?(data)
        data.has_vector?(target) ? target : flip_target(target)
      else
        x = data[0] || {}
        x[target] ? target : flip_target(target)
      end
    end

    def flip_target(target)
      target.is_a?(String) ? target.to_sym : target.to_s
    end

    def prep_y(y)
      y.each do |yi|
        raise "Target missing in data" if yi.nil?
      end
      y.map(&:to_f)
    end

    # add epsilon for perfect fits
    # consistent with GSL
    def t_value
      @t_value ||= Hash[coefficients.map { |k, v| [k, v / (std_err[k] + Float::EPSILON)] }]
    end

    def p_value
      @p_value ||= begin
        Hash[coefficients.map do |k, _|
          tp =
            if @gsl
              GSL::Cdf.tdist_P(t_value[k].abs, degrees_of_freedom)
            else
              tdist_p(t_value[k].abs, degrees_of_freedom)
            end

          [k, 2 * (1 - tp)]
        end]
      end
    end

    def std_err
      @std_err ||= begin
        Hash[@coefficient_names.zip(diagonal.map { |v| Math.sqrt(v) })]
      end
    end

    def diagonal
      @diagonal ||= begin
        if covariance.respond_to?(:each)
          d = covariance.each(:diagonal).to_a
          @removed.each do |i|
            d.insert(i, 0)
          end
          d
        else
          covariance.diagonal.to_a
        end
      end
    end

    def covariance
      @covariance ||= mse * @xtxi
    end

    def y_bar
      @y_bar ||= mean(@y)
    end

    def y_hat
      @y_hat ||= predict(@x)
    end

    # total sum of squares
    def sst
      @sst ||= sum(@y.map { |y| (y - y_bar)**2 })
    end

    # sum of squared errors of prediction
    # not to be confused with "explained sum of squares"
    def sse
      @sse ||= sum(@y.zip(y_hat).map { |y, yh| (y - yh)**2 })
    end

    def mst
      @mst ||= sst / (@y.size - 1)
    end

    def mse
      @mse ||= sse / degrees_of_freedom
    end

    def degrees_of_freedom
      @y.size - coefficients.size
    end

    def sum(arr)
      arr.inject(0, &:+)
    end

    def mean(arr)
      sum(arr) / arr.size.to_f
    end

    ### Extracted from https://github.com/estebanz01/ruby-statistics
    ### The Ruby author is Esteban Zapata Rojas
    ###
    ### Originally extracted from https://codeplea.com/incomplete-beta-function-c
    ### This function is shared under zlib license and the author is Lewis Van Winkle
    def tdist_p(value, degrees_of_freedom)
      upper = (value + Math.sqrt(value * value + degrees_of_freedom))
      lower = (2.0 * Math.sqrt(value * value + degrees_of_freedom))

      x = upper/lower

      alpha = degrees_of_freedom/2.0
      beta = degrees_of_freedom/2.0

      incomplete_beta_function(x, alpha, beta)
    end

    ### Extracted from https://github.com/estebanz01/ruby-statistics
    ### The Ruby author is Esteban Zapata Rojas
    ###
    ### This implementation is an adaptation of the incomplete beta function made in C by
    ### Lewis Van Winkle, which released the code under the zlib license.
    ### The whole math behind this code is described in the following post: https://codeplea.com/incomplete-beta-function-c
    def incomplete_beta_function(x, alp, bet)
      return if x < 0.0
      return 1.0 if x > 1.0

      tiny = 1.0E-50

      if x > ((alp + 1.0)/(alp + bet + 2.0))
        return 1.0 - incomplete_beta_function(1.0 - x, bet, alp)
      end

      # To avoid overflow problems, the implementation applies the logarithm properties
      # to calculate in a faster and safer way the values.
      lbet_ab = (Math.lgamma(alp)[0] + Math.lgamma(bet)[0] - Math.lgamma(alp + bet)[0]).freeze
      front = (Math.exp(Math.log(x) * alp + Math.log(1.0 - x) * bet - lbet_ab) / alp.to_f).freeze

      # This is the non-log version of the left part of the formula (before the continuous fraction)
      # down_left = alp * self.beta_function(alp, bet)
      # upper_left = (x ** alp) * ((1.0 - x) ** bet)
      # front = upper_left/down_left

      f, c, d = 1.0, 1.0, 0.0

      returned_value = nil

      # Let's do more iterations than the proposed implementation (200 iters)
      (0..500).each do |number|
        m = number/2

        numerator = if number == 0
                      1.0
                    elsif number % 2 == 0
                      (m * (bet - m) * x)/((alp + 2.0 * m - 1.0)* (alp + 2.0 * m))
                    else
                      top = -((alp + m) * (alp + bet + m) * x)
                      down = ((alp + 2.0 * m) * (alp + 2.0 * m + 1.0))

                      top/down
                    end

        d = 1.0 + numerator * d
        d = tiny if d.abs < tiny
        d = 1.0 / d

        c = 1.0 + numerator / c
        c = tiny if c.abs < tiny

        cd = (c*d).freeze
        f = f * cd

        if (1.0 - cd).abs < 1.0E-10
          returned_value = front * (f - 1.0)
          break
        end
      end

      returned_value
    end
  end
end