module Eps
  class NaiveBayes < BaseEstimator
    attr_reader :probabilities

    # Scores the model against its own training data: the training labels and
    # the predictions for the training set are handed to Eps::Metrics.accuracy.
    def accuracy
      actual = @train_set.label
      predicted = predict(@train_set)
      Eps::Metrics.accuracy(actual, predicted)
    end

    # pmml

    # Builds an Evaluators::NaiveBayes from a PMML document.
    #
    # The parsing happens inside the block passed to +super+; the block receives
    # the document and reads:
    #   * priors from the BayesOutput TargetValueCount elements,
    #   * per-feature statistics from each BayesInput (Gaussian mean/stdev for
    #     TargetValueStat entries, BigDecimal counts for PairCounts entries),
    #   * DerivedField/NormDiscrete mappings.
    #
    # The +legacy+ flag is set when at least one BayesInput uses the layout
    # written by Eps < 0.3 (PairCounts values equal to the prior labels).
    def self.load_pmml(data)
      super do |doc|
        # TODO more validation
        model = doc.css("NaiveBayesModel")

        # label => count, in document order
        prior = {}
        model.css("BayesOutput TargetValueCount").each do |count_node|
          prior[count_node.attribute("value").value] = count_node.attribute("count").value.to_f
        end

        legacy = false
        conditional = {}
        features = {}

        model.css("BayesInput").each do |input|
          prob = {}

          # numeric
          input.css("TargetValueStat").each do |stat|
            gaussian = stat.css("GaussianDistribution")
            label = stat.attribute("value").value
            prob[label] = {
              mean: gaussian.attribute("mean").value.to_f,
              # PMML stores the variance; the evaluator is given the stdev
              stdev: Math.sqrt(gaussian.attribute("variance").value.to_f)
            }
          end

          pair_nodes = input.css("PairCounts")

          # detect bad form in Eps < 0.3
          bad_format = pair_nodes.map { |pair| pair.attribute("value").value } == prior.keys

          # categorical
          pair_nodes.each do |pair|
            count_nodes = pair.css("TargetValueCount")

            if bad_format
              # old layout nests the other way round, so the keys are swapped
              count_nodes.each do |count_node|
                bucket = (prob[count_node.attribute("value").value] ||= {})
                bucket[pair.attribute("value").value] = BigDecimal(count_node.attribute("count").value)
              end
            else
              counts = {}
              count_nodes.each do |count_node|
                counts[count_node.attribute("value").value] = BigDecimal(count_node.attribute("count").value)
              end
              prob[pair.attribute("value").value] = counts
            end
          end

          if bad_format
            legacy = true
            # make sure every entry has a value for every prior label
            prob.each_value do |counts|
              prior.each_key do |label|
                counts[label] ||= 0.0
              end
            end
          end

          name = input.attribute("fieldName").value
          conditional[name] = prob
          features[name] = input.css("TargetValueStat").any? ? "numeric" : "categorical"
        end

        # The target field name is read but not used further; the lookup is kept
        # because it raises when BayesOutput or its fieldName is missing.
        _target = model.css("BayesOutput").attribute("fieldName").value

        # get derived fields
        derived = {}
        doc.css("DerivedField").each do |field_node|
          name = field_node.attribute("name").value
          norm = field_node.css("NormDiscrete")
          field = norm.attribute("field").value
          value = norm.attribute("value").value

          # the derived column is replaced by its source field
          features.delete(name)
          features[field] = "derived"
          (derived[field] ||= {})[name] = value
        end

        Evaluators::NaiveBayes.new(
          probabilities: {prior: prior, conditional: conditional},
          features: features,
          derived: derived,
          legacy: legacy
        )
      end
    end

    private

    # TODO better summary
    # Renders one "label: value" line per prior entry, in hash order.
    # The +extended+ flag is accepted but does not affect the output.
    def _summary(extended: false)
      probabilities[:prior].each_with_object(String.new("")) do |(label, count), out|
        out << "#{label}: #{count}\n"
      end
    end

    # Fits the model on @train_set and returns an Evaluators::NaiveBayes.
    #
    # The prior is the number of rows per label (labels sorted). For each
    # categorical feature the per-value label counts are collected and, unless
    # +smoothing+ is nil/false, adjusted with the smoothing constant. For any
    # other non-text feature the per-label mean and stdev are stored.
    #
    # smoothing - constant added to each count (default 1); nil/false disables it.
    # options   - accepted for interface compatibility; not used here.
    #
    # Raises RuntimeError when the target is not categorical or when a text
    # feature is present.
    def _train(smoothing: 1, **options)
      raise "Target must be strings" unless @target_type == "categorical"
      check_missing_value(@train_set)
      check_missing_value(@validation_set) if @validation_set

      data = @train_set

      prep_text_features(data)

      # convert boolean to strings
      data.label = data.label.map(&:to_s)

      # row positions for every label
      rows_by_label = {}
      data.label.each_with_index do |label, row|
        (rows_by_label[label] ||= []) << row
      end

      # subset of the training data for every label
      grouped = rows_by_label.each_with_object({}) do |(label, rows), subsets|
        subsets[label] = data[rows]
      end

      prior = {}
      grouped.sort_by { |label, _| label }.each do |label, subset|
        prior[label] = subset.size
      end
      labels = prior.keys

      # zeroed template so every value carries a count for every label
      target_counts = {}
      labels.each { |label| target_counts[label] = 0 }

      conditional = {}

      @features.each do |name, type|
        prob = {}

        case type
        when "text"
          raise "Text features not supported yet for naive Bayes"
        when "categorical"
          rows_by_value = {}
          data.columns[name].each_with_index do |value, row|
            (rows_by_value[value] ||= []) << row
          end

          rows_by_value.each do |value, rows|
            prob[value] = group_count(data[rows].label, target_counts.dup)
          end

          # smooth
          if smoothing
            labels.each do |label|
              # total is taken before any count for this label is rewritten
              total = prob.values.map { |counts| counts[label] }.sum.to_f
              prob.each_value do |counts|
                counts[label] = (counts[label] + smoothing) * total / (total + (prob.size * smoothing))
              end
            end
          end
        else
          labels.each do |label|
            subset = grouped[label]

            # TODO handle this case
            next unless subset

            values = subset.columns[name]
            prob[label] = {mean: mean(values), stdev: stdev(values)}
          end
        end

        conditional[name] = prob
      end

      @probabilities = {
        prior: prior,
        conditional: conditional
      }

      Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
    end

    # Serializes the trained probabilities as a PMML NaiveBayesModel.
    #
    # +data_fields+ (target first, then every feature) is passed to build_pmml
    # and also drives the MiningSchema; categorical features carry their
    # observed values, all other features carry nil. Entries are written in
    # sorted key order.
    #
    # Returns whatever build_pmml returns.
    def generate_pmml
      prior = probabilities[:prior]
      conditional = probabilities[:conditional]

      data_fields = {}
      data_fields[@target] = prior.keys
      conditional.each do |name, stats|
        data_fields[name] = @features[name] == "categorical" ? stats.keys : nil
      end

      build_pmml(data_fields) do |xml|
        xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
          xml.MiningSchema do
            data_fields.each_key do |name|
              xml.MiningField(name: name)
            end
          end
          xml.BayesInputs do
            conditional.each do |name, stats|
              xml.BayesInput(fieldName: name) do
                if @features[name] == "categorical"
                  stats.sort_by { |value, _| value }.each do |value, counts|
                    xml.PairCounts(value: value) do
                      xml.TargetValueCounts do
                        counts.sort_by { |label, _| label }.each do |label, count|
                          xml.TargetValueCount(value: label, count: count)
                        end
                      end
                    end
                  end
                else
                  xml.TargetValueStats do
                    stats.sort_by { |label, _| label }.each do |label, dist|
                      xml.TargetValueStat(value: label) do
                        # NOTE(review): stdev is nil for a label with a single
                        # row, in which case this raises NoMethodError.
                        xml.GaussianDistribution(mean: dist[:mean], variance: dist[:stdev]**2)
                      end
                    end
                  end
                end
              end
            end
          end
          # Reference the same target field that the MiningSchema declares,
          # rather than a fixed "target" literal.
          xml.BayesOutput(fieldName: @target) do
            xml.TargetValueCounts do
              prior.sort_by { |label, _| label }.each do |label, count|
                xml.TargetValueCount(value: label, count: count)
              end
            end
          end
        end
      end
    end

    # Adds one to start[element] for every element of +arr+ and returns
    # +start+ itself (the hash is modified in place). Every element must
    # already have a numeric entry in +start+.
    def group_count(arr, start)
      arr.each_with_object(start) { |item, counts| counts[item] += 1 }
    end

    # Arithmetic mean of +arr+; the divisor is converted to a Float, so an
    # empty array of numbers yields NaN rather than raising.
    def mean(arr)
      total = arr.sum
      count = arr.size.to_f
      total / count
    end

    # Sample standard deviation of +arr+ (divides by n - 1).
    # Returns nil when there are fewer than two values.
    def stdev(arr)
      return nil unless arr.size > 1

      center = mean(arr)
      squared = 0
      arr.each { |value| squared += (value - center)**2 }
      Math.sqrt(squared / (arr.length - 1).to_f)
    end
  end
end