lib/eps/naive_bayes.rb in eps-0.2.1 vs lib/eps/naive_bayes.rb in eps-0.3.0

- old
+ new

@@ -1,246 +1,257 @@
 module Eps
   class NaiveBayes < BaseEstimator
     attr_reader :probabilities

-    def initialize(probabilities: nil, target: nil)
-      @probabilities = probabilities
-      @target = target
+    def accuracy
+      Eps::Metrics.accuracy(@train_set.label, predict(@train_set))
     end

-    def train(*args)
-      super
+    # pmml

-      @y = @y.map { |yi| yi.to_s }
+    def self.load_pmml(data)
+      super do |data|
+        # TODO more validation
+        node = data.css("NaiveBayesModel")

-      prior = group_count(@y)
-      conditional = {}
-
-      if @x.any?
-        keys = @x.first.keys
-        x = @x.dup
-        x.each_with_index do |xi, i|
-          xi[@target] = @y[i]
+        prior = {}
+        node.css("BayesOutput TargetValueCount").each do |n|
+          prior[n.attribute("value").value] = n.attribute("count").value.to_f
         end

-        keys.each do |k|
-          conditional[k.to_s] = {}
-          x.group_by { |xi| xi[@target] }.each do |group, xs|
-            v = xs.map { |xi| xi[k] }
-            if categorical?(v[0])
-              # TODO apply smoothing
-              # apply smoothing only to
-              # 1. categorical features
-              # 2. conditional probabilities
-              # TODO more efficient count
-              conditional[k.to_s][group] = group_count(v)
+        legacy = false
+
+        conditional = {}
+        features = {}
+        node.css("BayesInput").each do |n|
+          prob = {}
+
+          # numeric
+          n.css("TargetValueStat").each do |n2|
+            n3 = n2.css("GaussianDistribution")
+            prob[n2.attribute("value").value] = {
+              mean: n3.attribute("mean").value.to_f,
+              stdev: Math.sqrt(n3.attribute("variance").value.to_f)
+            }
+          end
+
+          # detect bad format in Eps < 0.3
+          bad_format = n.css("PairCounts").map { |n2| n2.attribute("value").value } == prior.keys
+
+          # categorical
+          n.css("PairCounts").each do |n2|
+            if bad_format
+              n2.css("TargetValueCount").each do |n3|
+                prob[n3.attribute("value").value] ||= {}
+                prob[n3.attribute("value").value][n2.attribute("value").value] = BigDecimal(n3.attribute("count").value)
+              end
             else
-              conditional[k.to_s][group] = {mean: mean(v), stdev: stdev(v)}
+              boom = {}
+              n2.css("TargetValueCount").each do |n3|
+                boom[n3.attribute("value").value] = BigDecimal(n3.attribute("count").value)
+              end
+              prob[n2.attribute("value").value] = boom
             end
           end
+
+          if bad_format
+            legacy = true
+            prob.each do |k, v|
+              prior.keys.each do |k|
+                v[k] ||= 0.0
+              end
+            end
+          end
+
+          name = n.attribute("fieldName").value
+          conditional[name] = prob
+          features[name] = n.css("TargetValueStat").any? ? "numeric" : "categorical"
         end
-      end

-      @probabilities = {
-        prior: prior,
-        conditional: conditional
-      }
+        target = node.css("BayesOutput").attribute("fieldName").value
+
+        probabilities = {
+          prior: prior,
+          conditional: conditional
+        }
+
+        # get derived fields
+        derived = {}
+        data.css("DerivedField").each do |n|
+          name = n.attribute("name").value
+          field = n.css("NormDiscrete").attribute("field").value
+          value = n.css("NormDiscrete").attribute("value").value
+          features.delete(name)
+          features[field] = "derived"
+          derived[field] ||= {}
+          derived[field][name] = value
+        end
+
+        Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
+      end
     end

+    private
+
     # TODO better summary
-    def summary(extended: false)
+    def _summary(extended: false)
       str = String.new("")
       probabilities[:prior].each do |k, v|
         str += "#{k}: #{v}\n"
       end
-      str += "\n"
-      str += "accuracy: %d%%\n" % [(100 * accuracy).round]
       str
     end

-    def accuracy
-      self.class.metrics(predict(@x), @y)[:accuracy]
-    end
+    def _train(smoothing: 1, **options)
+      raise "Target must be strings" if @target_type != "categorical"
+      check_missing_value(@train_set)
+      check_missing_value(@validation_set) if @validation_set

-    # pmml
+      data = @train_set

-    def self.load_pmml(data)
-      # TODO more validation
-      node = data.css("NaiveBayesModel")
+      prep_text_features(data)

+      # convert boolean to strings
+      data.label = data.label.map(&:to_s)
+
+      indexes = {}
+      data.label.each_with_index do |yi, i|
+        (indexes[yi] ||= []) << i
+      end
+
+      grouped = {}
+      indexes.each do |k, v|
+        grouped[k] = data[v]
+      end
+
       prior = {}
-      node.css("BayesOutput TargetValueCount").each do |n|
-        prior[n.attribute("value").value] = n.attribute("count").value.to_f
+      grouped.sort_by { |k, _| k }.each do |k, v|
+        prior[k] = v.size
       end
+      labels = prior.keys

+      target_counts = {}
+      labels.each do |k|
+        target_counts[k] = 0
+      end
+
       conditional = {}
-      node.css("BayesInput").each do |n|
+
+      @features.each do |k, type|
         prob = {}
-        n.css("TargetValueStat").each do |n2|
-          n3 = n2.css("GaussianDistribution")
-          prob[n2.attribute("value").value] = {
-            mean: n3.attribute("mean").value.to_f,
-            stdev: Math.sqrt(n3.attribute("variance").value.to_f)
-          }
-        end
-        n.css("PairCounts").each do |n2|
-          boom = {}
-          n2.css("TargetValueCount").each do |n3|
-            boom[n3.attribute("value").value] = n3.attribute("count").value.to_f
+
+        case type
+        when "text"
+          raise "Text features not supported yet for naive Bayes"
+        when "categorical"
+          groups = Hash.new { |hash, key| hash[key] = [] }
+          data.columns[k].each_with_index do |v, i|
+            groups[v] << i
           end
-          prob[n2.attribute("value").value] = boom
+
+          groups.each do |group, indexes|
+            df = data[indexes]
+            prob[group] = group_count(df.label, target_counts.dup)
+          end
+
+          # smooth
+          if smoothing
+            labels.each do |label|
+              sum = prob.map { |k2, v2| v2[label] }.sum.to_f
+              prob.each do |k2, v|
+                v[label] = (v[label] + smoothing) * sum / (sum + (prob.size * smoothing))
+              end
+            end
+          end
+        else
+          labels.each do |group|
+            xs = grouped[group]
+
+            # TODO handle this case
+            next unless xs
+
+            values = xs.columns[k]
+            prob[group] = {mean: mean(values), stdev: stdev(values)}
+          end
         end
-        conditional[n.attribute("fieldName").value] = prob
+
+        conditional[k] = prob
       end

-      @target = node.css("BayesOutput").attribute("fieldName").value
-
-      probabilities = {
+      @probabilities = {
         prior: prior,
         conditional: conditional
       }

-      new(probabilities: probabilities, target: @target)
+      Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
     end

-    def to_pmml
+    def generate_pmml
       data_fields = {}
       data_fields[@target] = probabilities[:prior].keys
       probabilities[:conditional].each do |k, v|
-        if !v.values[0][:mean]
+        if @features[k] == "categorical"
           data_fields[k] = v.keys
         else
           data_fields[k] = nil
         end
       end

-      builder = Nokogiri::XML::Builder.new do |xml|
-        xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
-          xml.Header
-          xml.DataDictionary do
-            data_fields.each do |k, vs|
-              if vs
-                xml.DataField(name: k, optype: "categorical", dataType: "string") do
-                  vs.each do |v|
-                    xml.Value(value: v)
-                  end
-                end
-              else
-                xml.DataField(name: k, optype: "continuous", dataType: "double")
-              end
+      build_pmml(data_fields) do |xml|
+        xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
+          xml.MiningSchema do
+            data_fields.each do |k, _|
+              xml.MiningField(name: k)
             end
           end
-          xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
-            xml.MiningSchema do
-              data_fields.each do |k, _|
-                xml.MiningField(name: k)
-              end
-            end
-            xml.BayesInputs do
-              probabilities[:conditional].each do |k, v|
-                xml.BayesInput(fieldName: k) do
-                  if !v.values[0][:mean]
-                    v.each do |k2, v2|
-                      xml.PairCounts(value: k2) do
-                        xml.TargetValueCounts do
-                          v2.each do |k3, v3|
-                            xml.TargetValueCount(value: k3, count: v3)
-                          end
+          xml.BayesInputs do
+            probabilities[:conditional].each do |k, v|
+              xml.BayesInput(fieldName: k) do
+                if @features[k] == "categorical"
+                  v.sort_by { |k2, _| k2 }.each do |k2, v2|
+                    xml.PairCounts(value: k2) do
+                      xml.TargetValueCounts do
+                        v2.sort_by { |k2, _| k2 }.each do |k3, v3|
+                          xml.TargetValueCount(value: k3, count: v3)
                         end
                       end
                     end
-                  else
-                    xml.TargetValueStats do
-                      v.each do |k2, v2|
-                        xml.TargetValueStat(value: k2) do
-                          xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
-                        end
+                  end
+                else
+                  xml.TargetValueStats do
+                    v.sort_by { |k2, _| k2 }.each do |k2, v2|
+                      xml.TargetValueStat(value: k2) do
+                        xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
                       end
                     end
                   end
                 end
               end
             end
-            xml.BayesOutput(fieldName: "target") do
-              xml.TargetValueCounts do
-                probabilities[:prior].each do |k, v|
-                  xml.TargetValueCount(value: k, count: v)
-                end
+          end
+          xml.BayesOutput(fieldName: "target") do
+            xml.TargetValueCounts do
+              probabilities[:prior].sort_by { |k, _| k }.each do |k, v|
+                xml.TargetValueCount(value: k, count: v)
               end
             end
           end
         end
-      end.to_xml
-    end
-
-    # metrics
-
-    def self.metrics(actual, estimated)
-      {
-        accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
-      }
-    end
-
-    private
-
-    def _predict(x)
-      x.map do |xi|
-        probs = calculate_class_probabilities(stringify_keys(xi))
-        # deterministic for equal probabilities
-        probs.sort_by { |k, v| [-v, k.to_s] }[0][0]
       end
     end

-    def calculate_class_probabilities(x)
-      prob = {}
-      probabilities[:prior].each do |c, cv|
-        prob[c] = cv.to_f / probabilities[:prior].values.sum
-        probabilities[:conditional].each do |k, v|
-          if !v[c][:mean]
-            # TODO compute ahead of time
-            p2 = v[c][x[k]].to_f / v[c].values.sum
-
-            # assign very small probability if probability is 0
-            # TODO use proper smoothing instead
-            if p2 == 0
-              p2 = 0.0001
-            end
-
-            prob[c] *= p2
-          else
-            prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
-          end
-        end
-      end
-      prob
+    def group_count(arr, start)
+      arr.inject(start) { |h, e| h[e] += 1; h }
     end

-    def calculate_probability(x, mean, stdev)
-      exponent = Math.exp(-((x - mean)**2) / (2 * (stdev**2)))
-      (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
-    end
-
-    def group_count(arr)
-      r = arr.inject(Hash.new(0)) { |h, e| h[e.to_s] += 1 ; h }
-      r.default = nil
-      r
-    end
-
     def mean(arr)
       arr.sum / arr.size.to_f
     end

     def stdev(arr)
+      return nil if arr.size <= 1
       m = mean(arr)
       sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
       Math.sqrt(sum / (arr.length - 1).to_f)
-    end
-
-    def stringify_keys(h)
-      o = {}
-      h.each do |k, v|
-        o[k.to_s] = v
-      end
-      o
     end
   end
 end
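The most consequential change above is in training: the old code patched zero conditional probabilities at predict time with a hard-coded 0.0001 (see the removed calculate_class_probabilities), while the new _train applies additive smoothing to the categorical counts up front. A minimal standalone Ruby sketch of that update rule, using toy counts for a hypothetical color feature (not from this file) and the new default of smoothing = 1:

# prob maps feature value => {label => count}, as _train builds it
prob = {
  "red"  => {"yes" => 3.0, "no" => 0.0},
  "blue" => {"yes" => 1.0, "no" => 2.0}
}
smoothing = 1

["yes", "no"].each do |label|
  sum = prob.map { |_, v2| v2[label] }.sum.to_f
  prob.each do |_, v|
    # add smoothing to every count, then rescale so the label's
    # total count (sum) is preserved
    v[label] = (v[label] + smoothing) * sum / (sum + (prob.size * smoothing))
  end
end

p prob
# => {"red"=>{"yes"=>2.67, "no"=>0.5}, "blue"=>{"yes"=>1.33, "no"=>1.5}} (rounded)
# "red"/"no" is no longer zero, so a single unseen value/label pairing
# can't zero out an entire class probability at predict time

Because the rescaling factor sum / (sum + prob.size * smoothing) keeps each label's total count unchanged, the priors derived from these counts are unaffected; only the per-value conditional probabilities shift toward the uniform distribution.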