lib/eps/naive_bayes.rb in eps-0.2.1 vs lib/eps/naive_bayes.rb in eps-0.3.0
- old
+ new
@@ -1,246 +1,257 @@
module Eps
class NaiveBayes < BaseEstimator
attr_reader :probabilities
- def initialize(probabilities: nil, target: nil)
- @probabilities = probabilities
- @target = target
# Accuracy of the model on its own training data: compares the labels of
# @train_set with predict(@train_set) via Eps::Metrics.accuracy.
+ def accuracy
+ Eps::Metrics.accuracy(@train_set.label, predict(@train_set))
end
- def train(*args)
- super
+ # pmml
- @y = @y.map { |yi| yi.to_s }
# Builds an Evaluators::NaiveBayes from a PMML document. Class priors are read
# from the BayesOutput TargetValueCount nodes, Gaussian mean/variance from
# TargetValueStat nodes, and categorical counts from PairCounts nodes.
# NOTE(review): the block parameter `data` shadows the method parameter; what
# `super` yields is not visible here - presumably the parsed XML document.
+ def self.load_pmml(data)
+ super do |data|
+ # TODO more validation
+ node = data.css("NaiveBayesModel")
- prior = group_count(@y)
- conditional = {}
-
- if @x.any?
- keys = @x.first.keys
- x = @x.dup
- x.each_with_index do |xi, i|
- xi[@target] = @y[i]
# prior: target label => count (parsed as Float)
+ prior = {}
+ node.css("BayesOutput TargetValueCount").each do |n|
+ prior[n.attribute("value").value] = n.attribute("count").value.to_f
end
- keys.each do |k|
- conditional[k.to_s] = {}
- x.group_by { |xi| xi[@target] }.each do |group, xs|
- v = xs.map { |xi| xi[k] }
- if categorical?(v[0])
- # TODO apply smoothing
- # apply smoothing only to
- # 1. categorical features
- # 2. conditional probabilities
- # TODO more efficient count
- conditional[k.to_s][group] = group_count(v)
# flipped to true below as soon as one input uses the pre-0.3 PairCounts layout
+ legacy = false
+
+ conditional = {}
+ features = {}
+ node.css("BayesInput").each do |n|
+ prob = {}
+
+ # numeric
+ n.css("TargetValueStat").each do |n2|
+ n3 = n2.css("GaussianDistribution")
+ prob[n2.attribute("value").value] = {
+ mean: n3.attribute("mean").value.to_f,
+ stdev: Math.sqrt(n3.attribute("variance").value.to_f)
+ }
+ end
+
+ # detect bad form in Eps < 0.3
# (the old writer keyed PairCounts by target label, so the PairCounts values
# equal prior.keys, in the same order)
+ bad_format = n.css("PairCounts").map { |n2| n2.attribute("value").value } == prior.keys
+
+ # categorical
+ n.css("PairCounts").each do |n2|
+ if bad_format
# legacy layout: transpose so prob is keyed by feature value, then target label
+ n2.css("TargetValueCount").each do |n3|
+ prob[n3.attribute("value").value] ||= {}
+ prob[n3.attribute("value").value][n2.attribute("value").value] = BigDecimal(n3.attribute("count").value)
+ end
else
- conditional[k.to_s][group] = {mean: mean(v), stdev: stdev(v)}
# current layout: PairCounts value is the feature value, children are per-label counts
+ boom = {}
+ n2.css("TargetValueCount").each do |n3|
+ boom[n3.attribute("value").value] = BigDecimal(n3.attribute("count").value)
+ end
+ prob[n2.attribute("value").value] = boom
end
end
+
# For legacy files, give every feature value an entry for every target label.
# NOTE(review): the inner block parameter `k` shadows the outer `k`, and the
# fill value is a Float 0.0 while parsed counts are BigDecimal.
+ if bad_format
+ legacy = true
+ prob.each do |k, v|
+ prior.keys.each do |k|
+ v[k] ||= 0.0
+ end
+ end
+ end
+
+ name = n.attribute("fieldName").value
+ conditional[name] = prob
# an input with any TargetValueStat child is numeric, otherwise categorical
+ features[name] = n.css("TargetValueStat").any? ? "numeric" : "categorical"
end
- end
- @probabilities = {
- prior: prior,
- conditional: conditional
- }
# NOTE(review): `target` is assigned here but not referenced again in this method.
+ target = node.css("BayesOutput").attribute("fieldName").value
+
+ probabilities = {
+ prior: prior,
+ conditional: conditional
+ }
+
+ # get derived fields
# Each DerivedField's NormDiscrete names a source field and a value: the derived
# name is removed from features, the source field is marked "derived", and
# derived[field][name] records the value.
+ derived = {}
+ data.css("DerivedField").each do |n|
+ name = n.attribute("name").value
+ field = n.css("NormDiscrete").attribute("field").value
+ value = n.css("NormDiscrete").attribute("value").value
+ features.delete(name)
+ features[field] = "derived"
+ derived[field] ||= {}
+ derived[field][name] = value
+ end
+
+ Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
+ end
end
+ private
+
# TODO better summary
# Returns a plain-text summary with one "label: count" line per entry of
# probabilities[:prior]. `extended` is accepted but not used in this body.
- def summary(extended: false)
+ def _summary(extended: false)
str = String.new("")
probabilities[:prior].each do |k, v|
str += "#{k}: #{v}\n"
end
- str += "\n"
- str += "accuracy: %d%%\n" % [(100 * accuracy).round]
str
end
- def accuracy
- self.class.metrics(predict(@x), @y)[:accuracy]
- end
# Fits the model on @train_set, stores the result in @probabilities and returns
# an Evaluators::NaiveBayes built from it.
# smoothing: amount added to each categorical count before rescaling; a falsy
#   value skips smoothing.
# **options is accepted but not referenced in this body.
# Raises if @target_type is not "categorical" or a feature has type "text".
+ def _train(smoothing: 1, **options)
+ raise "Target must be strings" if @target_type != "categorical"
+ check_missing_value(@train_set)
+ check_missing_value(@validation_set) if @validation_set
- # pmml
+ data = @train_set
- def self.load_pmml(data)
- # TODO more validation
- node = data.css("NaiveBayesModel")
+ prep_text_features(data)
+ # convert boolean to strings
+ data.label = data.label.map(&:to_s)
+
# label => list of row indexes carrying that label
+ indexes = {}
+ data.label.each_with_index do |yi, i|
+ (indexes[yi] ||= []) << i
+ end
+
# label => subset of the data for that label (presumably data[v] selects rows
# by index - confirm against the data set class)
+ grouped = {}
+ indexes.each do |k, v|
+ grouped[k] = data[v]
+ end
+
# priors are the per-label sizes, inserted in sorted label order
prior = {}
- node.css("BayesOutput TargetValueCount").each do |n|
- prior[n.attribute("value").value] = n.attribute("count").value.to_f
+ grouped.sort_by { |k, _| k }.each do |k, v|
+ prior[k] = v.size
end
+ labels = prior.keys
# zeroed per-label template; a dup is handed to group_count for each feature
# value so that every label has a count, even when it is 0
+ target_counts = {}
+ labels.each do |k|
+ target_counts[k] = 0
+ end
+
conditional = {}
- node.css("BayesInput").each do |n|
+
+ @features.each do |k, type|
prob = {}
- n.css("TargetValueStat").each do |n2|
- n3 = n2.css("GaussianDistribution")
- prob[n2.attribute("value").value] = {
- mean: n3.attribute("mean").value.to_f,
- stdev: Math.sqrt(n3.attribute("variance").value.to_f)
- }
- end
- n.css("PairCounts").each do |n2|
- boom = {}
- n2.css("TargetValueCount").each do |n3|
- boom[n3.attribute("value").value] = n3.attribute("count").value.to_f
+
+ case type
+ when "text"
+ raise "Text features not supported yet for naive Bayes"
+ when "categorical"
# feature value => list of row indexes having that value
+ groups = Hash.new { |hash, key| hash[key] = [] }
+ data.columns[k].each_with_index do |v, i|
+ groups[v] << i
end
- prob[n2.attribute("value").value] = boom
+
# prob[feature value] = label => count among rows with that value
+ groups.each do |group, indexes|
+ df = data[indexes]
+ prob[group] = group_count(df.label, target_counts.dup)
+ end
+
+ # smooth
# Adds `smoothing` to each count, then rescales by sum / (sum + prob.size * smoothing)
# so that each label's total across this feature's values stays equal to `sum`.
+ if smoothing
+ labels.each do |label|
+ sum = prob.map { |k2, v2| v2[label] }.sum.to_f
+ prob.each do |k2, v|
+ v[label] = (v[label] + smoothing) * sum / (sum + (prob.size * smoothing))
+ end
+ end
+ end
+ else
# every other feature type: per-label mean and standard deviation of the column
+ labels.each do |group|
+ xs = grouped[group]
+
+ # TODO handle this case
+ next unless xs
+
+ values = xs.columns[k]
+ prob[group] = {mean: mean(values), stdev: stdev(values)}
+ end
end
- conditional[n.attribute("fieldName").value] = prob
+
+ conditional[k] = prob
end
- @target = node.css("BayesOutput").attribute("fieldName").value
-
- probabilities = {
# stored on the instance; `probabilities` on the last line is the attr_reader
+ @probabilities = {
prior: prior,
conditional: conditional
}
- new(probabilities: probabilities, target: @target)
+ Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
end
# Emits the trained model as a PMML NaiveBayesModel inside the document produced
# by build_pmml. Categorical inputs are written as PairCounts, all other inputs
# as per-label GaussianDistribution stats; entries are written in sorted key order.
- def to_pmml
+ def generate_pmml
# field name => list of category values, or nil for non-categorical fields
data_fields = {}
data_fields[@target] = probabilities[:prior].keys
probabilities[:conditional].each do |k, v|
- if !v.values[0][:mean]
+ if @features[k] == "categorical"
data_fields[k] = v.keys
else
data_fields[k] = nil
end
end
- builder = Nokogiri::XML::Builder.new do |xml|
- xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
- xml.Header
- xml.DataDictionary do
- data_fields.each do |k, vs|
- if vs
- xml.DataField(name: k, optype: "categorical", dataType: "string") do
- vs.each do |v|
- xml.Value(value: v)
- end
- end
- else
- xml.DataField(name: k, optype: "continuous", dataType: "double")
- end
+ build_pmml(data_fields) do |xml|
+ xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
+ xml.MiningSchema do
+ data_fields.each do |k, _|
+ xml.MiningField(name: k)
end
end
- xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
- xml.MiningSchema do
- data_fields.each do |k, _|
- xml.MiningField(name: k)
- end
- end
- xml.BayesInputs do
- probabilities[:conditional].each do |k, v|
- xml.BayesInput(fieldName: k) do
- if !v.values[0][:mean]
- v.each do |k2, v2|
- xml.PairCounts(value: k2) do
- xml.TargetValueCounts do
- v2.each do |k3, v3|
- xml.TargetValueCount(value: k3, count: v3)
- end
+ xml.BayesInputs do
+ probabilities[:conditional].each do |k, v|
+ xml.BayesInput(fieldName: k) do
+ if @features[k] == "categorical"
# one PairCounts per feature value, holding one TargetValueCount per label
+ v.sort_by { |k2, _| k2 }.each do |k2, v2|
+ xml.PairCounts(value: k2) do
+ xml.TargetValueCounts do
+ v2.sort_by { |k2, _| k2 }.each do |k3, v3|
+ xml.TargetValueCount(value: k3, count: v3)
end
end
end
- else
- xml.TargetValueStats do
- v.each do |k2, v2|
- xml.TargetValueStat(value: k2) do
- xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
- end
+ end
+ else
# NOTE(review): stdev returns nil for a group with fewer than two values;
# v2[:stdev]**2 would then raise NoMethodError - confirm this cannot occur.
+ xml.TargetValueStats do
+ v.sort_by { |k2, _| k2 }.each do |k2, v2|
+ xml.TargetValueStat(value: k2) do
+ xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
end
end
end
end
end
end
- xml.BayesOutput(fieldName: "target") do
- xml.TargetValueCounts do
- probabilities[:prior].each do |k, v|
- xml.TargetValueCount(value: k, count: v)
- end
+ end
# NOTE(review): fieldName is the literal "target", while data_fields above is
# keyed by @target - confirm this is intended.
+ xml.BayesOutput(fieldName: "target") do
+ xml.TargetValueCounts do
+ probabilities[:prior].sort_by { |k, _| k }.each do |k, v|
+ xml.TargetValueCount(value: k, count: v)
end
end
end
end
- end.to_xml
- end
-
- # metrics
-
- def self.metrics(actual, estimated)
- {
- accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
- }
- end
-
- private
-
- def _predict(x)
- x.map do |xi|
- probs = calculate_class_probabilities(stringify_keys(xi))
- # deterministic for equal probabilities
- probs.sort_by { |k, v| [-v, k.to_s] }[0][0]
end
end
- def calculate_class_probabilities(x)
- prob = {}
- probabilities[:prior].each do |c, cv|
- prob[c] = cv.to_f / probabilities[:prior].values.sum
- probabilities[:conditional].each do |k, v|
- if !v[c][:mean]
- # TODO compute ahead of time
- p2 = v[c][x[k]].to_f / v[c].values.sum
-
- # assign very small probability if probability is 0
- # TODO use proper smoothing instead
- if p2 == 0
- p2 = 0.0001
- end
-
- prob[c] *= p2
- else
- prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
- end
- end
- end
- prob
# Tallies the elements of arr into the hash `start` and returns that same hash,
# which is mutated in place. Because the fold does h[e] += 1, every element
# must already be a key of `start` (or `start` must have a numeric default).
+ def group_count(arr, start)
+ arr.inject(start) { |h, e| h[e] += 1; h }
end
- def calculate_probability(x, mean, stdev)
- exponent = Math.exp(-((x - mean)**2) / (2 * (stdev**2)))
- (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
- end
-
- def group_count(arr)
- r = arr.inject(Hash.new(0)) { |h, e| h[e.to_s] += 1 ; h }
- r.default = nil
- r
- end
-
# Arithmetic mean of the values in arr: the sum divided by the element count
# taken as a Float.
def mean(arr)
  total = arr.sum
  count = arr.size.to_f
  total / count
end
# Sample standard deviation of arr: the square root of the summed squared
# deviations from the mean divided by (arr.length - 1).
# Returns nil when arr has fewer than two elements.
def stdev(arr)
+ return nil if arr.size <= 1
m = mean(arr)
sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
Math.sqrt(sum / (arr.length - 1).to_f)
- end
-
- def stringify_keys(h)
- o = {}
- h.each do |k, v|
- o[k.to_s] = v
- end
- o
end
end
end