lib/eps/naive_bayes.rb in eps-0.2.1 vs lib/eps/naive_bayes.rb in eps-0.3.0

- old
+ new

@@ -1,246 +1,257 @@
 module Eps
   class NaiveBayes < BaseEstimator
     attr_reader :probabilities

-    def initialize(probabilities: nil, target: nil)
-      @probabilities = probabilities
-      @target = target
+    def accuracy
+      Eps::Metrics.accuracy(@train_set.label, predict(@train_set))
     end

-    def train(*args)
-      super
+    # pmml

-      @y = @y.map { |yi| yi.to_s }
+    def self.load_pmml(data)
+      super do |data|
+        # TODO more validation
+        node = data.css("NaiveBayesModel")

-      prior = group_count(@y)
-      conditional = {}
-
-      if @x.any?
-        keys = @x.first.keys
-        x = @x.dup
-        x.each_with_index do |xi, i|
-          xi[@target] = @y[i]
+        prior = {}
+        node.css("BayesOutput TargetValueCount").each do |n|
+          prior[n.attribute("value").value] = n.attribute("count").value.to_f
         end

-        keys.each do |k|
-          conditional[k.to_s] = {}
-          x.group_by { |xi| xi[@target] }.each do |group, xs|
-            v = xs.map { |xi| xi[k] }
-            if categorical?(v[0])
-              # TODO apply smoothing
-              # apply smoothing only to
-              # 1. categorical features
-              # 2. conditional probabilities
-              # TODO more efficient count
-              conditional[k.to_s][group] = group_count(v)
+        legacy = false
+
+        conditional = {}
+        features = {}
+        node.css("BayesInput").each do |n|
+          prob = {}
+
+          # numeric
+          n.css("TargetValueStat").each do |n2|
+            n3 = n2.css("GaussianDistribution")
+            prob[n2.attribute("value").value] = {
+              mean: n3.attribute("mean").value.to_f,
+              stdev: Math.sqrt(n3.attribute("variance").value.to_f)
+            }
+          end
+
+          # detect bad format in Eps < 0.3
+          bad_format = n.css("PairCounts").map { |n2| n2.attribute("value").value } == prior.keys
+
+          # categorical
+          n.css("PairCounts").each do |n2|
+            if bad_format
+              n2.css("TargetValueCount").each do |n3|
+                prob[n3.attribute("value").value] ||= {}
+                prob[n3.attribute("value").value][n2.attribute("value").value] = BigDecimal(n3.attribute("count").value)
+              end
             else
-              conditional[k.to_s][group] = {mean: mean(v), stdev: stdev(v)}
+              boom = {}
+              n2.css("TargetValueCount").each do |n3|
+                boom[n3.attribute("value").value] = BigDecimal(n3.attribute("count").value)
+              end
+              prob[n2.attribute("value").value] = boom
             end
           end
+
+          if bad_format
+            legacy = true
+            prob.each do |k, v|
+              prior.keys.each do |k|
+                v[k] ||= 0.0
+              end
+            end
+          end
+
+          name = n.attribute("fieldName").value
+          conditional[name] = prob
+          features[name] = n.css("TargetValueStat").any? ? "numeric" : "categorical"
         end
-      end

-      @probabilities = {
-        prior: prior,
-        conditional: conditional
-      }
+        target = node.css("BayesOutput").attribute("fieldName").value
+
+        probabilities = {
+          prior: prior,
+          conditional: conditional
+        }
+
+        # get derived fields
+        derived = {}
+        data.css("DerivedField").each do |n|
+          name = n.attribute("name").value
+          field = n.css("NormDiscrete").attribute("field").value
+          value = n.css("NormDiscrete").attribute("value").value
+          features.delete(name)
+          features[field] = "derived"
+          derived[field] ||= {}
+          derived[field][name] = value
+        end
+
+        Evaluators::NaiveBayes.new(probabilities: probabilities, features: features, derived: derived, legacy: legacy)
+      end
     end

+    private
+
     # TODO better summary
-    def summary(extended: false)
+    def _summary(extended: false)
       str = String.new("")
       probabilities[:prior].each do |k, v|
         str += "#{k}: #{v}\n"
       end
-      str += "\n"
-      str += "accuracy: %d%%\n" % [(100 * accuracy).round]
       str
     end

-    def accuracy
-      self.class.metrics(predict(@x), @y)[:accuracy]
-    end
+    def _train(smoothing: 1, **options)
+      raise "Target must be strings" if @target_type != "categorical"
+      check_missing_value(@train_set)
+      check_missing_value(@validation_set) if @validation_set

-    # pmml
+      data = @train_set

-    def self.load_pmml(data)
-      # TODO more validation
-      node = data.css("NaiveBayesModel")
+      prep_text_features(data)

+      # convert boolean to strings
+      data.label = data.label.map(&:to_s)
+
+      indexes = {}
+      data.label.each_with_index do |yi, i|
+        (indexes[yi] ||= []) << i
+      end
+
+      grouped = {}
+      indexes.each do |k, v|
+        grouped[k] = data[v]
+      end
+
       prior = {}
-      node.css("BayesOutput TargetValueCount").each do |n|
-        prior[n.attribute("value").value] = n.attribute("count").value.to_f
+      grouped.sort_by { |k, _| k }.each do |k, v|
+        prior[k] = v.size
       end
+      labels = prior.keys

+      target_counts = {}
+      labels.each do |k|
+        target_counts[k] = 0
+      end
+
       conditional = {}
-      node.css("BayesInput").each do |n|
+
+      @features.each do |k, type|
         prob = {}
-        n.css("TargetValueStat").each do |n2|
-          n3 = n2.css("GaussianDistribution")
-          prob[n2.attribute("value").value] = {
-            mean: n3.attribute("mean").value.to_f,
-            stdev: Math.sqrt(n3.attribute("variance").value.to_f)
-          }
-        end
-        n.css("PairCounts").each do |n2|
-          boom = {}
-          n2.css("TargetValueCount").each do |n3|
-            boom[n3.attribute("value").value] = n3.attribute("count").value.to_f
+
+        case type
+        when "text"
+          raise "Text features not supported yet for naive Bayes"
+        when "categorical"
+          groups = Hash.new { |hash, key| hash[key] = [] }
+          data.columns[k].each_with_index do |v, i|
+            groups[v] << i
           end
-          prob[n2.attribute("value").value] = boom
+
+          groups.each do |group, indexes|
+            df = data[indexes]
+            prob[group] = group_count(df.label, target_counts.dup)
+          end
+
+          # smooth
+          if smoothing
+            labels.each do |label|
+              sum = prob.map { |k2, v2| v2[label] }.sum.to_f
+              prob.each do |k2, v|
+                v[label] = (v[label] + smoothing) * sum / (sum + (prob.size * smoothing))
+              end
+            end
+          end
+        else
+          labels.each do |group|
+            xs = grouped[group]
+
+            # TODO handle this case
+            next unless xs
+
+            values = xs.columns[k]
+            prob[group] = {mean: mean(values), stdev: stdev(values)}
+          end
         end
-        conditional[n.attribute("fieldName").value] = prob
+
+        conditional[k] = prob
       end

-      @target = node.css("BayesOutput").attribute("fieldName").value
-
-      probabilities = {
+      @probabilities = {
         prior: prior,
         conditional: conditional
       }

-      new(probabilities: probabilities, target: @target)
+      Evaluators::NaiveBayes.new(probabilities: probabilities, features: @features)
     end

-    def to_pmml
+    def generate_pmml
       data_fields = {}
       data_fields[@target] = probabilities[:prior].keys
       probabilities[:conditional].each do |k, v|
-        if !v.values[0][:mean]
+        if @features[k] == "categorical"
           data_fields[k] = v.keys
         else
           data_fields[k] = nil
         end
       end

-      builder = Nokogiri::XML::Builder.new do |xml|
-        xml.PMML(version: "4.3", xmlns: "http://www.dmg.org/PMML-4_3", "xmlns:xsi" => "http://www.w3.org/2001/XMLSchema-instance") do
-          xml.Header
-          xml.DataDictionary do
-            data_fields.each do |k, vs|
-              if vs
-                xml.DataField(name: k, optype: "categorical", dataType: "string") do
-                  vs.each do |v|
-                    xml.Value(value: v)
-                  end
-                end
-              else
-                xml.DataField(name: k, optype: "continuous", dataType: "double")
-              end
+      build_pmml(data_fields) do |xml|
+        xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
+          xml.MiningSchema do
+            data_fields.each do |k, _|
+              xml.MiningField(name: k)
             end
           end
-          xml.NaiveBayesModel(functionName: "classification", threshold: 0.001) do
-            xml.MiningSchema do
-              data_fields.each do |k, _|
-                xml.MiningField(name: k)
-              end
-            end
-            xml.BayesInputs do
-              probabilities[:conditional].each do |k, v|
-                xml.BayesInput(fieldName: k) do
-                  if !v.values[0][:mean]
-                    v.each do |k2, v2|
-                      xml.PairCounts(value: k2) do
-                        xml.TargetValueCounts do
-                          v2.each do |k3, v3|
-                            xml.TargetValueCount(value: k3, count: v3)
-                          end
+          xml.BayesInputs do
+            probabilities[:conditional].each do |k, v|
+              xml.BayesInput(fieldName: k) do
+                if @features[k] == "categorical"
+                  v.sort_by { |k2, _| k2 }.each do |k2, v2|
+                    xml.PairCounts(value: k2) do
+                      xml.TargetValueCounts do
+                        v2.sort_by { |k2, _| k2 }.each do |k3, v3|
+                          xml.TargetValueCount(value: k3, count: v3)
                         end
                       end
                     end
-                  else
-                    xml.TargetValueStats do
-                      v.each do |k2, v2|
-                        xml.TargetValueStat(value: k2) do
-                          xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
-                        end
+                  end
+                else
+                  xml.TargetValueStats do
+                    v.sort_by { |k2, _| k2 }.each do |k2, v2|
+                      xml.TargetValueStat(value: k2) do
+                        xml.GaussianDistribution(mean: v2[:mean], variance: v2[:stdev]**2)
                       end
                     end
                   end
                 end
               end
             end
-            xml.BayesOutput(fieldName: "target") do
-              xml.TargetValueCounts do
-                probabilities[:prior].each do |k, v|
-                  xml.TargetValueCount(value: k, count: v)
-                end
+          end
+          xml.BayesOutput(fieldName: "target") do
+            xml.TargetValueCounts do
+              probabilities[:prior].sort_by { |k, _| k }.each do |k, v|
+                xml.TargetValueCount(value: k, count: v)
               end
             end
           end
         end
-      end.to_xml
-    end
-
-    # metrics
-
-    def self.metrics(actual, estimated)
-      {
-        accuracy: actual.zip(estimated).count { |yi, yi2| yi == yi2 } / actual.size.to_f
-      }
-    end
-
-    private
-
-    def _predict(x)
-      x.map do |xi|
-        probs = calculate_class_probabilities(stringify_keys(xi))
-        # deterministic for equal probabilities
-        probs.sort_by { |k, v| [-v, k.to_s] }[0][0]
       end
     end

-    def calculate_class_probabilities(x)
-      prob = {}
-      probabilities[:prior].each do |c, cv|
-        prob[c] = cv.to_f / probabilities[:prior].values.sum
-        probabilities[:conditional].each do |k, v|
-          if !v[c][:mean]
-            # TODO compute ahead of time
-            p2 = v[c][x[k]].to_f / v[c].values.sum
-
-            # assign very small probability if probability is 0
-            # TODO use proper smoothing instead
-            if p2 == 0
-              p2 = 0.0001
-            end
-
-            prob[c] *= p2
-          else
-            prob[c] *= calculate_probability(x[k], v[c][:mean], v[c][:stdev])
-          end
-        end
-      end
-      prob
+    def group_count(arr, start)
+      arr.inject(start) { |h, e| h[e] += 1; h }
     end

-    def calculate_probability(x, mean, stdev)
-      exponent = Math.exp(-((x - mean)**2) / (2 * (stdev**2)))
-      (1 / (Math.sqrt(2 * Math::PI) * stdev)) * exponent
-    end
-
-    def group_count(arr)
-      r = arr.inject(Hash.new(0)) { |h, e| h[e.to_s] += 1 ; h }
-      r.default = nil
-      r
-    end
-
     def mean(arr)
       arr.sum / arr.size.to_f
     end

     def stdev(arr)
+      return nil if arr.size <= 1
       m = mean(arr)
       sum = arr.inject(0) { |accum, i| accum + (i - m)**2 }
       Math.sqrt(sum / (arr.length - 1).to_f)
-    end
-
-    def stringify_keys(h)
-      o = {}
-      h.each do |k, v|
-        o[k.to_s] = v
-      end
-      o
     end
   end
 end
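The most consequential change above is in training: the old code patched zero conditional probabilities at predict time with a hard-coded 0.0001 (see the removed calculate_class_probabilities), while the new _train applies additive smoothing to the categorical counts up front. A minimal standalone Ruby sketch of that update rule, using toy counts for a hypothetical color feature (not from this file) and the new default of smoothing = 1:

# prob maps feature value => {label => count}, as _train builds it
prob = {
  "red"  => {"yes" => 3.0, "no" => 0.0},
  "blue" => {"yes" => 1.0, "no" => 2.0}
}
smoothing = 1

["yes", "no"].each do |label|
  sum = prob.map { |_, v2| v2[label] }.sum.to_f
  prob.each do |_, v|
    # add smoothing to every count, then rescale so the label's
    # total count (sum) is preserved
    v[label] = (v[label] + smoothing) * sum / (sum + (prob.size * smoothing))
  end
end

p prob
# => {"red"=>{"yes"=>2.67, "no"=>0.5}, "blue"=>{"yes"=>1.33, "no"=>1.5}} (rounded)
# "red"/"no" is no longer zero, so a single unseen value/label pairing
# can't zero out an entire class probability at predict time

Because the rescaling factor sum / (sum + prob.size * smoothing) keeps each label's total count unchanged, the priors derived from these counts are unaffected; only the per-value conditional probabilities shift toward the uniform distribution.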