lib/anomaly/detector.rb in anomaly-0.0.3 vs lib/anomaly/detector.rb in anomaly-0.1.0
- old
+ new
@@ -1,60 +1,122 @@
module Anomaly
class Detector
# The anomaly threshold. Set during training (either from opts[:eps] or
# by the F1 grid search); exposed so callers can inspect or tune it.
attr_accessor :eps

# Builds a detector. When examples are given, trains on them right away;
# otherwise the detector starts untrained (train it later via #train).
def initialize(examples = nil, opts = {})
  @m = 0
  train(examples, opts) if examples
end
# Trains the detector on labeled examples. Each example is an Array whose
# last element is the label: 0 for non-anomaly, anything else for anomaly.
# Only non-anomalies are used to fit the per-feature Gaussian parameters.
#
# examples - Array of Arrays (at least one feature column plus the label).
# opts     - :eps => fixed anomaly threshold; when absent or 0, a 20%
#            hold-out split (plus all anomalies) is used to pick the eps
#            with the best F1 score.
#
# Raises RuntimeError on empty input, fewer than two columns, or when no
# non-anomalous examples are available to train with.
def train(examples, opts = {})
  raise "No examples" if examples.empty?
  raise "Must have at least two columns" if examples.first.size < 2

  # Divide into groups since we only want to train with non-anomalies.
  # (Idiomatic Enumerable#partition instead of a manual accumulation loop.)
  anomalies, non_anomalies = examples.partition { |example| example.last != 0 }

  raise "Must have at least one non-anomaly" if non_anomalies.empty?

  @eps = (opts[:eps] || 0).to_f
  if @eps > 0
    # Use all non-anomalies to train.
    training_examples = non_anomalies
  else
    # Hold out a slice of the non-anomalies, and evaluate candidate
    # thresholds against that slice plus every anomaly.
    training_examples, test_examples = partition!(non_anomalies)
    test_examples.concat(anomalies)
  end
  # Remove last column (the label) before computing feature statistics.
  training_examples = training_examples.map { |e| e[0..-2] }
  @m = training_examples.size
  @n = training_examples.first.size

  if defined?(NMatrix)
    training_examples = NMatrix.to_na(training_examples)
    # Convert these to an Array for Marshal.dump
    @mean = training_examples.mean(1).to_a
    @std = training_examples.stddev(1).to_a
  else
    # Default to Array, since built-in Matrix does not give us a big performance advantage.
    cols = @n.times.map { |i| training_examples.map { |r| r[i] } }
    @mean = cols.map { |c| mean(c) }
    @std = cols.each_with_index.map { |c, i| std(c, @mean[i]) }
  end
  # Guard against zero/NaN std-devs, which would divide by zero in
  # normal_pdf. Block param renamed so it no longer shadows the std()
  # helper method; `||` instead of low-precedence `or`.
  @std.map! { |sd| (sd == 0 || sd.nan?) ? Float::MIN : sd }

  if @eps == 0
    # Find the best eps: grid-search 1e-9..0.9 and keep the threshold with
    # the highest F1 on the held-out examples. (Drops the previously
    # unused best_f1 local.)
    epss = (1..9).flat_map { |i| [1, 3, 5, 7, 9].map { |j| (j * 10**(-i)).to_f } }
    @eps = epss.max_by { |eps| compute_f1_score(test_examples, eps) }
  end
end
# True once #train has fit at least one training example (@m is the
# training-set size, initialized to 0 in the constructor).
def trained?
  @m > 0
end
# Estimated probability of feature vector x under the fitted per-feature
# Gaussians. Each feature's density is capped at 1 (and NaN mapped to 1)
# to keep probabilities at the same scale, then all are multiplied.
#
# Raises RuntimeError if untrained, ArgumentError on wrong vector length.
def probability(x)
  raise "Train me first" unless trained?
  raise ArgumentError, "First argument must have #{@n} elements" if x.size != @n
  product = 1
  @n.times do |i|
    p = normal_pdf(x[i], @mean[i], @std[i])
    product *= ((p.nan? || p > 1) ? 1 : p)
  end
  product
end
# True when x's estimated probability falls below the threshold.
# eps defaults to the value chosen (or supplied) during training.
def anomaly?(x, eps = @eps)
  prob = probability(x)
  prob < eps
end
protected
# Cached sqrt(2*pi), the normalization constant of the Gaussian density.
SQRT2PI = Math.sqrt(2*Math::PI)

# Gaussian probability density at x for the given mean and standard
# deviation (defaults to the standard normal).
def normal_pdf(x, mean = 0, std = 1)
  coefficient = 1 / (SQRT2PI * std)
  coefficient * Math.exp(-((x - mean)**2 / (2.0 * (std**2))))
end
+
# Shuffles examples in place, then splits them for the eps search:
# roughly (1 - p_last) of them for training and p_last held out.
# Returns [training_examples, held_out_examples].
def partition!(examples, p_last = 0.2)
  examples.shuffle!
  holdout = (examples.size * p_last).floor
  [examples.drop(holdout), examples.take(holdout)]
end
+
# Tallies true positives, false positives, and false negatives produced
# by threshold eps over labeled examples (last element is the label;
# non-zero means anomaly), then returns the resulting F1 score.
def compute_f1_score(examples, eps)
  tp = fp = fn = 0
  examples.each do |example|
    actual = example.last != 0
    predicted = anomaly?(example[0..-2], eps)
    if predicted
      actual ? tp += 1 : fp += 1
    elsif actual
      # predicted negative but actually an anomaly
      fn += 1
    end
  end
  f1_score(tp, fp, fn)
end
+
# F1 = 2PR/(P+R) from raw true-positive / false-positive / false-negative
# counts. Returns 0.0 when the score is undefined (all counts zero, or
# no predicted/actual positives), instead of NaN.
def f1_score(tp, fp, fn)
  precision = tp.to_f / (tp + fp)
  recall = tp.to_f / (tp + fn)
  f1 = (2.0 * precision * recall) / (precision + recall)
  f1.nan? ? 0.0 : f1
end
# Not used for NArray
def mean(x)