lib/naivebayes/classifier.rb in naivebayes-0.0.3 vs lib/naivebayes/classifier.rb in naivebayes-0.1.0
- old
+ new
@@ -1,37 +1,44 @@
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
module NaiveBayes
class Classifier
- attr_accessor :frequency_table, :word_table, :instance_count_of, :total_count, :model
+ attr_accessor :frequency_table, :word_table, :instance_count_of, :total_count, :model, :smoothing_parameter
def initialize(params = {})
@frequency_table = Hash.new
@word_table = Hash.new
@instance_count_of = Hash.new(0)
@total_count = 0
@model = params[:model]
+ @smoothing_parameter = params[:smoothing_parameter] || 1
end
- def train(label, attributes)
+ def train(label, feature)
unless @frequency_table.has_key?(label)
@frequency_table[label] = Hash.new(0)
end
- attributes.each {|word, frequency|
- if @model == "multinomial"
- @frequency_table[label][word] += frequency
- else
+ feature.each {|word, frequency|
+ if @model == "berounoulli"
@frequency_table[label][word] += 1
+ else
+ @frequency_table[label][word] += frequency
end
@word_table[word] = 1
}
@instance_count_of[label] += 1
@total_count += 1
end
- def classify(attributes)
+ def classify(feature)
+ @model == "complement" ? cnb(feature) : mnb(feature)
+ end
+
+ private
+
+ def mnb(feature)
class_prior_of = Hash.new(1)
likelihood_of = Hash.new(1)
class_posterior_of = Hash.new(1)
evidence = 0
@instance_count_of.each {|label, freq|
@@ -40,11 +47,11 @@
@frequency_table.each_key {|label|
likelihood_of[label] = 1
@word_table.each_key {|word|
laplace_word_likelihood = (@frequency_table[label][word] + 1).to_f /
(@instance_count_of[label] + @word_table.size()).to_f
- if attributes.has_key?(word)
+ if feature.has_key?(word)
likelihood_of[label] *= laplace_word_likelihood
else
likelihood_of[label] *= (1 - laplace_word_likelihood)
end
}
@@ -53,8 +60,40 @@
}
class_posterior_of.each {|label, posterior|
class_posterior_of[label] = posterior / evidence
}
return class_posterior_of
+ end
+
+ def cnb(feature)
+ all_class = @frequency_table.keys
+ all_train_data = @instance_count_of.values.inject(0) {|s, v| s + v}
+ class_posterior_of = all_class.map {|c|
+ n_c = total_number_of_word_in_other_class(c)
+ alpha = @smoothing_parameter*feature.length
+ term2nd = feature.to_a.map {|e|
+ k = e[0]
+ v = e[1]
+ v*Math.log((number_of_word_in_other_class(c, k) + @smoothing_parameter).to_f/(n_c + alpha))
+ }.inject(0) {|s, v| s + v}
+ theta_c = @instance_count_of[c].to_f/all_train_data
+ [c, Math.log(theta_c) - term2nd]
+ }.sort {|x, y| x[1] <=> y[1]}.flatten
+ Hash[*class_posterior_of]
+ end
+
+ def total_number_of_word_in_other_class(c)
+ all_words = @frequency_table.values.map {|h| h.keys}.flatten.sort.uniq
+ other_classes = @frequency_table.keys - [c]
+ other_classes.map {|c|
+ all_words.map {|w|
+ @frequency_table[c][w]
+ }
+ }.flatten.inject(0) {|s, v| s + v}
+ end
+
+ def number_of_word_in_other_class(c, i)
+ other_classes = @frequency_table.keys - [c]
+ other_classes.map {|c| @frequency_table[c][i]}.inject(0) {|s, v| s + v}
end
end
end