lib/naivebayes/classifier.rb in naivebayes-0.0.3 vs lib/naivebayes/classifier.rb in naivebayes-0.1.0

- old
+ new

#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

module NaiveBayes
  # Naive Bayes text classifier supporting three event models:
  #   * "multinomial" (default) -- word frequencies are accumulated
  #   * "bernoulli"             -- only word presence/absence is counted
  #   * "complement"            -- Complement Naive Bayes (CNB)
  class Classifier
    attr_accessor :frequency_table, :word_table, :instance_count_of,
                  :total_count, :model, :smoothing_parameter

    # Accepted spellings for the Bernoulli model. The historical
    # misspelling "berounoulli" shipped in 0.1.0 is kept for
    # backward compatibility; "bernoulli" now also works (bug fix).
    BERNOULLI_MODELS = %w[bernoulli berounoulli].freeze

    # params[:model]               - one of the model names above (nil => multinomial)
    # params[:smoothing_parameter] - Laplace alpha used by CNB (default 1)
    def initialize(params = {})
      @frequency_table     = Hash.new          # label => {word => count}
      @word_table          = Hash.new          # vocabulary: word => 1
      @instance_count_of   = Hash.new(0)       # label => number of training documents
      @total_count         = 0                 # total training documents seen
      @model               = params[:model]
      @smoothing_parameter = params[:smoothing_parameter] || 1
    end

    # Add one training document.
    # label   - the class label
    # feature - Hash of word => frequency
    def train(label, feature)
      # Per-label counts default to 0 so unseen words read as zero later.
      @frequency_table[label] ||= Hash.new(0)
      feature.each do |word, frequency|
        # Bernoulli counts presence only; multinomial/complement count frequency.
        if BERNOULLI_MODELS.include?(@model)
          @frequency_table[label][word] += 1
        else
          @frequency_table[label][word] += frequency
        end
        @word_table[word] = 1
      end
      @instance_count_of[label] += 1
      @total_count += 1
    end

    # Classify a feature Hash (word => frequency).
    # Returns a Hash of label => score: normalized posteriors for the
    # multinomial/Bernoulli models, unnormalized log-scores for CNB.
    def classify(feature)
      @model == "complement" ? cnb(feature) : mnb(feature)
    end

    private

    # Multinomial/Bernoulli posterior with Laplace smoothing,
    # normalized by the evidence so the returned scores sum to 1.
    def mnb(feature)
      class_prior_of     = Hash.new(1)
      likelihood_of      = Hash.new(1)
      class_posterior_of = Hash.new(1)
      evidence = 0
      @instance_count_of.each do |label, count|
        class_prior_of[label] = count.to_f / @total_count.to_f
      end
      @frequency_table.each_key do |label|
        likelihood_of[label] = 1
        @word_table.each_key do |word|
          # Laplace (add-one) smoothed per-word likelihood.
          laplace_word_likelihood =
            (@frequency_table[label][word] + 1).to_f /
            (@instance_count_of[label] + @word_table.size).to_f
          if feature.has_key?(word)
            likelihood_of[label] *= laplace_word_likelihood
          else
            likelihood_of[label] *= (1 - laplace_word_likelihood)
          end
        end
        class_posterior_of[label] = class_prior_of[label] * likelihood_of[label]
        evidence += class_posterior_of[label]
      end
      class_posterior_of.each do |label, posterior|
        class_posterior_of[label] = posterior / evidence
      end
      class_posterior_of
    end

    # Complement Naive Bayes (Rennie et al. 2003):
    #   score(c) = log(theta_c) - sum_i f_i * log((N_~c,i + alpha) / (N_~c + alpha * |feature|))
    # where N_~c counts word occurrences in every class except c.
    # Higher score means more likely. Entries are ordered by ascending score.
    def cnb(feature)
      all_labels     = @frequency_table.keys
      all_train_data = @instance_count_of.values.inject(0) { |sum, v| sum + v }
      scores = all_labels.map do |label|
        n_c   = total_number_of_word_in_other_class(label)
        alpha = @smoothing_parameter * feature.length
        complement_term = feature.inject(0) do |sum, (word, freq)|
          sum + freq * Math.log(
            (number_of_word_in_other_class(label, word) + @smoothing_parameter).to_f /
            (n_c + alpha)
          )
        end
        theta_c = @instance_count_of[label].to_f / all_train_data
        [label, Math.log(theta_c) - complement_term]
      end
      Hash[*scores.sort { |x, y| x[1] <=> y[1] }.flatten]
    end

    # Total number of word occurrences over every class except +target+.
    # (Summing each class's value hash directly is equivalent to iterating
    # the whole vocabulary, since missing words contribute 0.)
    def total_number_of_word_in_other_class(target)
      other_labels = @frequency_table.keys - [target]
      other_labels.inject(0) do |total, label|
        total + @frequency_table[label].values.inject(0) { |sum, v| sum + v }
      end
    end

    # Occurrences of +word+ over every class except +target+.
    def number_of_word_in_other_class(target, word)
      other_labels = @frequency_table.keys - [target]
      other_labels.inject(0) { |sum, label| sum + @frequency_table[label][word] }
    end
  end
end