#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

# Namespace for the naive Bayes classifier below.
module NaiveBayes
  # Trainable classifier over labelled bags of words.
  #
  # A "feature" is a Hash mapping each word to its frequency in one document.
  # The +:model+ option selects the behaviour:
  #
  # * <tt>"complement"</tt> -- #classify scores labels with #cnb.
  # * <tt>"bernoulli"</tt> (or the legacy spelling <tt>"berounoulli"</tt>) --
  #   #train counts each word once per training instance.
  # * anything else (including +nil+) -- #train accumulates the given
  #   frequencies.
  #
  # Every model other than <tt>"complement"</tt> is classified with #mnb.
  class Classifier
    # Model names that make #train count a word once per training instance.
    # "berounoulli" is the misspelling that earlier versions matched; it stays
    # accepted so existing callers keep working.
    BERNOULLI_MODELS = ["bernoulli", "berounoulli"].freeze

    attr_accessor :frequency_table, :word_table, :instance_count_of,
                  :total_count, :model, :smoothing_parameter

    # params - options Hash:
    #          :model               - model name (see the class description).
    #          :smoothing_parameter - additive smoothing used by the
    #                                 complement model; defaults to 1.
    def initialize(params = {})
      @frequency_table = {}            # label => Hash(word => count), default 0
      @word_table = {}                 # every word seen in training => 1
      @instance_count_of = Hash.new(0) # label => number of training instances
      @total_count = 0                 # number of training instances overall
      @model = params[:model]
      @smoothing_parameter = params[:smoothing_parameter] || 1
    end

    # Adds one training instance.
    #
    # label   - class label of the instance.
    # feature - enumerable of [word, frequency] pairs (typically a Hash).
    #
    # Returns the updated total number of training instances.
    def train(label, feature)
      @frequency_table[label] = Hash.new(0) unless @frequency_table.key?(label)
      counts = @frequency_table[label]
      once_per_instance = BERNOULLI_MODELS.include?(@model)

      feature.each do |word, frequency|
        counts[word] += once_per_instance ? 1 : frequency
        @word_table[word] = 1
      end

      @instance_count_of[label] += 1
      @total_count += 1
    end

    # Scores +feature+ against every trained label.
    #
    # Returns a Hash keyed by label; see #cnb and #mnb for the values.
    def classify(feature)
      @model == "complement" ? cnb(feature) : mnb(feature)
    end

    private

    # Normalised posterior for every trained label.
    #
    # For each known word w the estimate is
    #   p = (count(label, w) + 1) / (instances(label) + vocabulary size)
    # and the label's likelihood is the product of p for words present in
    # +feature+ and (1 - p) for words that are absent, times the label prior.
    #
    # The product is accumulated as a sign plus a log-magnitude and rescaled by
    # the largest term before normalising, so long documents no longer
    # underflow to 0.0 and turn every posterior into NaN.
    #
    # NOTE(review): add-one smoothing is hard-coded here; @smoothing_parameter
    # is not consulted by this method.
    #
    # feature - Hash of word => frequency (only key membership is used).
    #
    # Returns a Hash (label => posterior) in training order. The Hash keeps
    # the historical default value of 1 for labels that were never trained.
    def mnb(feature)
      joint = {}
      @frequency_table.each_key do |label|
        joint[label] = signed_log_joint(label, feature)
      end

      # Largest finite log-magnitude; subtracting it keeps exp() in range.
      peak = nil
      joint.each_value do |_sign, log_magnitude|
        next unless log_magnitude.finite?
        peak = log_magnitude if peak.nil? || log_magnitude > peak
      end
      shift = peak || 0.0

      evidence = 0
      scaled = {}
      joint.each do |label, (sign, log_magnitude)|
        scaled[label] = sign * Math.exp(log_magnitude - shift)
        evidence += scaled[label]
      end

      posterior = Hash.new(1)
      scaled.each { |label, value| posterior[label] = value / evidence }
      posterior
    end

    # Prior times likelihood of +feature+ under +label+, as [sign, log|value|].
    #
    # The sign is carried separately because, when frequencies rather than
    # presence counts are accumulated, an estimate can exceed 1 and make
    # (1 - p) negative; tracking it reproduces the plain signed product.
    def signed_log_joint(label, feature)
      counts = @frequency_table[label]
      denominator = (@instance_count_of[label] + @word_table.size).to_f
      prior =
        if @instance_count_of.key?(label)
          @instance_count_of[label].to_f / @total_count.to_f
        else
          1.0 # a label without an instance count contributes a neutral prior
        end

      sign = prior < 0 ? -1.0 : 1.0
      log_magnitude = Math.log(prior.abs)

      @word_table.each_key do |word|
        estimate = (counts[word] + 1).to_f / denominator
        factor = feature.key?(word) ? estimate : 1 - estimate
        sign = -sign if factor < 0
        log_magnitude += Math.log(factor.abs) # Math.log(0.0) is -Infinity
      end

      [sign, log_magnitude]
    end

    # Complement-class score for every trained label:
    #
    #   log(prior(label)) - sum over words of
    #     frequency * log((count in other labels + smoothing) /
    #                     (total count in other labels + smoothing * feature.length))
    #
    # feature - enumerable of [word, frequency] pairs (typically a Hash).
    #
    # Returns a Hash (label => score) ordered by ascending score.
    def cnb(feature)
      training_instances = @instance_count_of.values.inject(0) { |sum, n| sum + n }
      pseudo_total = @smoothing_parameter * feature.length

      scored = @frequency_table.keys.map do |label|
        complement_total = total_number_of_word_in_other_class(label)
        weighted_log_sum = feature.to_a.inject(0) do |sum, (word, frequency)|
          complement_count = number_of_word_in_other_class(label, word)
          ratio = (complement_count + @smoothing_parameter).to_f /
                  (complement_total + pseudo_total)
          sum + frequency * Math.log(ratio)
        end
        prior = @instance_count_of[label].to_f / training_instances
        [label, Math.log(prior) - weighted_log_sum]
      end

      # Built from pairs (not a flattened splat) so array labels stay intact.
      Hash[scored.sort { |a, b| a[1] <=> b[1] }]
    end

    # Sum of all word counts recorded for every label except +c+.
    def total_number_of_word_in_other_class(c)
      (@frequency_table.keys - [c]).inject(0) do |total, label|
        @frequency_table[label].values.inject(total) { |sum, n| sum + n }
      end
    end

    # Sum of the counts of word +i+ over every label except +c+.
    def number_of_word_in_other_class(c, i)
      (@frequency_table.keys - [c]).inject(0) do |total, label|
        total + @frequency_table[label][i]
      end
    end
  end
end