# Author:: Thomas Kern # License:: MPL 1.1 # Project:: ai4r # Url:: http://ai4r.org/ # # You can redistribute it and/or modify it under the terms of # the Mozilla Public License version 1.1 as published by the # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt require File.dirname(__FILE__) + '/../data/data_set' require File.dirname(__FILE__) + '/classifier' module Ai4r module Classifiers # = Introduction # # This is an implementation of a Naive Bayesian Classifier without any # specialisation (ie. for text classification) # Probabilities P(a_i | v_j) are estimated using m-estimates, hence the # m parameter as second parameter when isntantiating the class. # The estimation looks like this: #(n_c + mp) / (n + m) # # the variables are: # n = the number of training examples for which v = v_j # n_c = number of examples for which v = v_j and a = a_i # p = a priori estimate for P(a_i | v_j) # m = the equivalent sample size # # stores the conditional probabilities in an array named @pcp and in this form: # @pcp[attributes][values][classes] # # This kind of estimator is useful when the training data set is relatively small. # If the data set is big enough, set it to 0, which is also the default value # # # For further details regarding Bayes and Naive Bayes Classifier have a look at those websites: # http://en.wikipedia.org/wiki/Naive_Bayesian_classification # http://en.wikipedia.org/wiki/Bayes%27_theorem # # # = Parameters # # * :m => Optional. Default value is set to 0. It may be set to a value greater than 0 when # the size of the dataset is relatively small # # = How to use it # # data = DataSet.new.load_csv_with_labels "bayes_data.csv" # b = NaiveBayes.new. # set_parameters({:m=>3}). # build data # b.eval(["Red", "SUV", "Domestic"]) # class NaiveBayes < Classifier parameters_info :m => "Default value is set to 0. It may be set to a value greater than " + "0 when the size of the dataset is relatively small" def initialize @m = 0 @class_counts = [] @class_prob = [] # stores the probability of the classes @pcc = [] # stores the number of instances divided into attribute/value/class @pcp = [] # stores the conditional probabilities of the values of an attribute @klass_index = {} # hashmap for quick lookup of all the used klasses and their indice @values = {} # hashmap for quick lookup of all the values end # You can evaluate new data, predicting its category. # e.g. # b.eval(["Red", "SUV", "Domestic"]) # => 'No' def eval(data) prob = @class_prob.map {|cp| cp} prob = calculate_class_probabilities_for_entry(data, prob) index_to_klass(prob.index(prob.max)) end # Calculates the probabilities for the data entry Data. # data has to be an array of the same dimension as the training data minus the # class column. # Returns a map containint all classes as keys: # {Class_1 => probability, Class_2 => probability2 ... } # Probability is <= 1 and of type Float. # e.g. # b.get_probability_map(["Red", "SUV", "Domestic"]) # => {"Yes"=>0.4166666666666667, "No"=>0.5833333333333334} def get_probability_map(data) prob = @class_prob.map {|cp| cp} prob = calculate_class_probabilities_for_entry(data, prob) prob = normalize_class_probability prob probability_map = {} prob.each_with_index { |p, i| probability_map[index_to_klass(i)] = p } return probability_map end # counts values of the attribute instances and calculates the probability of the classes # and the conditional probabilities # Parameter data has to be an instance of CsvDataSet def build(data) raise "Error instance must be passed" unless data.is_a?(DataSet) raise "Data should not be empty" if data.data_items.length == 0 initialize_domain_data(data) initialize_klass_index initialize_pc calculate_probabilities return self end private def initialize_domain_data(data) @domains = data.build_domains @data_items = data.data_items.map { |item| DataEntry.new(item[0...-1], item.last) } @data_labels = data.data_labels[0...-1] @klasses = @domains.last.to_a end # calculates the klass probability of a data entry # as usual, the probability of the value is multiplied with every conditional # probability of every attribute in condition to a specific class # this is repeated for every class def calculate_class_probabilities_for_entry(data, prob) prob.each_with_index do |prob_entry, prob_index| data.each_with_index do |att, index| next if value_index(att, index).nil? prob[prob_index] *= @pcp[index][value_index(att, index)][prob_index] end end end # normalises the array of probabilities so the sum of the array equals 1 def normalize_class_probability(prob) prob_sum = sum(prob) prob_sum > 0 ? prob.map {|prob_entry| prob_entry / prob_sum } : prob end # sums an array up; returns a number of type Float def sum(array) array.inject(0.0){|b, i| b+i} end # returns the name of the class when the index is found def index_to_klass(index) @klass_index.has_value?(index) ? @klass_index.key(index) : nil end # initializes @values and @klass_index; maps a certain value to a uniq index def initialize_klass_index @klasses.each_with_index do |dl, index| @klass_index[dl] = index end @data_labels.each_with_index do |dl, index| @values[index] = {} @domains[index].each_with_index do |d, d_index| @values[index][d] = d_index end end end # returns the index of a class def klass_index(klass) @klass_index[klass] end # returns the index of a value, depending on the attribute index def value_index(value, dl_index) @values[dl_index][value] end # builds an array of the form: # array[attributes][values][classes] def build_array(dl, index) domains = Array.new(@domains[index].length) domains.map do |p1| pl = Array.new @klasses.length, 0 end end # initializes the two array for storing the count and conditional probabilities of # the attributes def initialize_pc @data_labels.each_with_index do |dl, index| @pcc << build_array(dl, index) @pcp << build_array(dl, index) end end # calculates the occurrences of a class and the instances of a certain value of a # certain attribute and the assigned class. # In addition to that, it also calculates the conditional probabilities and values def calculate_probabilities @klasses.each {|dl| @class_counts[klass_index(dl)] = 0} calculate_class_probabilities count_instances calculate_conditional_probabilities end def calculate_class_probabilities @data_items.each do |entry| @class_counts[klass_index(entry.klass)] += 1 end @class_counts.each_with_index do |k, index| @class_prob[index] = k.to_f / @data_items.length end end # counts the instances of a certain value of a certain attribute and the assigned class def count_instances @data_items.each do |item| @data_labels.each_with_index do |dl, dl_index| @pcc[dl_index][value_index(item[dl_index], dl_index)][klass_index(item.klass)] += 1 end end end # calculates the conditional probability and stores it in the @pcp-array def calculate_conditional_probabilities @pcc.each_with_index do |attributes, a_index| attributes.each_with_index do |values, v_index| values.each_with_index do |klass, k_index| @pcp[a_index][v_index][k_index] = (klass.to_f + @m * @class_prob[k_index]) / (@class_counts[k_index] + @m).to_f end end end end #DataEntry stores the instance of the data entry #the data is accessible via entries #stores the class-column in the attribute klass and #removes the column for the class-entry class DataEntry attr_accessor :klass, :entries def initialize(attributes, klass) @klass = klass @entries = attributes end # wrapper method for the access to @entries def [](index) @entries[index] end end end end end # Monkeypatch to support both ruby 1.8 and 1.9 (key vs index method) class Hash alias_method(:key, :index) unless method_defined?(:key) end