# Author::    Sergio Fierens (Implementation only)
# License::   MPL 1.1
# Project::   ai4r
# Url::       http://ai4r.rubyforge.org/
#
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt

require 'set'
require File.dirname(__FILE__) + '/classifier_helper'

module Ai4r
  module Classifiers

    # = Introduction
    #
    # The idea of the OneR algorithm is to identify the single
    # attribute to use to classify data that makes the
    # fewest prediction errors.
    # It generates rules based on a single attribute.
    #
    # NOTE(review): this class references ZeroR and the ClassifierHelper
    # helpers (check_data_examples, default_data_labels, num_attributes),
    # none of which are defined in this file; they must be loaded elsewhere.
    class OneR

      attr_accessor :data_labels, :rule

      include ClassifierHelper

      # Build a new OneR classifier. If your data is classified with N
      # attributes and M examples, then your data examples must have the
      # following format:
      #
      #   [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CLASS_VAL1],
      #     [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CLASS_VAL2],
      #     ...
      #     [ATT1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
      #   ]
      #
      # e.g.
      #   [ ['New York',  '<30',      'M', 'Y'],
      #     ['Chicago',   '<30',      'M', 'Y'],
      #     ['Chicago',   '<30',      'F', 'Y'],
      #     ['New York',  '<30',      'M', 'Y'],
      #     ['New York',  '<30',      'M', 'Y'],
      #     ['Chicago',   '[30-50)',  'M', 'Y'],
      #     ['New York',  '[30-50)',  'F', 'N'],
      #     ['Chicago',   '[30-50)',  'F', 'Y'],
      #     ['New York',  '[30-50)',  'F', 'N'],
      #     ['Chicago',   '[50-80]',  'M', 'N'],
      #     ['New York',  '[50-80]',  'F', 'N'],
      #     ['New York',  '[50-80]',  'M', 'N'],
      #     ['Chicago',   '[50-80]',  'M', 'N'],
      #     ['New York',  '[50-80]',  'F', 'N'],
      #     ['Chicago',   '>80',      'F', 'Y']
      #   ]
      #
      # Data labels must have the following format:
      #   [ 'city', 'age_range', 'gender', 'marketing_target' ]
      #
      # If you do not provide labels for your data, the following labels will
      # be created by default:
      #   [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
      #
      # When the examples carry a single column (the class value only), the
      # work is delegated to a ZeroR classifier and no rule is built.
      # Otherwise every attribute column is evaluated and the rule with the
      # highest number of correctly classified examples is kept; on a tie the
      # attribute with the lowest index wins.
      #
      # Returns self.
      def build(data_examples, data_labels = nil)
        check_data_examples(data_examples)
        @data_labels = data_labels || default_data_labels(data_examples)
        # Drop any rule left over from a previous build so that a rebuild on
        # class-only data does not expose a stale rule through #rule.
        @rule = nil
        # num_attributes appears to count every column, class included
        # (build_domains sizes its array with it and indexes by column).
        if num_attributes(data_examples) == 1
          @zero_r = ZeroR.new.build(data_examples, data_labels)
          return self
        end
        @zero_r = nil
        domains = build_domains(data_examples)
        # The last domain belongs to the class column; every other index is a
        # candidate attribute. Iterating over domains[1...-1].each_index (as
        # this method used to) silently skipped the last attribute.
        (0...(domains.length - 1)).each do |attr_index|
          candidate = build_rule(data_examples, attr_index, domains)
          @rule = candidate if @rule.nil? || candidate[:correct] > @rule[:correct]
        end
        self
      end

      # You can evaluate new data, predicting its class.
      # e.g.
      #   classifier.eval(['New York',  '<30', 'F'])  # => 'Y'
      #
      # Returns nil when the value of the selected attribute was not present
      # in the examples used to build the rule.
      def eval(data)
        return @zero_r.eval(data) if @zero_r
        @rule[:rule][data[@rule[:attr_index]]]
      end

      # This method returns the generated rules in ruby code.
      # e.g.
      #
      #   classifier.to_s
      #     # =>  if age_range == '<30' then marketing_target = 'Y'
      #           elsif age_range == '[30-50)' then marketing_target = 'Y'
      #           elsif age_range == '[50-80]' then marketing_target = 'N'
      #           elsif age_range == '>80' then marketing_target = 'Y'
      #           end
      #
      # It is a nice way to inspect induction results, and also to execute them:
      #     marketing_target = nil
      #     eval classifier.to_s
      #     puts marketing_target
      #       # =>  'Y'
      #
      # When a ZeroR classifier was built instead of a rule, its to_s is
      # returned.
      def to_s
        return @zero_r.to_s if @zero_r
        attr_label = @data_labels[@rule[:attr_index]]
        class_label = @data_labels.last
        sentences = @rule[:rule].map do |attr_value, class_value|
          "#{attr_label} == '#{attr_value}' then #{class_label} = '#{class_value}'"
        end
        "if " + sentences.join("\nelsif ") + "\nend"
      end

      protected

      # Collects, for every column of the examples (class column included),
      # the set of distinct values found in that column.
      # Returns an array of Set, one per column.
      def build_domains(data_examples)
        domains = Array.new(num_attributes(data_examples)) { Set.new }
        data_examples.each do |data|
          data.each_with_index { |value, index| domains[index] << value }
        end
        domains
      end

      # Builds the rule that predicts the class from the attribute stored at
      # attr_index: each attribute value is mapped to the class value seen
      # most often together with it.
      #
      # Returns a hash with:
      #   :attr_index => the attribute index the rule is based on
      #   :rule       => hash mapping attribute value to predicted class value
      #   :correct    => number of examples the rule classifies correctly
      def build_rule(data_examples, attr_index, domains)
        # attribute value => { class value => number of occurrences }
        value_freq = {}
        domains[attr_index].each do |attr_value|
          value_freq[attr_value] = Hash.new(0)
        end
        data_examples.each do |data|
          value_freq[data[attr_index]][data.last] += 1
        end
        rule = {}
        correct_instances = 0
        value_freq.each_pair do |attr_value, class_freq|
          best_freq = 0
          class_freq.each_pair do |class_value, freq|
            # Strict comparison: on a tie the class value yielded first by
            # the frequency hash is kept.
            next unless freq > best_freq
            rule[attr_value] = class_value
            best_freq = freq
          end
          correct_instances += best_freq
        end
        { :attr_index => attr_index, :rule => rule, :correct => correct_instances }
      end

    end
  end
end