lib/ai4r/classifiers/one_r.rb in ai4r-1.2 vs lib/ai4r/classifiers/one_r.rb in ai4r-1.3

- old
+ new

@@ -6,73 +6,41 @@ # You can redistribute it and/or modify it under the terms of # the Mozilla Public License version 1.1 as published by the # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt require 'set' -require File.dirname(__FILE__) + '/classifier_helper' +require File.dirname(__FILE__) + '/../data/data_set' +require File.dirname(__FILE__) + '/../classifiers/classifier' module Ai4r module Classifiers # = Introduction # # The idea of the OneR algorithm is identify the single # attribute to use to classify data that makes # fewest prediction errors. # It generates rules based on a single attribute. - class OneR + class OneR < Classifier - attr_accessor :data_labels, :rule - include ClassifierHelper + attr_reader :data_set, :rule - # Build a new OneR classifier. If your data is classified with N attributed - # and M examples, then your data examples must have the following format: - # - # [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CLASS_VAL1], - # [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CLASS_VAL2], - # ... - # [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM], - # ] - # - # e.g. - # [ ['New York', '<30', 'M', 'Y'], - # ['Chicago', '<30', 'M', 'Y'], - # ['Chicago', '<30', 'F', 'Y'], - # ['New York', '<30', 'M', 'Y'], - # ['New York', '<30', 'M', 'Y'], - # ['Chicago', '[30-50)', 'M', 'Y'], - # ['New York', '[30-50)', 'F', 'N'], - # ['Chicago', '[30-50)', 'F', 'Y'], - # ['New York', '[30-50)', 'F', 'N'], - # ['Chicago', '[50-80]', 'M', 'N'], - # ['New York', '[50-80]', 'F', 'N'], - # ['New York', '[50-80]', 'M', 'N'], - # ['Chicago', '[50-80]', 'M', 'N'], - # ['New York', '[50-80]', 'F', 'N'], - # ['Chicago', '>80', 'F', 'Y'] - # ] - # - # Data labels must have the following format: - # [ 'city', 'age_range', 'gender', 'marketing_target' ] - # - # If you do not provide labels for you data, the following labels will - # be created by default: - # [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ] - # - def build(data_examples, data_labels = nil) - check_data_examples(data_examples) - @data_labels = (data_labels) ? data_labels : default_data_labels(data_examples) - if (num_attributes(data_examples) == 1) - @zero_r = ZeroR.new.build(data_examples, data_labels) + # Build a new OneR classifier. You must provide a DataSet instance + # as parameter. + def build(data_set) + data_set.check_not_empty + @data_set = data_set + if (data_set.num_attributes == 1) + @zero_r = ZeroR.new.build(data_set) return self; else @zero_r = nil; end - domains = build_domains(data_examples) + domains = @data_set.build_domains @rule = nil domains[1...-1].each_index do |attr_index| - rule = build_rule(data_examples, attr_index, domains) + rule = build_rule(@data_set.data_items, attr_index, domains) @rule = rule if !@rule || rule[:correct] > @rule[:correct] end return self end @@ -86,39 +54,32 @@ end # This method returns the generated rules in ruby code. # e.g. # - # classifier.to_s + # classifier.get_rules # # => if age_range == '<30' then marketing_target = 'Y' # elsif age_range == '[30-50)' then marketing_target = 'N' # elsif age_range == '[50-80]' then marketing_target = 'N' # end # # It is a nice way to inspect induction results, and also to execute them: # marketing_target = nil - # eval classifier.to_s + # eval classifier.get_rules # puts marketing_target # # => 'Y' - def to_s - return @zero_r.to_s if @zero_r + def get_rules + return @zero_r.get_rules if @zero_r sentences = [] - attr_label = @data_labels[@rule[:attr_index]] - class_label = @data_labels.last + attr_label = @data_set.data_labels[@rule[:attr_index]] + class_label = @data_set.data_labels.last @rule[:rule].each_pair do |attr_value, class_value| sentences << "#{attr_label} == '#{attr_value}' then #{class_label} = '#{class_value}'" end return "if " + sentences.join("\nelsif ") + "\nend" end protected - def build_domains(data_examples) - domains = Array.new(num_attributes(data_examples)) { Set.new } - data_examples.each do |data| - data.each_index {|attr_index| domains[attr_index] << data[attr_index]} - end - return domains - end def build_rule(data_examples, attr_index, domains) domain = domains[attr_index] value_freq = Hash.new domain.each do |attr_value|