lib/ai4r/classifiers/one_r.rb in ai4r-1.2 vs lib/ai4r/classifiers/one_r.rb in ai4r-1.3
- old
+ new
@@ -6,73 +6,41 @@
# You can redistribute it and/or modify it under the terms of
# the Mozilla Public License version 1.1 as published by the
# Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt
require 'set'
-require File.dirname(__FILE__) + '/classifier_helper'
+require File.dirname(__FILE__) + '/../data/data_set'
+require File.dirname(__FILE__) + '/../classifiers/classifier'
module Ai4r
module Classifiers
# = Introduction
#
# The idea of the OneR algorithm is to identify the single
# attribute that classifies the data with the
# fewest prediction errors.
# It generates rules based on a single attribute.
- class OneR
+ class OneR < Classifier
- attr_accessor :data_labels, :rule
- include ClassifierHelper
+ attr_reader :data_set, :rule
- # Build a new OneR classifier. If your data is classified with N attributes
- # and M examples, then your data examples must have the following format:
- #
- # [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CLASS_VAL1],
- # [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CLASS_VAL2],
- # ...
- # [ATT1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM],
- # ]
- #
- # e.g.
- # [ ['New York', '<30', 'M', 'Y'],
- # ['Chicago', '<30', 'M', 'Y'],
- # ['Chicago', '<30', 'F', 'Y'],
- # ['New York', '<30', 'M', 'Y'],
- # ['New York', '<30', 'M', 'Y'],
- # ['Chicago', '[30-50)', 'M', 'Y'],
- # ['New York', '[30-50)', 'F', 'N'],
- # ['Chicago', '[30-50)', 'F', 'Y'],
- # ['New York', '[30-50)', 'F', 'N'],
- # ['Chicago', '[50-80]', 'M', 'N'],
- # ['New York', '[50-80]', 'F', 'N'],
- # ['New York', '[50-80]', 'M', 'N'],
- # ['Chicago', '[50-80]', 'M', 'N'],
- # ['New York', '[50-80]', 'F', 'N'],
- # ['Chicago', '>80', 'F', 'Y']
- # ]
- #
- # Data labels must have the following format:
- # [ 'city', 'age_range', 'gender', 'marketing_target' ]
- #
- # If you do not provide labels for your data, the following labels will
- # be created by default:
- # [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ]
- #
- def build(data_examples, data_labels = nil)
- check_data_examples(data_examples)
- @data_labels = (data_labels) ? data_labels : default_data_labels(data_examples)
- if (num_attributes(data_examples) == 1)
- @zero_r = ZeroR.new.build(data_examples, data_labels)
+ # Build a new OneR classifier. You must provide a DataSet instance
+ # as parameter.
+ def build(data_set)
+ data_set.check_not_empty
+ @data_set = data_set
+ if (data_set.num_attributes == 1)
+ @zero_r = ZeroR.new.build(data_set)
# Single-column case: classification is delegated to a ZeroR instance and no
# OneR rule is built (presumably num_attributes counts the class column, so
# one column means there is no attribute to test; confirm against DataSet).
return self;
else
@zero_r = nil;
end
- domains = build_domains(data_examples)
+ domains = @data_set.build_domains
@rule = nil
# Build one candidate rule per attribute index and keep the best one.
# NOTE(review): the slice 1...-1 drops the first and the last domain, and
# each_index then yields the positions inside that slice (0 .. size-3), not
# the original column numbers. As written, the last non-class attribute
# (index size-2) is never evaluated, and with exactly one attribute column
# plus the class column the loop body never runs, leaving @rule nil.
# Slicing 0...-1 may be what was intended; confirm before changing.
domains[1...-1].each_index do |attr_index|
- rule = build_rule(data_examples, attr_index, domains)
+ rule = build_rule(@data_set.data_items, attr_index, domains)
# Strict comparison: on a tie in :correct the earlier attribute is kept.
@rule = rule if !@rule || rule[:correct] > @rule[:correct]
end
return self
end
@@ -86,39 +54,32 @@
end
# This method returns the generated rules in ruby code.
# e.g.
#
- # classifier.to_s
+ # classifier.get_rules
# # => if age_range == '<30' then marketing_target = 'Y'
# elsif age_range == '[30-50)' then marketing_target = 'N'
# elsif age_range == '[50-80]' then marketing_target = 'N'
# end
#
# It is a nice way to inspect induction results, and also to execute them:
# marketing_target = nil
- # eval classifier.to_s
+ # eval classifier.get_rules
# puts marketing_target
# # => 'Y'
- def to_s
- return @zero_r.to_s if @zero_r
+ def get_rules
+ return @zero_r.get_rules if @zero_r
# @zero_r is only set by build for a single-column data set; in that case the
# ZeroR output above is returned unchanged and @rule is not consulted.
sentences = []
# The attribute label is looked up by the index stored in the winning rule;
# the class label is the last entry of the label list.
# NOTE(review): if build never assigned a rule, @rule is nil here and the
# lookup below raises NoMethodError.
- attr_label = @data_labels[@rule[:attr_index]]
- class_label = @data_labels.last
+ attr_label = @data_set.data_labels[@rule[:attr_index]]
+ class_label = @data_set.data_labels.last
# One clause per (attribute value, predicted class value) pair of the rule.
# NOTE(review): both values are interpolated between single quotes without
# escaping, so a value containing a single quote yields text that is not
# valid Ruby for the eval usage described in the doc comment of this method.
@rule[:rule].each_pair do |attr_value, class_value|
sentences << "#{attr_label} == '#{attr_value}' then #{class_label} = '#{class_value}'"
end
# The first clause is prefixed with if, the remaining ones with elsif, and the
# chain is closed with end.
return "if " + sentences.join("\nelsif ") + "\nend"
end
protected
- def build_domains(data_examples)
# Removed in 1.3; the build method now calls build_domains on the data set
# object instead.
# Returns an array with one Set per column, each holding the distinct values
# seen at that column index across all rows of data_examples.
# num_attributes is not defined in the visible part of the file; presumably it
# comes from the ClassifierHelper mixin of the 1.2 class (confirm).
- domains = Array.new(num_attributes(data_examples)) { Set.new }
- data_examples.each do |data|
- data.each_index {|attr_index| domains[attr_index] << data[attr_index]}
- end
- return domains
- end
def build_rule(data_examples, attr_index, domains)
domain = domains[attr_index]
value_freq = Hash.new
domain.each do |attr_value|