# Author:: Sergio Fierens (Implementation only, Cendrowska is # the creator of the algorithm) # License:: MPL 1.1 # Project:: ai4r # Url:: http://ai4r.rubyforge.org/ # # You can redistribute it and/or modify it under the terms of # the Mozilla Public License version 1.1 as published by the # Mozilla Foundation at http://www.mozilla.org/MPL/MPL-1.1.txt # # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules. # International Journal of Man-Machine Studies. 27(4):349-370. require File.dirname(__FILE__) + '/classifier_helper' module Ai4r module Classifiers # = Introduction # This is an implementation of the PRISM algorithm (Cendrowska, 1987) # Given a set of preclassified examples, it builds a set of rules # to predict the class of other instaces. # # J. Cendrowska (1987). PRISM: An algorithm for inducing modular rules. # International Journal of Man-Machine Studies. 27(4):349-370. class Prism attr_accessor :data_labels, :rules include ClassifierHelper # Build a new Prism classifier. If your data is classified with N attributed # and M examples, then your data examples must have the following format: # # [ [ATT1_VAL1, ATT2_VAL1, ATT3_VAL1, ... , ATTN_VAL1, CLASS_VAL1], # [ATT1_VAL2, ATT2_VAL2, ATT3_VAL2, ... , ATTN_VAL2, CLASS_VAL2], # ... # [ATTM1_VALM, ATT2_VALM, ATT3_VALM, ... , ATTN_VALM, CLASS_VALM], # ] # # e.g. # [ ['New York', '<30', 'M', 'Y'], # ['Chicago', '<30', 'M', 'Y'], # ['Chicago', '<30', 'F', 'Y'], # ['New York', '<30', 'M', 'Y'], # ['New York', '<30', 'M', 'Y'], # ['Chicago', '[30-50)', 'M', 'Y'], # ['New York', '[30-50)', 'F', 'N'], # ['Chicago', '[30-50)', 'F', 'Y'], # ['New York', '[30-50)', 'F', 'N'], # ['Chicago', '[50-80]', 'M', 'N'], # ['New York', '[50-80]', 'F', 'N'], # ['New York', '[50-80]', 'M', 'N'], # ['Chicago', '[50-80]', 'M', 'N'], # ['New York', '[50-80]', 'F', 'N'], # ['Chicago', '>80', 'F', 'Y'] # ] # # Data labels must have the following format: # [ 'city', 'age_range', 'gender', 'marketing_target' ] # # If you do not provide labels for you data, the following labels will # be created by default: # [ 'attribute_1', 'attribute_2', 'attribute_3', 'class_value' ] # def build(data_examples, data_labels=nil) check_data_examples(data_examples) @data_labels = (data_labels) ? data_labels : default_data_labels(data_examples) domains = build_domains(data_examples) instances = data_examples.collect {|data| data } @rules = [] domains.last.each do |class_value| while(has_class_value(instances, class_value)) rule = build_rule(class_value, instances) @rules << rule instances = instances.select {|data| !matches_conditions(data, rule[:conditions])} end end return self end # You can evaluate new data, predicting its class. # e.g. # classifier.eval(['New York', '<30', 'F']) # => 'Y' def eval(instace) @rules.each do |rule| return rule[:class_value] if matches_conditions(instace, rule[:conditions]) end return nil end # This method returns the generated rules in ruby code. # e.g. # # classifier.to_s # # => if age_range == '<30' then marketing_target = 'Y' # elsif age_range == '>80' then marketing_target = 'Y' # elsif city == 'Chicago' and age_range == '[30-50)' then marketing_target = 'Y' # else marketing_target = 'N' # end # # It is a nice way to inspect induction results, and also to execute them: # age_range = '[30-50)' # city = 'New York' # eval(classifier.to_s) # puts marketing_target # 'Y' def to_s out = "if #{join_terms(@rules.first)} then #{then_clause(@rules.first)}" @rules[1...-1].each do |rule| out += "\nelsif #{join_terms(rule)} then #{then_clause(rule)}" end out += "\nelse #{then_clause(@rules.last)}" if @rules.size > 1 out += "\nend" return out end protected def has_class_value(instances, class_value) instances.each { |data| return true if data.last == class_value} return false end def is_perfect(instances, rule) class_value = rule[:class_value] instances.each do |data| return false if data.last != class_value and matches_conditions(data, rule[:conditions]) end return true end def matches_conditions(data, conditions) conditions.each_pair do |attr_label, attr_value| return false if data[get_attr_index(attr_label)] != attr_value end return true end def get_attr_index(attr_label) return @data_labels.index(attr_label) end def get_attr_value(data, attr_label) return data[get_attr_index(attr_label)] end def build_rule(class_value, instances) rule = {:class_value => class_value, :conditions => {}} rule_instances = instances.collect {|data| data } attributes = @data_labels[0...-1].collect {|label| label } until(is_perfect(instances, rule) || attributes.empty?) freq_table = build_freq_table(rule_instances, attributes, class_value) condition = get_condition(freq_table) rule[:conditions].merge!(condition) rule_instances = rule_instances.select do |data| matches_conditions(data, condition) end end return rule end # Returns a structure with the folloring format: # => {attr1_label => { :attr1_value1 => [p, t], attr1_value2 => [p, t], ... }, # attr2_label => { :attr2_value1 => [p, t], attr2_value2 => [p, t], ... }, # ... # } # where p is the number of instances classified as class_value # with that attribute value, and t is the total number of instances with # that attribute value def build_freq_table(rule_instances, attributes, class_value) freq_table = Hash.new() rule_instances.each do |data| attributes.each do |attr_label| attr_freqs = freq_table[attr_label] || Hash.new([0, 0]) pt = attr_freqs[get_attr_value(data, attr_label)] pt = [(data.last == class_value) ? pt[0]+1 : pt[0], pt[1]+1] attr_freqs[get_attr_value(data, attr_label)] = pt freq_table[attr_label] = attr_freqs end end return freq_table end # returns a single conditional term: {attrN_label => attrN_valueM} # selecting the attribute with higher pt ratio # (occurrences of attribute value classified as class_value / # occurrences of attribute value) def get_condition(freq_table) best_pt = [0, 0] condition = nil freq_table.each do |attr_label, attr_freqs| attr_freqs.each do |attr_value, pt| if(better_pt(pt, best_pt)) condition = { attr_label => attr_value } best_pt = pt end end end return condition end # pt = [p, t] # p = occurrences of attribute value with instance classified as class_value # t = occurrences of attribute value # a pt is better if: # 1- its ratio is higher # 2- its ratio is equal, and has a higher p def better_pt(pt, best_pt) return false if pt[1] == 0 return true if best_pt[1] == 0 a = pt[0]*best_pt[1] b = best_pt[0]*pt[1] return true if a>b || (a==b && pt[0]>best_pt[0]) return false end def join_terms(rule) terms = [] rule[:conditions].each do |attr_label, attr_value| terms << "#{attr_label} == '#{attr_value}'" end "#{terms.join(" and ")}" end def then_clause(rule) "#{@data_labels.last} = '#{rule[:class_value]}'" end end end end