lib/svmkit/ensemble/ada_boost_classifier.rb in svmkit-0.7.0 vs lib/svmkit/ensemble/ada_boost_classifier.rb in svmkit-0.7.1
- old
+ new
@@ -1,8 +1,10 @@
# frozen_string_literal: true
require 'svmkit/validation'
+require 'svmkit/values'
+require 'svmkit/utils'
require 'svmkit/base/base_estimator'
require 'svmkit/base/classifier'
require 'svmkit/tree/decision_tree_classifier'
module SVMKit
@@ -20,10 +22,11 @@
# *Reference*
# - J. Zhu, S. Rosset, H. Zou, and T. Hastie, "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
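#
# A minimal usage sketch (illustrative; training_samples, training_labels,
# and testing_samples are placeholder variables, not part of this diff):
#
# @example
#   estimator = SVMKit::Ensemble::AdaBoostClassifier.new(n_estimators: 10, max_depth: 3, random_seed: 1)
#   estimator.fit(training_samples, training_labels)
#   results = estimator.predict(testing_samples)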
class AdaBoostClassifier
include Base::BaseEstimator
include Base::Classifier
+ include Validation
# Return the set of estimators.
# @return [Array<DecisionTreeClassifier>]
attr_reader :estimators
@@ -50,19 +53,20 @@
# @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
# @param max_features [Integer] The number of features to consider when searching optimal split point.
# If nil is given, split process considers all features.
# @param random_seed [Integer] The seed value used to initialize the random generator.
# It is used to randomly determine the order of features when deciding a splitting point.
- def initialize(n_estimators: 50, criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+ def initialize(n_estimators: 50,
+ criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
max_features: nil, random_seed: nil)
- SVMKit::Validation.check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
- max_features: max_features, random_seed: random_seed)
- SVMKit::Validation.check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
- SVMKit::Validation.check_params_string(criterion: criterion)
- SVMKit::Validation.check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
- max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
- max_features: max_features)
+ check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+ max_features: max_features, random_seed: random_seed)
+ check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+ check_params_string(criterion: criterion)
+ check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+ max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+ max_features: max_features)
@params = {}
@params[:n_estimators] = n_estimators
@params[:criterion] = criterion
@params[:max_depth] = max_depth
@params[:max_leaf_nodes] = max_leaf_nodes
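The hunk above replaces the fully qualified SVMKit::Validation.check_* calls with bare mixin calls, which works because of the new `include Validation` line. A minimal sketch of the Ruby pattern that supports both call styles, assuming svmkit defines its checkers as module functions (the checker body here is illustrative, not svmkit's actual implementation):

    module SVMKit
      module Validation
        module_function

        # Illustrative checker: raise unless every given value is a String.
        def check_params_string(params = {})
          params.each do |key, val|
            raise TypeError, "expect #{key} to be a String" unless val.is_a?(String)
          end
          nil
        end
      end
    end

    # module_function exposes each checker both as a module method ...
    SVMKit::Validation.check_params_string(criterion: 'gini')
    # ... and as a private instance method once the module is included,
    # which is what the rewritten initialize relies on.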
@@ -80,13 +84,13 @@
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
# @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
# @return [AdaBoostClassifier] The learned classifier itself.
def fit(x, y) # rubocop:disable Metrics/AbcSize
- SVMKit::Validation.check_sample_array(x)
- SVMKit::Validation.check_label_array(y)
- SVMKit::Validation.check_sample_label_size(x, y)
+ check_sample_array(x)
+ check_label_array(y)
+ check_sample_label_size(x, y)
## Initialize some variables.
n_samples, n_features = x.shape
@estimators = []
@feature_importances = Numo::DFloat.zeros(n_features)
@params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
@@ -98,16 +102,16 @@
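# SAMME label coding: the true class is coded as 1.0 and every other
# class as -1/(K-1), so each row of y_codes sums to zero (Zhu et al.).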
y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
@params[:n_estimators].times do |_t|
# Fit classifier.
- ids = weighted_sampling(observation_weights)
+ ids = SVMKit::Utils.choice_ids(n_samples, observation_weights, @rng)
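# Stop adding estimators once a weighted resample no longer contains every class.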
break if y[ids].to_a.uniq.size != n_classes
tree = Tree::DecisionTreeClassifier.new(
criterion: @params[:criterion], max_depth: @params[:max_depth],
max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
- max_features: @params[:max_features], random_seed: @rng.rand(int_max)
+ max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values.int_max)
)
tree.fit(x[ids, true], y[ids])
# Calculate estimator error.
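# Clipping at 1.0e-15 keeps the predicted probabilities strictly positive, so downstream logarithms stay finite.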
proba = tree.predict_proba(x).clip(1.0e-15, nil)
p = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
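The new SVMKit::Utils.choice_ids call above replaces the private weighted_sampling helper removed at the end of this diff. Judging from the call site and the removed helper, it presumably draws sample indices with replacement according to the observation weights; a sketch reproducing the removed behavior (the actual svmkit-0.7.1 implementation may differ):

    module SVMKit
      module Utils
        module_function

        # Draw `size` indices with replacement; index i is chosen with
        # probability weights[i] (weights are assumed to sum to one).
        def choice_ids(size, weights, rng)
          Array.new(size) do
            target = rng.rand
            chosen = 0
            weights.each_with_index do |w, idx|
              if target <= w
                chosen = idx
                break
              end
              target -= w
            end
            chosen
          end
        end
      end
    end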
@@ -132,11 +136,11 @@
# Calculate confidence scores for samples.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
def decision_function(x)
- SVMKit::Validation.check_sample_array(x)
+ check_sample_array(x)
n_samples, = x.shape
n_classes = @classes.size
sum_probs = Numo::DFloat.zeros(n_samples, n_classes)
@estimators.each do |tree|
log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
@@ -148,34 +152,37 @@
# Predict class labels for samples.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
# @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
def predict(x)
- SVMKit::Validation.check_sample_array(x)
+ check_sample_array(x)
n_samples, = x.shape
probs = decision_function(x)
Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
end
# Predict probability for samples.
#
# @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probabilities.
# @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
def predict_proba(x)
- SVMKit::Validation.check_sample_array(x)
+ check_sample_array(x)
n_classes = @classes.size
probs = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
sum_probs = probs.sum(1)
probs /= Numo::DFloat[sum_probs].transpose
probs
end
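predict_proba maps the additive scores through exp(score / (K - 1)) and then divides each row by its sum. A tiny standalone illustration of that row-normalization idiom with fabricated numbers:

    require 'numo/narray'

    scores = Numo::DFloat[[0.2, 1.1], [0.5, 0.5]] # made-up 2-sample, 2-class scores
    probs = Numo::NMath.exp(scores)
    probs /= Numo::DFloat[probs.sum(1)].transpose # divide each row by its row sum
    probs.sum(1) # => Numo::DFloat[1.0, 1.0]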
# Dump marshal data.
# @return [Hash] The marshal data about AdaBoostClassifier.
def marshal_dump
- { params: @params, estimators: @estimators, classes: @classes,
- feature_importances: @feature_importances, rng: @rng }
+ { params: @params,
+ estimators: @estimators,
+ classes: @classes,
+ feature_importances: @feature_importances,
+ rng: @rng }
end
# Load marshal data.
# @return [nil]
def marshal_load(obj)
@@ -183,30 +190,9 @@
@estimators = obj[:estimators]
@classes = obj[:classes]
@feature_importances = obj[:feature_importances]
@rng = obj[:rng]
nil
- end
-
- private
-
- def weighted_sampling(weights)
- Array.new(weights.size) do
- target = @rng.rand
- chosen = 0
- weights.each_with_index do |w, idx|
- if target <= w
- chosen = idx
- break
- end
- target -= w
- end
- chosen
- end
- end
-
- def int_max
- @int_max ||= 2**([42].pack('i').size * 16 - 2) - 1
end
end
end
end
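Since the class keeps its marshal_dump/marshal_load pair, a trained model can be persisted with Ruby's standard Marshal. A usage sketch (the file name and the fitted `estimator` variable are assumptions):

    File.open('ada_boost.model', 'wb') { |f| f.write(Marshal.dump(estimator)) }
    restored = Marshal.load(File.binread('ada_boost.model'))
    restored.predict(testing_samples)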