lib/svmkit/ensemble/ada_boost_classifier.rb in svmkit-0.7.0 vs lib/svmkit/ensemble/ada_boost_classifier.rb in svmkit-0.7.1

- old
+ new

@@ -1,8 +1,10 @@
 # frozen_string_literal: true
 
 require 'svmkit/validation'
+require 'svmkit/values'
+require 'svmkit/utils'
 require 'svmkit/base/base_estimator'
 require 'svmkit/base/classifier'
 require 'svmkit/tree/decision_tree_classifier'
 
 module SVMKit
@@ -20,10 +22,11 @@
     # *Reference*
     # - J. Zhu, S. Rosset, H. Zou, and T.Hashie, "Multi-class AdaBoost," Technical Report No. 430, Department of Statistics, University of Michigan, 2005.
     class AdaBoostClassifier
       include Base::BaseEstimator
       include Base::Classifier
+      include Validation
 
       # Return the set of estimators.
       # @return [Array<DecisionTreeClassifier>]
       attr_reader :estimators
 
@@ -50,19 +53,20 @@
       # @param min_samples_leaf [Integer] The minimum number of samples at a leaf node.
       # @param max_features [Integer] The number of features to consider when searching optimal split point.
       #   If nil is given, split process considers all features.
       # @param random_seed [Integer] The seed value using to initialize the random generator.
       #   It is used to randomly determine the order of features when deciding spliting point.
-      def initialize(n_estimators: 50, criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
+      def initialize(n_estimators: 50,
+                     criterion: 'gini', max_depth: nil, max_leaf_nodes: nil, min_samples_leaf: 1,
                      max_features: nil, random_seed: nil)
-        SVMKit::Validation.check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
-                                                    max_features: max_features, random_seed: random_seed)
-        SVMKit::Validation.check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
-        SVMKit::Validation.check_params_string(criterion: criterion)
-        SVMKit::Validation.check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
-                                                 max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
-                                                 max_features: max_features)
+        check_params_type_or_nil(Integer, max_depth: max_depth, max_leaf_nodes: max_leaf_nodes,
+                                 max_features: max_features, random_seed: random_seed)
+        check_params_integer(n_estimators: n_estimators, min_samples_leaf: min_samples_leaf)
+        check_params_string(criterion: criterion)
+        check_params_positive(n_estimators: n_estimators, max_depth: max_depth,
+                              max_leaf_nodes: max_leaf_nodes, min_samples_leaf: min_samples_leaf,
+                              max_features: max_features)
         @params = {}
         @params[:n_estimators] = n_estimators
         @params[:criterion] = criterion
         @params[:max_depth] = max_depth
         @params[:max_leaf_nodes] = max_leaf_nodes
@@ -80,13 +84,13 @@
       #
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
       # @param y [Numo::Int32] (shape: [n_samples]) The labels to be used for fitting the model.
       # @return [AdaBoostClassifier] The learned classifier itself.
       def fit(x, y) # rubocop:disable Metrics/AbcSize
-        SVMKit::Validation.check_sample_array(x)
-        SVMKit::Validation.check_label_array(y)
-        SVMKit::Validation.check_sample_label_size(x, y)
+        check_sample_array(x)
+        check_label_array(y)
+        check_sample_label_size(x, y)
         ## Initialize some variables.
         n_samples, n_features = x.shape
         @estimators = []
         @feature_importances = Numo::DFloat.zeros(n_features)
         @params[:max_features] = n_features unless @params[:max_features].is_a?(Integer)
@@ -98,16 +102,16 @@
         y_codes = Numo::DFloat.zeros(n_samples, n_classes) - 1.fdiv(n_classes - 1)
         n_samples.times { |n| y_codes[n, classes_arr.index(y[n])] = 1.0 }
         observation_weights = Numo::DFloat.zeros(n_samples) + 1.fdiv(n_samples)
         @params[:n_estimators].times do |_t|
           # Fit classfier.
-          ids = weighted_sampling(observation_weights)
+          ids = SVMKit::Utils.choice_ids(n_samples, observation_weights, @rng)
           break if y[ids].to_a.uniq.size != n_classes
           tree = Tree::DecisionTreeClassifier.new(
             criterion: @params[:criterion], max_depth: @params[:max_depth],
             max_leaf_nodes: @params[:max_leaf_nodes], min_samples_leaf: @params[:min_samples_leaf],
-            max_features: @params[:max_features], random_seed: @rng.rand(int_max)
+            max_features: @params[:max_features], random_seed: @rng.rand(SVMKit::Values::int_max)
           )
           tree.fit(x[ids, true], y[ids])
           # Calculate estimator error.
           proba = tree.predict_proba(x).clip(1.0e-15, nil)
           p = Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[proba[n, true].max_index] })
@@ -132,11 +136,11 @@
       # Calculate confidence scores for samples.
       #
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to compute the scores.
       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Confidence score per sample.
       def decision_function(x)
-        SVMKit::Validation.check_sample_array(x)
+        check_sample_array(x)
         n_samples, = x.shape
         n_classes = @classes.size
         sum_probs = Numo::DFloat.zeros(n_samples, n_classes)
         @estimators.each do |tree|
           log_proba = Numo::NMath.log(tree.predict_proba(x).clip(1.0e-15, nil))
@@ -148,34 +152,37 @@
       # Predict class labels for samples.
       #
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the labels.
       # @return [Numo::Int32] (shape: [n_samples]) Predicted class label per sample.
       def predict(x)
-        SVMKit::Validation.check_sample_array(x)
+        check_sample_array(x)
         n_samples, = x.shape
         probs = decision_function(x)
         Numo::Int32.asarray(Array.new(n_samples) { |n| @classes[probs[n, true].max_index] })
       end
 
       # Predict probability for samples.
       #
       # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the probailities.
       # @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probability of each class per sample.
       def predict_proba(x)
-        SVMKit::Validation.check_sample_array(x)
+        check_sample_array(x)
         n_classes = @classes.size
         probs = Numo::NMath.exp(1.fdiv(n_classes - 1) * decision_function(x))
         sum_probs = probs.sum(1)
         probs /= Numo::DFloat[sum_probs].transpose
         probs
       end
 
       # Dump marshal data.
       # @return [Hash] The marshal data about AdaBoostClassifier.
       def marshal_dump
-        { params: @params, estimators: @estimators, classes: @classes,
-          feature_importances: @feature_importances, rng: @rng }
+        { params: @params,
+          estimators: @estimators,
+          classes: @classes,
+          feature_importances: @feature_importances,
+          rng: @rng }
       end
 
       # Load marshal data.
       # @return [nil]
       def marshal_load(obj)
@@ -183,30 +190,9 @@
         @estimators = obj[:estimators]
         @classes = obj[:classes]
         @feature_importances = obj[:feature_importances]
         @rng = obj[:rng]
         nil
       end
-
-      private
-
-      def weighted_sampling(weights)
-        Array.new(weights.size) do
-          target = @rng.rand
-          chosen = 0
-          weights.each_with_index do |w, idx|
-            if target <= w
-              chosen = idx
-              break
-            end
-            target -= w
-          end
-          chosen
-        end
-      end
-
-      def int_max
-        @int_max ||= 2**([42].pack('i').size * 16 - 2) - 1
-      end
     end
   end
 end
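Notes on the change: the private `weighted_sampling` helper (and `int_max`, below) moved out of this class into shared utility modules, which is why `svmkit/utils` and `svmkit/values` are now required, and the validation calls shortened to mixin form via `include Validation`. For reference, here is a minimal sketch of the inverse-CDF sampling the removed helper performed; the extracted `SVMKit::Utils.choice_ids` presumably keeps the same logic but takes the sample count and RNG as explicit arguments rather than reading instance state (an assumption based on the call site in `#fit`):

```ruby
# Sketch of the removed weighted_sampling helper, reworked as a free
# function in the shape of SVMKit::Utils.choice_ids (signature assumed
# from the call site in #fit).
def choice_ids_sketch(size, weights, rng)
  Array.new(size) do
    target = rng.rand # uniform draw in [0, 1); weights are assumed to sum to 1
    chosen = 0
    weights.each_with_index do |w, idx|
      if target <= w # the draw lands in this index's slice of the CDF
        chosen = idx
        break
      end
      target -= w # step past this slice and test the next one
    end
    chosen
  end
end

# Draw 4 indices in proportion to the given observation weights.
ids = choice_ids_sketch(4, [0.1, 0.2, 0.3, 0.4], Random.new(1))
```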
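Similarly, the removed `int_max` helper, used to draw a random seed for each weak learner, is replaced by `SVMKit::Values.int_max`. The expression it wrapped is worth decoding:

```ruby
# [42].pack('i').size is the byte width of a native C int (4 on common
# platforms), so the expression below evaluates to 2**62 - 1: the largest
# integer MRI stores as an immediate Fixnum on 64-bit builds.
# (Assumption: SVMKit::Values.int_max computes the same value.)
int_max = 2**([42].pack('i').size * 16 - 2) - 1
puts int_max # => 4611686018427387903 on a 64-bit platform
```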
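None of these refactors change the public API exercised in the diff above; the following usage sketch, with synthetic data purely for illustration, works identically against 0.7.0 and 0.7.1:

```ruby
require 'svmkit'

# Synthetic data for illustration only; shapes follow the documented API
# (x: [n_samples, n_features], y: [n_samples]).
x = Numo::DFloat.new(100, 4).rand
y = Numo::Int32.asarray(Array.new(100) { rand(3) })

estimator = SVMKit::Ensemble::AdaBoostClassifier.new(
  n_estimators: 50, criterion: 'gini', max_depth: 3, random_seed: 1
)
estimator.fit(x, y)

estimator.predict(x)           # Numo::Int32,  shape [n_samples]
estimator.decision_function(x) # Numo::DFloat, shape [n_samples, n_classes]
estimator.predict_proba(x)     # Numo::DFloat, shape [n_samples, n_classes]
```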