#include "ext.h" double* alloc_dbl_array(const long n_dimensions) { double* arr = ALLOC_N(double, n_dimensions); memset(arr, 0, n_dimensions * sizeof(double)); return arr; } double calc_gini_coef(double* histogram, const long n_elements, const long n_classes) { double gini = 0.0; for (long i = 0; i < n_classes; i++) { double el = histogram[i] / n_elements; gini += el * el; } return 1.0 - gini; } double calc_entropy(double* histogram, const long n_elements, const long n_classes) { double entropy = 0.0; for (long i = 0; i < n_classes; i++) { double el = histogram[i] / n_elements; entropy += el * log(el + 1.0); } return -entropy; } VALUE calc_mean_vec(double* sum_vec, const long n_dimensions, const long n_elements) { VALUE mean_vec = rb_ary_new2(n_dimensions); for (long i = 0; i < n_dimensions; i++) { rb_ary_store(mean_vec, i, DBL2NUM(sum_vec[i] / n_elements)); } return mean_vec; } double calc_vec_mae(VALUE vec_a, VALUE vec_b) { const long n_dimensions = RARRAY_LEN(vec_a); double sum = 0.0; for (long i = 0; i < n_dimensions; i++) { double diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i)); sum += fabs(diff); } return sum / n_dimensions; } double calc_vec_mse(VALUE vec_a, VALUE vec_b) { const long n_dimensions = RARRAY_LEN(vec_a); double sum = 0.0; for (long i = 0; i < n_dimensions; i++) { double diff = NUM2DBL(rb_ary_entry(vec_a, i)) - NUM2DBL(rb_ary_entry(vec_b, i)); sum += diff * diff; } return sum / n_dimensions; } double calc_mae(VALUE target_vecs, VALUE mean_vec) { const long n_elements = RARRAY_LEN(target_vecs); double sum = 0.0; for (long i = 0; i < n_elements; i++) { sum += calc_vec_mae(rb_ary_entry(target_vecs, i), mean_vec); } return sum / n_elements; } double calc_mse(VALUE target_vecs, VALUE mean_vec) { const long n_elements = RARRAY_LEN(target_vecs); double sum = 0.0; for (long i = 0; i < n_elements; i++) { sum += calc_vec_mse(rb_ary_entry(target_vecs, i), mean_vec); } return sum / n_elements; } double calc_impurity_cls(const char* criterion, double* histogram, const long n_elements, const long n_classes) { if (strcmp(criterion, "entropy") == 0) { return calc_entropy(histogram, n_elements, n_classes); } return calc_gini_coef(histogram, n_elements, n_classes); } double calc_impurity_reg(const char* criterion, VALUE target_vecs, double* sum_vec) { const long n_elements = RARRAY_LEN(target_vecs); const long n_dimensions = RARRAY_LEN(rb_ary_entry(target_vecs, 0)); VALUE mean_vec = calc_mean_vec(sum_vec, n_dimensions, n_elements); if (strcmp(criterion, "mae") == 0) { return calc_mae(target_vecs, mean_vec); } return calc_mse(target_vecs, mean_vec); } void add_sum_vec(double* sum_vec, VALUE target) { const long n_dimensions = RARRAY_LEN(target); for (long i = 0; i < n_dimensions; i++) { sum_vec[i] += NUM2DBL(rb_ary_entry(target, i)); } } void sub_sum_vec(double* sum_vec, VALUE target) { const long n_dimensions = RARRAY_LEN(target); for (long i = 0; i < n_dimensions; i++) { sum_vec[i] -= NUM2DBL(rb_ary_entry(target, i)); } } /** * @!visibility private */ typedef struct { char* criterion; long n_classes; double impurity; } split_opts_cls; /** * @!visibility private */ static void iter_find_split_params_cls(na_loop_t const* lp) { const int32_t* o = (int32_t*)NDL_PTR(lp, 0); const double* f = (double*)NDL_PTR(lp, 1); const int32_t* y = (int32_t*)NDL_PTR(lp, 2); const long n_elements = NDL_SHAPE(lp, 0)[0]; const char* criterion = ((split_opts_cls*)lp->opt_ptr)->criterion; const long n_classes = ((split_opts_cls*)lp->opt_ptr)->n_classes; const double w_impurity = ((split_opts_cls*)lp->opt_ptr)->impurity; double* params = (double*)NDL_PTR(lp, 3); long curr_pos = 0; long next_pos = 0; long n_l_elements = 0; long n_r_elements = n_elements; double curr_el = f[o[0]]; double last_el = f[o[n_elements - 1]]; double next_el; double l_impurity; double r_impurity; double gain; double* l_histogram = alloc_dbl_array(n_classes); double* r_histogram = alloc_dbl_array(n_classes); /* Initialize optimal parameters. */ params[0] = 0.0; /* left impurity */ params[1] = w_impurity; /* right impurity */ params[2] = curr_el; /* threshold */ params[3] = 0.0; /* gain */ /* Initialize child node variables. */ for (long i = 0; i < n_elements; i++) { r_histogram[y[o[i]]] += 1.0; } /* Find optimal parameters. */ while (curr_pos < n_elements && curr_el != last_el) { next_el = f[o[next_pos]]; while (next_pos < n_elements && next_el == curr_el) { l_histogram[y[o[next_pos]]] += 1; n_l_elements++; r_histogram[y[o[next_pos]]] -= 1; n_r_elements--; next_pos++; next_el = f[o[next_pos]]; } /* Calculate gain of new split. */ l_impurity = calc_impurity_cls(criterion, l_histogram, n_l_elements, n_classes); r_impurity = calc_impurity_cls(criterion, r_histogram, n_r_elements, n_classes); gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements; /* Update optimal parameters. */ if (gain > params[3]) { params[0] = l_impurity; params[1] = r_impurity; params[2] = 0.5 * (curr_el + next_el); params[3] = gain; } if (next_pos == n_elements) break; curr_pos = next_pos; curr_el = f[o[curr_pos]]; } xfree(l_histogram); xfree(r_histogram); } /** * @!visibility private * Find for split point with maximum information gain. * * @overload find_split_params(criterion, impurity, order, features, labels, n_classes) -> Array * * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'gini' and 'entropy'. * @param impurity [Float] The impurity of whole dataset. * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values. * @param features [Numo::DFloat] (shape: [n_elements]) The feature values. * @param labels [Numo::Int32] (shape: [n_elements]) The labels. * @param n_classes [Integer] The number of classes. * @return [Array] The array consists of optimal parameters including impurities of child nodes, threshold, and gain. */ static VALUE find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE labels, VALUE n_classes) { ndfunc_arg_in_t ain[3] = {{numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cInt32, 1}}; size_t out_shape[1] = {4}; ndfunc_arg_out_t aout[1] = {{numo_cDFloat, 1, out_shape}}; ndfunc_t ndf = {(na_iter_func_t)iter_find_split_params_cls, NO_LOOP, 3, 1, ain, aout}; split_opts_cls opts = {StringValuePtr(criterion), NUM2LONG(n_classes), NUM2DBL(impurity)}; VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, labels); VALUE results = rb_ary_new2(4); double* params_ptr = (double*)na_get_pointer_for_read(params); rb_ary_store(results, 0, DBL2NUM(params_ptr[0])); rb_ary_store(results, 1, DBL2NUM(params_ptr[1])); rb_ary_store(results, 2, DBL2NUM(params_ptr[2])); rb_ary_store(results, 3, DBL2NUM(params_ptr[3])); RB_GC_GUARD(params); RB_GC_GUARD(criterion); return results; } /** * @!visibility private */ typedef struct { char* criterion; double impurity; } split_opts_reg; /** * @!visibility private */ static void iter_find_split_params_reg(na_loop_t const* lp) { const int32_t* o = (int32_t*)NDL_PTR(lp, 0); const double* f = (double*)NDL_PTR(lp, 1); const double* y = (double*)NDL_PTR(lp, 2); const long n_elements = NDL_SHAPE(lp, 0)[0]; const long n_outputs = NDL_SHAPE(lp, 2)[1]; const char* criterion = ((split_opts_reg*)lp->opt_ptr)->criterion; const double w_impurity = ((split_opts_reg*)lp->opt_ptr)->impurity; double* params = (double*)NDL_PTR(lp, 3); long curr_pos = 0; long next_pos = 0; long n_l_elements = 0; long n_r_elements = n_elements; double curr_el = f[o[0]]; double last_el = f[o[n_elements - 1]]; double next_el; double l_impurity; double r_impurity; double gain; double* l_sum_vec = alloc_dbl_array(n_outputs); double* r_sum_vec = alloc_dbl_array(n_outputs); double target_var; VALUE l_target_vecs = rb_ary_new(); VALUE r_target_vecs = rb_ary_new(); VALUE target; /* Initialize optimal parameters. */ params[0] = 0.0; /* left impurity */ params[1] = w_impurity; /* right impurity */ params[2] = curr_el; /* threshold */ params[3] = 0.0; /* gain */ /* Initialize child node variables. */ for (long i = 0; i < n_elements; i++) { target = rb_ary_new2(n_outputs); for (long j = 0; j < n_outputs; j++) { target_var = y[o[i] * n_outputs + j]; rb_ary_store(target, j, DBL2NUM(target_var)); r_sum_vec[j] += target_var; } rb_ary_push(r_target_vecs, target); } /* Find optimal parameters. */ while (curr_pos < n_elements && curr_el != last_el) { next_el = f[o[next_pos]]; while (next_pos < n_elements && next_el == curr_el) { target = rb_ary_shift(r_target_vecs); n_r_elements--; sub_sum_vec(r_sum_vec, target); rb_ary_push(l_target_vecs, target); n_l_elements++; add_sum_vec(l_sum_vec, target); next_pos++; next_el = f[o[next_pos]]; } /* Calculate gain of new split. */ l_impurity = calc_impurity_reg(criterion, l_target_vecs, l_sum_vec); r_impurity = calc_impurity_reg(criterion, r_target_vecs, r_sum_vec); gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements; /* Update optimal parameters. */ if (gain > params[3]) { params[0] = l_impurity; params[1] = r_impurity; params[2] = 0.5 * (curr_el + next_el); params[3] = gain; } if (next_pos == n_elements) break; curr_pos = next_pos; curr_el = f[o[curr_pos]]; } xfree(l_sum_vec); xfree(r_sum_vec); } /** * @!visibility private * Find for split point with maximum information gain. * * @overload find_split_params(criterion, impurity, order, features, targets) -> Array * * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'. * @param impurity [Float] The impurity of whole dataset. * @param order [Numo::Int32] (shape: [n_samples]) The element indices sorted according to feature values in ascending order. * @param features [Numo::DFloat] (shape: [n_samples]) The feature values. * @param targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values. * @return [Array] The array consists of optimal parameters including impurities of child nodes, threshold, and gain. */ static VALUE find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE targets) { ndfunc_arg_in_t ain[3] = {{numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 2}}; size_t out_shape[1] = {4}; ndfunc_arg_out_t aout[1] = {{numo_cDFloat, 1, out_shape}}; ndfunc_t ndf = {(na_iter_func_t)iter_find_split_params_reg, NO_LOOP, 3, 1, ain, aout}; split_opts_reg opts = {StringValuePtr(criterion), NUM2DBL(impurity)}; VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, targets); VALUE results = rb_ary_new2(4); double* params_ptr = (double*)na_get_pointer_for_read(params); rb_ary_store(results, 0, DBL2NUM(params_ptr[0])); rb_ary_store(results, 1, DBL2NUM(params_ptr[1])); rb_ary_store(results, 2, DBL2NUM(params_ptr[2])); rb_ary_store(results, 3, DBL2NUM(params_ptr[3])); RB_GC_GUARD(params); RB_GC_GUARD(criterion); return results; } /** * @!visibility private */ static void iter_find_split_params_grad_reg(na_loop_t const* lp) { const int32_t* o = (int32_t*)NDL_PTR(lp, 0); const double* f = (double*)NDL_PTR(lp, 1); const double* g = (double*)NDL_PTR(lp, 2); const double* h = (double*)NDL_PTR(lp, 3); const double s_grad = ((double*)lp->opt_ptr)[0]; const double s_hess = ((double*)lp->opt_ptr)[1]; const double reg_lambda = ((double*)lp->opt_ptr)[2]; const long n_elements = NDL_SHAPE(lp, 0)[0]; double* params = (double*)NDL_PTR(lp, 4); long curr_pos = 0; long next_pos = 0; double curr_el = f[o[0]]; double last_el = f[o[n_elements - 1]]; double next_el; double l_grad = 0.0; double l_hess = 0.0; double r_grad; double r_hess; double threshold = curr_el; double gain_max = 0.0; double gain; /* Find optimal parameters. */ while (curr_pos < n_elements && curr_el != last_el) { next_el = f[o[next_pos]]; while (next_pos < n_elements && next_el == curr_el) { l_grad += g[o[next_pos]]; l_hess += h[o[next_pos]]; next_pos++; next_el = f[o[next_pos]]; } /* Calculate gain of new split. */ r_grad = s_grad - l_grad; r_hess = s_hess - l_hess; gain = (l_grad * l_grad) / (l_hess + reg_lambda) + (r_grad * r_grad) / (r_hess + reg_lambda) - (s_grad * s_grad) / (s_hess + reg_lambda); /* Update optimal parameters. */ if (gain > gain_max) { threshold = 0.5 * (curr_el + next_el); gain_max = gain; } if (next_pos == n_elements) { break; } curr_pos = next_pos; curr_el = f[o[curr_pos]]; } params[0] = threshold; params[1] = gain_max; } /** * @!visibility private * Find for split point with maximum information gain. * * @overload find_split_params(order, features, gradients, hessians, sum_gradient, sum_hessian, reg_lambda) -> Array * @param order [Numo::Int32] (shape: [n_elements]) The element indices sorted according to feature values. * @param features [Numo::DFloat] (shape: [n_elements]) The feature values. * @param gradients [Numo::DFloat] (shape: [n_elements]) The gradient values. * @param hessians [Numo::DFloat] (shape: [n_elements]) The hessian values. * @param sum_gradient [Float] The sum of gradient values. * @param sum_hessian [Float] The sum of hessian values. * @param reg_lambda [Float] The L2 regularization term on weight. * @return [Array] The array consists of optimal parameters including threshold and gain. */ static VALUE find_split_params_grad_reg(VALUE self, VALUE order, VALUE features, VALUE gradients, VALUE hessians, VALUE sum_gradient, VALUE sum_hessian, VALUE reg_lambda) { ndfunc_arg_in_t ain[4] = {{numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 1}}; size_t out_shape[1] = {2}; ndfunc_arg_out_t aout[1] = {{numo_cDFloat, 1, out_shape}}; ndfunc_t ndf = {(na_iter_func_t)iter_find_split_params_grad_reg, NO_LOOP, 4, 1, ain, aout}; double opts[3] = {NUM2DBL(sum_gradient), NUM2DBL(sum_hessian), NUM2DBL(reg_lambda)}; VALUE params = na_ndloop3(&ndf, opts, 4, order, features, gradients, hessians); VALUE results = rb_ary_new2(2); double* params_ptr = (double*)na_get_pointer_for_read(params); rb_ary_store(results, 0, DBL2NUM(params_ptr[0])); rb_ary_store(results, 1, DBL2NUM(params_ptr[1])); RB_GC_GUARD(params); return results; } /** * @!visibility private */ typedef struct { char* criterion; long n_classes; } node_impurity_cls_opts; /** * @!visibility private */ static void iter_node_impurity_cls(na_loop_t const* lp) { const int32_t* y = (int32_t*)NDL_PTR(lp, 0); const char* criterion = ((node_impurity_cls_opts*)lp->opt_ptr)->criterion; const long n_classes = ((node_impurity_cls_opts*)lp->opt_ptr)->n_classes; const long n_elements = NDL_SHAPE(lp, 0)[0]; double* ret = (double*)NDL_PTR(lp, 1); double* histogram = alloc_dbl_array(n_classes); for (long i = 0; i < n_elements; i++) histogram[y[i]] += 1; *ret = calc_impurity_cls(criterion, histogram, n_elements, n_classes); xfree(histogram); } /** * @!visibility private * Calculate impurity based on criterion. * * @overload node_impurity(criterion, y, n_classes) -> Float * * @param criterion [String] The function to calculate impurity. Supported criteria are 'gini' and 'entropy'. * @param y [Numo::Int32] (shape: [n_samples]) The labels. * @param n_classes [Integer] The number of classes. * @return [Float] impurity */ static VALUE node_impurity_cls(VALUE self, VALUE criterion, VALUE y, VALUE n_classes) { ndfunc_arg_in_t ain[1] = {{numo_cInt32, 1}}; ndfunc_arg_out_t aout[1] = {{numo_cDFloat, 0}}; ndfunc_t ndf = {(na_iter_func_t)iter_node_impurity_cls, NDF_EXTRACT, 1, 1, ain, aout}; node_impurity_cls_opts opts = {StringValuePtr(criterion), NUM2LONG(n_classes)}; VALUE ret = na_ndloop3(&ndf, &opts, 1, y); RB_GC_GUARD(criterion); return ret; } /** * @!visibility private */ static void iter_check_same_label(na_loop_t const* lp) { const int32_t* y = (int32_t*)NDL_PTR(lp, 0); const long n_elements = NDL_SHAPE(lp, 0)[0]; int32_t* ret = (int32_t*)NDL_PTR(lp, 1); *ret = 1; if (n_elements > 0) { int32_t label = y[0]; for (long i = 0; i < n_elements; i++) { if (y[i] != label) { *ret = 0; break; } } } } /** * @!visibility private * Check all elements have the save value. * * @overload check_same_label(y) -> Boolean * * @param y [Numo::Int32] (shape: [n_samples]) The labels. * @return [Boolean] */ static VALUE check_same_label(VALUE self, VALUE y) { ndfunc_arg_in_t ain[1] = {{numo_cInt32, 1}}; ndfunc_arg_out_t aout[1] = {{numo_cInt32, 0}}; ndfunc_t ndf = {(na_iter_func_t)iter_check_same_label, NDF_EXTRACT, 1, 1, ain, aout}; VALUE ret = na_ndloop(&ndf, 1, y); return (NUM2INT(ret) == 1 ? Qtrue : Qfalse); } /** * @!visibility private * Calculate impurity based on criterion. * * @overload node_impurity(criterion, y) -> Float * * @param criterion [String] The function to calculate impurity. Supported criteria are 'mae' and 'mse'. * @param y [Array] (shape: [n_samples, n_outputs]) The taget values. * @return [Float] impurity */ static VALUE node_impurity_reg(VALUE self, VALUE criterion, VALUE y) { const long n_elements = RARRAY_LEN(y); const long n_outputs = RARRAY_LEN(rb_ary_entry(y, 0)); double* sum_vec = alloc_dbl_array(n_outputs); VALUE target_vecs = rb_ary_new(); for (long i = 0; i < n_elements; i++) { VALUE target = rb_ary_entry(y, i); add_sum_vec(sum_vec, target); rb_ary_push(target_vecs, target); } VALUE ret = DBL2NUM(calc_impurity_reg(StringValuePtr(criterion), target_vecs, sum_vec)); xfree(sum_vec); RB_GC_GUARD(criterion); return ret; } void Init_ext(void) { VALUE rb_mRumale = rb_define_module("Rumale"); VALUE rb_mTree = rb_define_module_under(rb_mRumale, "Tree"); /** * Document-module: Rumale::Tree::ExtDecisionTreeClassifier * @!visibility private * The mixin module consisting of extension method for DecisionTreeClassifier class. * This module is used internally. */ VALUE rb_mExtDTreeCls = rb_define_module_under(rb_mTree, "ExtDecisionTreeClassifier"); /** * Document-module: Rumale::Tree::ExtDecisionTreeRegressor * @!visibility private * The mixin module consisting of extension method for DecisionTreeRegressor class. * This module is used internally. */ VALUE rb_mExtDTreeReg = rb_define_module_under(rb_mTree, "ExtDecisionTreeRegressor"); /** * Document-module: Rumale::Tree::ExtGradientTreeRegressor * @!visibility private * The mixin module consisting of extension method for GradientTreeRegressor class. * This module is used internally. */ VALUE rb_mExtGTreeReg = rb_define_module_under(rb_mTree, "ExtGradientTreeRegressor"); rb_define_private_method(rb_mExtDTreeCls, "find_split_params", find_split_params_cls, 6); rb_define_private_method(rb_mExtDTreeReg, "find_split_params", find_split_params_reg, 5); rb_define_private_method(rb_mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 7); rb_define_private_method(rb_mExtDTreeCls, "node_impurity", node_impurity_cls, 3); rb_define_private_method(rb_mExtDTreeCls, "stop_growing?", check_same_label, 1); rb_define_private_method(rb_mExtDTreeReg, "node_impurity", node_impurity_reg, 2); }