// isotree #include // rice #include #include #include #include #include using Rice::Array; using Rice::Hash; using Rice::Module; using Rice::String; using Rice::Symbol; using Rice::define_class_under; using Rice::define_module; extern "C" void Init_ext() { Module rb_mIsoTree = define_module("IsoTree"); Module rb_mExt = define_module_under(rb_mIsoTree, "Ext"); define_class_under(rb_mExt, "ExtIsoForest"); rb_mExt .define_singleton_method( "fit_iforest", *[](Hash options) { // model ExtIsoForest iso; // data size_t nrows = options.get("nrows"); size_t ncols_numeric = options.get("ncols_numeric"); size_t ncols_categ = options.get("ncols_categ"); double *restrict numeric_data = NULL; if (ncols_numeric > 0) { numeric_data = (double*) options.get("numeric_data").c_str(); } int *restrict categorical_data = NULL; int *restrict ncat = NULL; if (ncols_categ > 0) { categorical_data = (int*) options.get("categorical_data").c_str(); ncat = (int*) options.get("ncat").c_str(); } // not used (sparse matrices) double* Xc = NULL; sparse_ix* Xc_ind = NULL; sparse_ix* Xc_indptr = NULL; // options CoefType coef_type = Normal; double* sample_weights = NULL; bool weight_as_sample = false; size_t max_depth = 0; bool limit_depth = true; bool standardize_dist = false; double* tmat = NULL; double* output_depths = NULL; bool standardize_depth = false; double* col_weights = NULL; MissingAction missing_action = Impute; CategSplit cat_split_type = SubSet; NewCategAction new_cat_action = Smallest; Imputer *imputer = NULL; UseDepthImp depth_imp = Higher; WeighImpRows weigh_imp_rows = Inverse; bool impute_at_fit = false; // Rice has limit of 14 arguments, so use hash for options size_t sample_size = options.get("sample_size"); size_t ndim = options.get("ndim"); size_t ntrees = options.get("ntrees"); size_t ntry = options.get("ntry"); double prob_pick_by_gain_avg = options.get("prob_pick_avg_gain"); double prob_split_by_gain_avg = options.get("prob_split_avg_gain"); double prob_pick_by_gain_pl = options.get("prob_pick_pooled_gain"); double prob_split_by_gain_pl = options.get("prob_split_pooled_gain"); double min_gain = options.get("min_gain"); bool all_perm = options.get("all_perm"); bool coef_by_prop = options.get("coef_by_prop"); bool with_replacement = options.get("sample_with_replacement"); bool penalize_range = options.get("penalize_range"); bool weigh_by_kurt = options.get("weigh_by_kurtosis"); size_t min_imp_obs = options.get("min_imp_obs"); uint64_t random_seed = options.get("random_seed"); int nthreads = options.get("nthreads"); fit_iforest( NULL, &iso, numeric_data, ncols_numeric, categorical_data, ncols_categ, ncat, Xc, Xc_ind, Xc_indptr, ndim, ntry, coef_type, coef_by_prop, sample_weights, with_replacement, weight_as_sample, nrows, sample_size, ntrees, max_depth, limit_depth, penalize_range, standardize_dist, tmat, output_depths, standardize_depth, col_weights, weigh_by_kurt, prob_pick_by_gain_avg, prob_split_by_gain_avg, prob_pick_by_gain_pl, prob_split_by_gain_pl, min_gain, missing_action, cat_split_type, new_cat_action, all_perm, imputer, min_imp_obs, depth_imp, weigh_imp_rows, impute_at_fit, random_seed, nthreads ); return iso; }) .define_singleton_method( "predict_iforest", *[](ExtIsoForest& iso, Hash options) { // data size_t nrows = options.get("nrows"); size_t ncols_numeric = options.get("ncols_numeric"); size_t ncols_categ = options.get("ncols_categ"); double *restrict numeric_data = NULL; if (ncols_numeric > 0) { numeric_data = (double*) options.get("numeric_data").c_str(); } int *restrict categorical_data = NULL; if (ncols_categ > 0) { categorical_data = (int*) options.get("categorical_data").c_str(); } // not used (sparse matrices) double* Xc = NULL; sparse_ix* Xc_ind = NULL; sparse_ix* Xc_indptr = NULL; double* Xr = NULL; sparse_ix* Xr_ind = NULL; sparse_ix* Xr_indptr = NULL; // options int nthreads = options.get("nthreads"); bool standardize = options.get("standardize"); std::vector outlier_scores(nrows); sparse_ix* tree_num = NULL; predict_iforest( numeric_data, categorical_data, Xc, Xc_ind, Xc_indptr, Xr, Xr_ind, Xr_indptr, nrows, nthreads, standardize, NULL, &iso, outlier_scores.data(), tree_num ); Array ret; for (size_t i = 0; i < outlier_scores.size(); i++) { ret.push(outlier_scores[i]); } return ret; }); }