// isotree #include // rice #include using Rice::Array; using Rice::Hash; using Rice::Module; using Rice::Object; using Rice::String; using Rice::Symbol; using Rice::define_class_under; using Rice::define_module; namespace Rice::detail { template<> class From_Ruby { public: NewCategAction convert(VALUE x) { auto value = Object(x).to_s().str(); if (value == "weighted") return Weighted; if (value == "smallest") return Smallest; if (value == "random") return Random; throw std::runtime_error("Unknown new categ action: " + value); } }; template<> class From_Ruby { public: MissingAction convert(VALUE x) { auto value = Object(x).to_s().str(); if (value == "divide") return Divide; if (value == "impute") return Impute; if (value == "fail") return Fail; throw std::runtime_error("Unknown missing action: " + value); } }; template<> class From_Ruby { public: CategSplit convert(VALUE x) { auto value = Object(x).to_s().str(); if (value == "subset") return SubSet; if (value == "single_categ") return SingleCateg; throw std::runtime_error("Unknown categ split: " + value); } }; template<> class From_Ruby { public: CoefType convert(VALUE x) { auto value = Object(x).to_s().str(); if (value == "uniform") return Uniform; if (value == "normal") return Normal; throw std::runtime_error("Unknown coef type: " + value); } }; template<> class From_Ruby { public: UseDepthImp convert(VALUE x) { auto value = Object(x).to_s().str(); if (value == "lower") return Lower; if (value == "higher") return Higher; if (value == "same") return Same; throw std::runtime_error("Unknown depth imp: " + value); } }; template<> class From_Ruby { public: WeighImpRows convert(VALUE x) { auto value = Object(x).to_s().str(); if (value == "inverse") return Inverse; if (value == "prop") return Prop; if (value == "flat") return Flat; throw std::runtime_error("Unknown weight imp rows: " + value); } }; } extern "C" void Init_ext() { Module rb_mIsoTree = define_module("IsoTree"); Module rb_mExt = define_module_under(rb_mIsoTree, "Ext"); define_class_under(rb_mExt, "ExtIsoForest"); rb_mExt .define_singleton_function( "fit_iforest", [](Hash options) { // model ExtIsoForest iso; // data size_t nrows = options.get("nrows"); size_t ncols_numeric = options.get("ncols_numeric"); size_t ncols_categ = options.get("ncols_categ"); double *restrict numeric_data = NULL; if (ncols_numeric > 0) { numeric_data = (double*) options.get("numeric_data").c_str(); } int *restrict categorical_data = NULL; int *restrict ncat = NULL; if (ncols_categ > 0) { categorical_data = (int*) options.get("categorical_data").c_str(); ncat = (int*) options.get("ncat").c_str(); } // not used (sparse matrices) double* Xc = NULL; sparse_ix* Xc_ind = NULL; sparse_ix* Xc_indptr = NULL; // options // Rice has limit of 14 arguments, so use hash size_t sample_size = options.get("sample_size"); size_t ndim = options.get("ndim"); size_t ntrees = options.get("ntrees"); size_t ntry = options.get("ntry"); double prob_pick_by_gain_avg = options.get("prob_pick_avg_gain"); double prob_split_by_gain_avg = options.get("prob_split_avg_gain"); double prob_pick_by_gain_pl = options.get("prob_pick_pooled_gain"); double prob_split_by_gain_pl = options.get("prob_split_pooled_gain"); double min_gain = options.get("min_gain"); MissingAction missing_action = options.get("missing_action"); CategSplit cat_split_type = options.get("categ_split_type"); NewCategAction new_cat_action = options.get("new_categ_action"); bool all_perm = options.get("all_perm"); bool coef_by_prop = options.get("coef_by_prop"); bool with_replacement = options.get("sample_with_replacement"); bool penalize_range = options.get("penalize_range"); bool weigh_by_kurt = options.get("weigh_by_kurtosis"); CoefType coef_type = options.get("coefs"); size_t min_imp_obs = options.get("min_imp_obs"); UseDepthImp depth_imp = options.get("depth_imp"); WeighImpRows weigh_imp_rows = options.get("weigh_imp_rows"); uint64_t random_seed = options.get("random_seed"); int nthreads = options.get("nthreads"); // TODO options double* sample_weights = NULL; bool weight_as_sample = false; size_t max_depth = 0; bool limit_depth = true; bool standardize_dist = false; double* tmat = NULL; double* output_depths = NULL; bool standardize_depth = false; double* col_weights = NULL; Imputer *imputer = NULL; bool impute_at_fit = false; bool handle_interrupt = false; fit_iforest( NULL, &iso, numeric_data, ncols_numeric, categorical_data, ncols_categ, ncat, Xc, Xc_ind, Xc_indptr, ndim, ntry, coef_type, coef_by_prop, sample_weights, with_replacement, weight_as_sample, nrows, sample_size, ntrees, max_depth, limit_depth, penalize_range, standardize_dist, tmat, output_depths, standardize_depth, col_weights, weigh_by_kurt, prob_pick_by_gain_avg, prob_split_by_gain_avg, prob_pick_by_gain_pl, prob_split_by_gain_pl, min_gain, missing_action, cat_split_type, new_cat_action, all_perm, imputer, min_imp_obs, depth_imp, weigh_imp_rows, impute_at_fit, random_seed, handle_interrupt, nthreads ); return iso; }) .define_singleton_function( "predict_iforest", [](ExtIsoForest& iso, Hash options) { // data size_t nrows = options.get("nrows"); size_t ncols_numeric = options.get("ncols_numeric"); size_t ncols_categ = options.get("ncols_categ"); double *restrict numeric_data = NULL; if (ncols_numeric > 0) { numeric_data = (double*) options.get("numeric_data").c_str(); } int *restrict categorical_data = NULL; if (ncols_categ > 0) { categorical_data = (int*) options.get("categorical_data").c_str(); } // not used (sparse matrices) double* Xc = NULL; sparse_ix* Xc_ind = NULL; sparse_ix* Xc_indptr = NULL; double* Xr = NULL; sparse_ix* Xr_ind = NULL; sparse_ix* Xr_indptr = NULL; // options int nthreads = options.get("nthreads"); bool standardize = options.get("standardize"); std::vector outlier_scores(nrows); sparse_ix* tree_num = NULL; predict_iforest( numeric_data, categorical_data, Xc, Xc_ind, Xc_indptr, Xr, Xr_ind, Xr_indptr, nrows, nthreads, standardize, NULL, &iso, outlier_scores.data(), tree_num ); Array ret; for (size_t i = 0; i < outlier_scores.size(); i++) { ret.push(outlier_scores[i]); } return ret; }) .define_singleton_function( "serialize_ext_isoforest", [](ExtIsoForest& iso, String path) { #ifdef _MSC_VER // TODO convert to wchar_t throw std::runtime_error("Not supported on Windows yet"); #else serialize_ext_isoforest(iso, path.c_str()); #endif }) .define_singleton_function( "deserialize_ext_isoforest", [](String path) { ExtIsoForest iso; #ifdef _MSC_VER // TODO convert to wchar_t throw std::runtime_error("Not supported on Windows yet"); #else deserialize_ext_isoforest(iso, path.c_str()); #endif return iso; }); }