vendor/isotree/src/sql.cpp in isotree-0.2.2 vs vendor/isotree/src/sql.cpp in isotree-0.3.0

- old
+ new

@@ -16,15 +16,33 @@ * "On detecting clustered anomalies using SCiForest." * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010. * [5] https://sourceforge.net/projects/iforest/ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014. -* [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019). -* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019). +* [8] Cortes, David. +* "Distance approximation using Isolation Forests." +* arXiv preprint arXiv:1910.12362 (2019). +* [9] Cortes, David. +* "Imputing missing values with unsupervised random trees." +* arXiv preprint arXiv:1911.06646 (2019). +* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom +* [11] Cortes, David. +* "Revisiting randomized choices in isolation forests." +* arXiv preprint arXiv:2110.13402 (2021). +* [12] Guha, Sudipto, et al. +* "Robust random cut forest based anomaly detection on streams." +* International conference on machine learning. PMLR, 2016. +* [13] Cortes, David. +* "Isolation forests: looking beyond tree depth." +* arXiv preprint arXiv:2111.11639 (2021). +* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou. +* "Isolation kernel and its effect on SVM" +* Proceedings of the 24th ACM SIGKDD +* International Conference on Knowledge Discovery & Data Mining. 2018. * * BSD 2-Clause License -* Copyright (c) 2020, David Cortes +* Copyright (c) 2019-2022, David Cortes * All rights reserved. * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. @@ -91,26 +109,45 @@ std::vector<std::string> tree_conds = generate_sql(model_outputs, model_outputs_ext, numeric_colnames, categ_colnames, categ_levels, false, index1, false, 0, nthreads); - std::string out = std::accumulate(tree_conds.begin(), tree_conds.end(), std::string("SELECT\nPOWER(2.0, -(0.0"), + bool is_density = (model_outputs != NULL && model_outputs->scoring_metric == Density) || + (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == Density); + bool is_bdens = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity) || + (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity); + bool is_bdens2 = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity) || + (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity); + bool is_bratio = (model_outputs != NULL && model_outputs->scoring_metric == BoxedRatio) || + (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedRatio); + is_density = is_density || is_bdens2; + std::string out = std::accumulate(tree_conds.begin(), tree_conds.end(), + is_density? + std::string("SELECT\n(-(0.0") + : + (is_bdens? + std::string("SELECT\n((0.0") + : + (is_bratio? + std::string("SELECT\n((0.0") + : + std::string("SELECT\nPOWER(2.0, -(0.0"))), [&tree_conds, &index1](std::string &a, std::string &b) {return a + std::string(" + \n---BEGIN TREE ") + std::to_string((size_t)std::distance(tree_conds.data(), &b) + (size_t)index1) + std::string("---\n") + b + std::string("\n---END OF TREE ") + std::to_string((size_t)std::distance(tree_conds.data(), &b) + (size_t)index1) + std::string("---\n");}); size_t ntrees = (model_outputs != NULL)? (model_outputs->trees.size()) : (model_outputs_ext->hplanes.size()); - return + return out + std::string(") / ") - + std::to_string((long double)ntrees * ((model_outputs != NULL)? - (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth))) + + std::to_string((double)ntrees * ((model_outputs != NULL)? + (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth))) + std::string(") AS ") + select_as + std::string("\nFROM ") + table_from; } @@ -172,17 +209,17 @@ size_t_for loop_st = 0; size_t_for loop_end = ntrees_use; if (single_tree) { - loop_st = tree_num; + loop_st = tree_num - index1; loop_end = loop_st + 1; } /* determine maximum number of nodes in a tree */ size_t max_nodes = 0; - for (size_t tree = loop_st; tree < loop_end; tree++) + for (size_t tree = loop_st; tree < (size_t)loop_end; tree++) max_nodes = std::max(max_nodes, (model_outputs != NULL)? (model_outputs->trees[tree].size()) : (model_outputs_ext->hplanes[tree].size())); std::vector<std::string> conditions_left(max_nodes); std::vector<std::string> conditions_right(max_nodes); @@ -190,84 +227,115 @@ std::vector<std::vector<std::string>> all_node_rules(ntrees_use); std::vector<std::string> out(ntrees_use); size_t tree_use; + bool threw_exception = false; + std::exception_ptr ex = NULL; + #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \ shared(model_outputs, model_outputs_ext, numeric_colnames, categ_colnames, categ_levels, \ - loop_st, loop_end, index1, single_tree, all_node_rules, out) \ + loop_st, loop_end, index1, single_tree, all_node_rules, out, ex, threw_exception) \ firstprivate(conditions_left, conditions_right) private(tree_use) for (size_t_for tree = loop_st; tree < loop_end; tree++) { - if (model_outputs != NULL) + if (threw_exception) continue; + + try { - for (size_t node = 0; node < model_outputs->trees[tree].size(); node++) - extract_cond_isotree(*model_outputs, model_outputs->trees[tree][node], - conditions_left[node], conditions_right[node], - numeric_colnames, categ_colnames, - categ_levels); - } - - else - { - for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++) - extract_cond_ext_isotree(*model_outputs_ext, model_outputs_ext->hplanes[tree][node], + if (model_outputs != NULL) + { + for (size_t node = 0; node < model_outputs->trees[tree].size(); node++) + extract_cond_isotree(*model_outputs, model_outputs->trees[tree][node], conditions_left[node], conditions_right[node], numeric_colnames, categ_colnames, categ_levels); - } + } - generate_tree_rules( - (model_outputs == NULL)? (NULL) : &(model_outputs->trees[tree]), - (model_outputs_ext == NULL)? (NULL) : &(model_outputs_ext->hplanes[tree]), - output_score, - 0, index1, initial_str, all_node_rules[single_tree? 0 : tree], - conditions_left, conditions_right - ); + else + { + for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++) + extract_cond_ext_isotree(*model_outputs_ext, model_outputs_ext->hplanes[tree][node], + conditions_left[node], conditions_right[node], + numeric_colnames, categ_colnames, + categ_levels); + } - /* Code below doesn't compile with MSVC (stuck with an OMP standard that's >20 years old) */ - // if (single_tree) - // tree = 0; - tree_use = single_tree? (size_t)0 : tree; + generate_tree_rules( + (model_outputs == NULL)? (NULL) : &(model_outputs->trees[tree]), + (model_outputs_ext == NULL)? (NULL) : &(model_outputs_ext->hplanes[tree]), + output_score, + 0, index1, initial_str, all_node_rules[single_tree? 0 : tree], + conditions_left, conditions_right, + model_outputs, model_outputs_ext + ); - if (all_node_rules[tree_use].size() <= 1) - { - for (std::string &rule : all_node_rules[tree_use]) - rule = std::string("WHEN TRUE THEN ") - + std::to_string((model_outputs != NULL)? - (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth)) - + std::string(" "); + /* Code below doesn't compile with MSVC (stuck with an OMP standard that's >20 years old) */ + // if (single_tree) + // tree = 0; + tree_use = single_tree? (size_t)0 : tree; + + if (all_node_rules[tree_use].size() <= 1) + { + for (std::string &rule : all_node_rules[tree_use]) + rule = std::string("WHEN TRUE THEN ") + + std::to_string((model_outputs != NULL)? + (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth)) + + std::string(" "); + } + + out[tree_use] = std::accumulate(all_node_rules[tree_use].begin(), all_node_rules[tree_use].end(), + std::string("CASE\n"), + [&all_node_rules, &tree_use, &index1](std::string &a, std::string &b) + {return a + + std::string("---begin terminal node ") + + std::to_string((size_t)std::distance(&(all_node_rules[tree_use][0]), &b) + (size_t)index1) + + std::string("---\n") + + b;}) + + std::string("END\n"); + all_node_rules[tree_use].clear(); } - out[tree_use] = std::accumulate(all_node_rules[tree_use].begin(), all_node_rules[tree_use].end(), - std::string("CASE\n"), - [&all_node_rules, &tree_use, &index1](std::string &a, std::string &b) - {return a - + std::string("---begin terminal node ") - + std::to_string((size_t)std::distance(&(all_node_rules[tree_use][0]), &b) + (size_t)index1) - + std::string("---\n") - + b;}) - + std::string("END\n"); - all_node_rules[tree_use].clear(); + catch (...) + { + #pragma omp critical + { + if (!threw_exception) + { + threw_exception = true; + ex = std::current_exception(); + } + } + } } + if (threw_exception) + std::rethrow_exception(ex); + return out; } void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hplanes, bool output_score, size_t curr_ix, bool index1, std::string &prev_cond, std::vector<std::string> &node_rules, - std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right) + std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right, + const IsoForest *model_outputs, const ExtIsoForest *model_outputs_ext) { - if ((trees != NULL && (*trees)[curr_ix].score >= 0) || - (hplanes != NULL && (*hplanes)[curr_ix].score >= 0)) + // if ((trees != NULL && (*trees)[curr_ix].score >= 0) || + // (hplanes != NULL && (*hplanes)[curr_ix].score >= 0)) + if ((trees != NULL && (*trees)[curr_ix].tree_left == 0) || + (hplanes != NULL && (*hplanes)[curr_ix].hplane_left == 0)) { node_rules.push_back(prev_cond + std::string("\tTHEN ") + (output_score? (std::to_string((trees != NULL)? - ((*trees)[curr_ix].score) : ((*hplanes)[curr_ix].score))) + ((model_outputs->scoring_metric != Density && model_outputs->scoring_metric != BoxedRatio)? + (*trees)[curr_ix].score : (-(*trees)[curr_ix].score)) + : + ((model_outputs_ext->scoring_metric != Density && model_outputs_ext->scoring_metric != BoxedRatio)? + (*hplanes)[curr_ix].score : (-(*hplanes)[curr_ix].score)))) : (std::to_string(node_rules.size() + (size_t)index1))) + std::string("\n---end of terminal node ") + std::to_string(node_rules.size() + (size_t)index1) + std::string("---\n")); @@ -281,32 +349,33 @@ + std::string(")\n"); generate_tree_rules(trees, hplanes, output_score, (trees != NULL)? ((*trees)[curr_ix].tree_left) : ((*hplanes)[curr_ix].hplane_left), index1, cond_left, node_rules, - conditions_left, conditions_right); + conditions_left, conditions_right, model_outputs, model_outputs_ext); cond_left.clear(); std::string cond_right = prev_cond + ((curr_ix > 0)? std::string("\t\tAND (") : std::string("\t\t (")) + conditions_right[curr_ix] + std::string(")\n"); generate_tree_rules(trees, hplanes, output_score, (trees != NULL)? ((*trees)[curr_ix].tree_right) : ((*hplanes)[curr_ix].hplane_right), index1, cond_right, node_rules, - conditions_left, conditions_right); + conditions_left, conditions_right, model_outputs, model_outputs_ext); } void extract_cond_isotree(IsoForest &model, IsoTree &tree, std::string &cond_left, std::string &cond_right, std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames, std::vector<std::vector<std::string>> &categ_levels) { cond_left = std::string(""); cond_right = std::string(""); - if (tree.score >= 0.) + // if (tree.score >= 0.) + if (tree.tree_left == 0) return; switch(tree.col_type) { case Numeric: @@ -455,21 +524,28 @@ break; } } break; } + + default: + { + unexpected_error(); + break; + } } } void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane, std::string &cond_left, std::string &cond_right, std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames, std::vector<std::vector<std::string>> &categ_levels) { cond_left = std::string(""); cond_right = std::string(""); - if (hplane.score >= 0.) + // if (hplane.score >= 0.) + if (hplane.hplane_left == 0) return; std::string hplane_conds = std::string(""); size_t n_visited_numeric = 0; @@ -531,9 +607,15 @@ hplane_conds += std::string(" END"); break; } } n_visited_categ++; + break; + } + + default: + { + unexpected_error(); break; } } hplane_conds += ((model.missing_action == Impute)? (std::string(", ") + std::to_string(hplane.fill_val[ix]) + std::string(")")) : (std::string("")));