vendor/isotree/src/sql.cpp in isotree-0.2.2 vs vendor/isotree/src/sql.cpp in isotree-0.3.0
- old
+ new
@@ -16,15 +16,33 @@
* "On detecting clustered anomalies using SCiForest."
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
* [5] https://sourceforge.net/projects/iforest/
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
-* [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
-* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
+* [8] Cortes, David.
+* "Distance approximation using Isolation Forests."
+* arXiv preprint arXiv:1910.12362 (2019).
+* [9] Cortes, David.
+* "Imputing missing values with unsupervised random trees."
+* arXiv preprint arXiv:1911.06646 (2019).
+* [10] https://math.stackexchange.com/questions/3333220/expected-average-depth-in-random-binary-tree-constructed-top-to-bottom
+* [11] Cortes, David.
+* "Revisiting randomized choices in isolation forests."
+* arXiv preprint arXiv:2110.13402 (2021).
+* [12] Guha, Sudipto, et al.
+* "Robust random cut forest based anomaly detection on streams."
+* International conference on machine learning. PMLR, 2016.
+* [13] Cortes, David.
+* "Isolation forests: looking beyond tree depth."
+* arXiv preprint arXiv:2111.11639 (2021).
+* [14] Ting, Kai Ming, Yue Zhu, and Zhi-Hua Zhou.
+* "Isolation kernel and its effect on SVM"
+* Proceedings of the 24th ACM SIGKDD
+* International Conference on Knowledge Discovery & Data Mining. 2018.
*
* BSD 2-Clause License
-* Copyright (c) 2020, David Cortes
+* Copyright (c) 2019-2022, David Cortes
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
@@ -91,26 +109,45 @@
std::vector<std::string> tree_conds = generate_sql(model_outputs, model_outputs_ext,
numeric_colnames, categ_colnames,
categ_levels,
false, index1, false, 0,
nthreads);
- std::string out = std::accumulate(tree_conds.begin(), tree_conds.end(), std::string("SELECT\nPOWER(2.0, -(0.0"),
+ bool is_density = (model_outputs != NULL && model_outputs->scoring_metric == Density) ||
+ (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == Density);
+ bool is_bdens = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity) ||
+ (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity);
+ bool is_bdens2 = (model_outputs != NULL && model_outputs->scoring_metric == BoxedDensity) ||
+ (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedDensity);
+ bool is_bratio = (model_outputs != NULL && model_outputs->scoring_metric == BoxedRatio) ||
+ (model_outputs_ext != NULL && model_outputs_ext->scoring_metric == BoxedRatio);
+ is_density = is_density || is_bdens2;
+ std::string out = std::accumulate(tree_conds.begin(), tree_conds.end(),
+ is_density?
+ std::string("SELECT\n(-(0.0")
+ :
+ (is_bdens?
+ std::string("SELECT\n((0.0")
+ :
+ (is_bratio?
+ std::string("SELECT\n((0.0")
+ :
+ std::string("SELECT\nPOWER(2.0, -(0.0"))),
[&tree_conds, &index1](std::string &a, std::string &b)
{return a
+ std::string(" + \n---BEGIN TREE ")
+ std::to_string((size_t)std::distance(tree_conds.data(), &b) + (size_t)index1)
+ std::string("---\n")
+ b
+ std::string("\n---END OF TREE ")
+ std::to_string((size_t)std::distance(tree_conds.data(), &b) + (size_t)index1)
+ std::string("---\n");});
size_t ntrees = (model_outputs != NULL)? (model_outputs->trees.size()) : (model_outputs_ext->hplanes.size());
- return
+ return
out
+ std::string(") / ")
- + std::to_string((long double)ntrees * ((model_outputs != NULL)?
- (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth)))
+ + std::to_string((double)ntrees * ((model_outputs != NULL)?
+ (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth)))
+ std::string(") AS ")
+ select_as
+ std::string("\nFROM ")
+ table_from;
}
@@ -172,17 +209,17 @@
size_t_for loop_st = 0;
size_t_for loop_end = ntrees_use;
if (single_tree)
{
- loop_st = tree_num;
+ loop_st = tree_num - index1;
loop_end = loop_st + 1;
}
/* determine maximum number of nodes in a tree */
size_t max_nodes = 0;
- for (size_t tree = loop_st; tree < loop_end; tree++)
+ for (size_t tree = loop_st; tree < (size_t)loop_end; tree++)
max_nodes = std::max(max_nodes,
(model_outputs != NULL)?
(model_outputs->trees[tree].size()) : (model_outputs_ext->hplanes[tree].size()));
std::vector<std::string> conditions_left(max_nodes);
std::vector<std::string> conditions_right(max_nodes);
@@ -190,84 +227,115 @@
std::vector<std::vector<std::string>> all_node_rules(ntrees_use);
std::vector<std::string> out(ntrees_use);
size_t tree_use;
+ bool threw_exception = false;
+ std::exception_ptr ex = NULL;
+
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
shared(model_outputs, model_outputs_ext, numeric_colnames, categ_colnames, categ_levels, \
- loop_st, loop_end, index1, single_tree, all_node_rules, out) \
+ loop_st, loop_end, index1, single_tree, all_node_rules, out, ex, threw_exception) \
firstprivate(conditions_left, conditions_right) private(tree_use)
for (size_t_for tree = loop_st; tree < loop_end; tree++)
{
- if (model_outputs != NULL)
+ if (threw_exception) continue;
+
+ try
{
- for (size_t node = 0; node < model_outputs->trees[tree].size(); node++)
- extract_cond_isotree(*model_outputs, model_outputs->trees[tree][node],
- conditions_left[node], conditions_right[node],
- numeric_colnames, categ_colnames,
- categ_levels);
- }
-
- else
- {
- for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++)
- extract_cond_ext_isotree(*model_outputs_ext, model_outputs_ext->hplanes[tree][node],
+ if (model_outputs != NULL)
+ {
+ for (size_t node = 0; node < model_outputs->trees[tree].size(); node++)
+ extract_cond_isotree(*model_outputs, model_outputs->trees[tree][node],
conditions_left[node], conditions_right[node],
numeric_colnames, categ_colnames,
categ_levels);
- }
+ }
- generate_tree_rules(
- (model_outputs == NULL)? (NULL) : &(model_outputs->trees[tree]),
- (model_outputs_ext == NULL)? (NULL) : &(model_outputs_ext->hplanes[tree]),
- output_score,
- 0, index1, initial_str, all_node_rules[single_tree? 0 : tree],
- conditions_left, conditions_right
- );
+ else
+ {
+ for (size_t node = 0; node < model_outputs_ext->hplanes[tree].size(); node++)
+ extract_cond_ext_isotree(*model_outputs_ext, model_outputs_ext->hplanes[tree][node],
+ conditions_left[node], conditions_right[node],
+ numeric_colnames, categ_colnames,
+ categ_levels);
+ }
- /* Code below doesn't compile with MSVC (stuck with an OMP standard that's >20 years old) */
- // if (single_tree)
- // tree = 0;
- tree_use = single_tree? (size_t)0 : tree;
+ generate_tree_rules(
+ (model_outputs == NULL)? (NULL) : &(model_outputs->trees[tree]),
+ (model_outputs_ext == NULL)? (NULL) : &(model_outputs_ext->hplanes[tree]),
+ output_score,
+ 0, index1, initial_str, all_node_rules[single_tree? 0 : tree],
+ conditions_left, conditions_right,
+ model_outputs, model_outputs_ext
+ );
- if (all_node_rules[tree_use].size() <= 1)
- {
- for (std::string &rule : all_node_rules[tree_use])
- rule = std::string("WHEN TRUE THEN ")
- + std::to_string((model_outputs != NULL)?
- (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth))
- + std::string(" ");
+ /* Code below doesn't compile with MSVC (stuck with an OMP standard that's >20 years old) */
+ // if (single_tree)
+ // tree = 0;
+ tree_use = single_tree? (size_t)0 : tree;
+
+ if (all_node_rules[tree_use].size() <= 1)
+ {
+ for (std::string &rule : all_node_rules[tree_use])
+ rule = std::string("WHEN TRUE THEN ")
+ + std::to_string((model_outputs != NULL)?
+ (model_outputs->exp_avg_depth) : (model_outputs_ext->exp_avg_depth))
+ + std::string(" ");
+ }
+
+ out[tree_use] = std::accumulate(all_node_rules[tree_use].begin(), all_node_rules[tree_use].end(),
+ std::string("CASE\n"),
+ [&all_node_rules, &tree_use, &index1](std::string &a, std::string &b)
+ {return a
+ + std::string("---begin terminal node ")
+ + std::to_string((size_t)std::distance(&(all_node_rules[tree_use][0]), &b) + (size_t)index1)
+ + std::string("---\n")
+ + b;})
+ + std::string("END\n");
+ all_node_rules[tree_use].clear();
}
- out[tree_use] = std::accumulate(all_node_rules[tree_use].begin(), all_node_rules[tree_use].end(),
- std::string("CASE\n"),
- [&all_node_rules, &tree_use, &index1](std::string &a, std::string &b)
- {return a
- + std::string("---begin terminal node ")
- + std::to_string((size_t)std::distance(&(all_node_rules[tree_use][0]), &b) + (size_t)index1)
- + std::string("---\n")
- + b;})
- + std::string("END\n");
- all_node_rules[tree_use].clear();
+ catch (...)
+ {
+ #pragma omp critical
+ {
+ if (!threw_exception)
+ {
+ threw_exception = true;
+ ex = std::current_exception();
+ }
+ }
+ }
}
+ if (threw_exception)
+ std::rethrow_exception(ex);
+
return out;
}
void generate_tree_rules(std::vector<IsoTree> *trees, std::vector<IsoHPlane> *hplanes, bool output_score,
size_t curr_ix, bool index1, std::string &prev_cond, std::vector<std::string> &node_rules,
- std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right)
+ std::vector<std::string> &conditions_left, std::vector<std::string> &conditions_right,
+ const IsoForest *model_outputs, const ExtIsoForest *model_outputs_ext)
{
- if ((trees != NULL && (*trees)[curr_ix].score >= 0) ||
- (hplanes != NULL && (*hplanes)[curr_ix].score >= 0))
+ // if ((trees != NULL && (*trees)[curr_ix].score >= 0) ||
+ // (hplanes != NULL && (*hplanes)[curr_ix].score >= 0))
+ if ((trees != NULL && (*trees)[curr_ix].tree_left == 0) ||
+ (hplanes != NULL && (*hplanes)[curr_ix].hplane_left == 0))
{
node_rules.push_back(prev_cond
+ std::string("\tTHEN ")
+ (output_score?
(std::to_string((trees != NULL)?
- ((*trees)[curr_ix].score) : ((*hplanes)[curr_ix].score)))
+ ((model_outputs->scoring_metric != Density && model_outputs->scoring_metric != BoxedRatio)?
+ (*trees)[curr_ix].score : (-(*trees)[curr_ix].score))
+ :
+ ((model_outputs_ext->scoring_metric != Density && model_outputs_ext->scoring_metric != BoxedRatio)?
+ (*hplanes)[curr_ix].score : (-(*hplanes)[curr_ix].score))))
:
(std::to_string(node_rules.size() + (size_t)index1)))
+ std::string("\n---end of terminal node ")
+ std::to_string(node_rules.size() + (size_t)index1)
+ std::string("---\n"));
@@ -281,32 +349,33 @@
+ std::string(")\n");
generate_tree_rules(trees, hplanes, output_score,
(trees != NULL)?
((*trees)[curr_ix].tree_left) : ((*hplanes)[curr_ix].hplane_left),
index1, cond_left, node_rules,
- conditions_left, conditions_right);
+ conditions_left, conditions_right, model_outputs, model_outputs_ext);
cond_left.clear();
std::string cond_right = prev_cond
+ ((curr_ix > 0)? std::string("\t\tAND (") : std::string("\t\t ("))
+ conditions_right[curr_ix]
+ std::string(")\n");
generate_tree_rules(trees, hplanes, output_score,
(trees != NULL)?
((*trees)[curr_ix].tree_right) : ((*hplanes)[curr_ix].hplane_right),
index1, cond_right, node_rules,
- conditions_left, conditions_right);
+ conditions_left, conditions_right, model_outputs, model_outputs_ext);
}
void extract_cond_isotree(IsoForest &model, IsoTree &tree,
std::string &cond_left, std::string &cond_right,
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
std::vector<std::vector<std::string>> &categ_levels)
{
cond_left = std::string("");
cond_right = std::string("");
- if (tree.score >= 0.)
+ // if (tree.score >= 0.)
+ if (tree.tree_left == 0)
return;
switch(tree.col_type)
{
case Numeric:
@@ -455,21 +524,28 @@
break;
}
}
break;
}
+
+ default:
+ {
+ unexpected_error();
+ break;
+ }
}
}
void extract_cond_ext_isotree(ExtIsoForest &model, IsoHPlane &hplane,
std::string &cond_left, std::string &cond_right,
std::vector<std::string> &numeric_colnames, std::vector<std::string> &categ_colnames,
std::vector<std::vector<std::string>> &categ_levels)
{
cond_left = std::string("");
cond_right = std::string("");
- if (hplane.score >= 0.)
+ // if (hplane.score >= 0.)
+ if (hplane.hplane_left == 0)
return;
std::string hplane_conds = std::string("");
size_t n_visited_numeric = 0;
@@ -531,9 +607,15 @@
hplane_conds += std::string(" END");
break;
}
}
n_visited_categ++;
+ break;
+ }
+
+ default:
+ {
+ unexpected_error();
break;
}
}
hplane_conds += ((model.missing_action == Impute)?
(std::string(", ") + std::to_string(hplane.fill_val[ix]) + std::string(")")) : (std::string("")));