| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameMLVisitors.h> template<typename T, typename I = unsigned long, std::size_t A = 0> struct AnomalyDetectByZScoreVisitor; // ------------------------------------- template<typename T, typename I = unsigned long, std::size_t A = 0> using and_zscr_v = AnomalyDetectByZScoreVisitor<T, I, A>; |
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This visitor applies the Z-Score method to find outliers. It works with both scalar and multidimensional (i.e. vectors and arrays) datasets. get_result() returns a vector of indices into the original data that were deemed outliers. In the case of a multidimensional input column, the result is a vector of std::pairs: the first integer is the index into the input column and the second integer is the index into the dimension. explicit AnomalyDetectByZScoreVisitor(value_type threshold); threshold: Number of standard deviations beyond which a value is considered an outlier |
T: Column data type I: Index type A: Memory alignment boundary for vectors. Default is system default alignment |
static void test_AnomalyDetectByZScoreVisitor() { std::cout << "\nTesting AnomalyDetectByZScoreVisitor{ } ..." << std::endl; constexpr std::size_t item_cnt = 1024; MyStdDataFrame df; df.load_index(MyStdDataFrame::gen_sequence_index(0, item_cnt, 1)); std::vector<double> sine_col; sine_col.reserve(item_cnt); for (std::size_t i = 0; i < item_cnt; ++i) { sine_col.push_back(std::sin(2.0 * M_PI * i / 20.0)); // Base sine wave if (i % 30 == 0) sine_col.back() += 2.0; // Inject anomalies } df.load_column("sine col", std::move(sine_col)); and_zscr_v<double> anomaly1 { 2.0 }; const std::vector<std::size_t> result1 = { 0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 630, 660, 690, 720, 750, 780, 810, 840, 870, 900, 930, 960, 990, 1020 }; df.single_act_visit<double>("sine col", anomaly1); assert((anomaly1.get_result() == result1)); // Now do the same thing for IBM market data // StrDataFrame ibm; try { ibm.read("IBM.csv", io_format::csv2); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; ::exit(-1); } ibm.get_column<double>("IBM_Close")[502] = 800.0; ibm.get_column<double>("IBM_Close")[1001] = 900.0; ibm.get_column<double>("IBM_Close")[2002] = 850.0; and_zscr_v<double> anomaly2 { 15.0 }; const std::vector<std::size_t> result2 = { 502, 1001, 2002 }; ibm.single_act_visit<double>("IBM_Close", anomaly2); assert((anomaly2.get_result() == result2)); // Now multidimensional data // constexpr std::size_t dim { 3 }; using ary_col_t = std::array<double, dim>; using vec_col_t = std::vector<double>; std::vector<vec_col_t> vec_col { { 0.1, 99.8, -50.1 }, // normal { 0.2, 100.1, -49.9 }, // normal { -0.1, 100.3, -50.2 }, // normal { 0.0, 99.7, -50.0 }, // normal { 0.3, 100.0, -49.8 }, // normal { -0.2, 99.9, -50.3 }, // normal { 0.1, 100.2, -50.1 }, // normal { 0.0, 100.0, -50.0 }, // normal { -0.1, 99.8, -49.9 }, // normal { 0.2, 100.1, -50.2 }, // normal { 0.1, 99.9, -50.1 }, // normal { 0.0, 100.0, -50.0 }, // 
normal { -0.2, 100.2, -49.8 }, // normal { 0.1, 99.7, -50.2 }, // normal { 0.3, 100.1, -50.0 }, // normal { 0.0, 99.9, -49.9 }, // normal { 0.2, 100.0, -50.1 }, // normal { -0.1, 100.1, -50.0 }, // normal { 15.0, 100.2, -50.1 }, // ANOMALY: dim 0 spike { 0.1, 99.8, -120.0 }, // ANOMALY: dim 2 spike }; std::vector<ary_col_t> ary_col { { 0.1, 99.8, -50.1 }, // normal { 0.2, 100.1, -49.9 }, // normal { -0.1, 100.3, -50.2 }, // normal { 0.0, 99.7, -50.0 }, // normal { 0.3, 100.0, -49.8 }, // normal { -0.2, 99.9, -50.3 }, // normal { 0.1, 100.2, -50.1 }, // normal { 0.0, 100.0, -50.0 }, // normal { -0.1, 99.8, -49.9 }, // normal { 0.2, 100.1, -50.2 }, // normal { 0.1, 99.9, -50.1 }, // normal { 0.0, 100.0, -50.0 }, // normal { -0.2, 100.2, -49.8 }, // normal { 0.1, 99.7, -50.2 }, // normal { 0.3, 100.1, -50.0 }, // normal { 0.0, 99.9, -49.9 }, // normal { 0.2, 100.0, -50.1 }, // normal { -0.1, 100.1, -50.0 }, // normal { 15.0, 100.2, -50.1 }, // ANOMALY: dim 0 spike { 0.1, 99.8, -120.0 }, // ANOMALY: dim 2 spike }; df.load_column<vec_col_t>("COL VEC", std::move(vec_col), nan_policy::dont_pad_with_nans); df.load_column<ary_col_t>("COL ARY", std::move(ary_col), nan_policy::dont_pad_with_nans); and_zscr_v<vec_col_t> vec_and_v { 2.5 }; and_zscr_v<ary_col_t> ary_and_v { 2.5 }; df.single_act_visit<vec_col_t>("COL VEC", vec_and_v); df.single_act_visit<ary_col_t>("COL ARY", ary_and_v); assert(vec_and_v.get_result().size() == 2); assert(vec_and_v.get_result()[0].first == 18); assert(vec_and_v.get_result()[0].second == 0); assert(vec_and_v.get_result()[1].first == 19); assert(vec_and_v.get_result()[1].second == 2); assert(ary_and_v.get_result().size() == 2); assert(ary_and_v.get_result()[0].first == 18); assert(ary_and_v.get_result()[0].second == 0); assert(ary_and_v.get_result()[1].first == 19); assert(ary_and_v.get_result()[1].second == 2); }