Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

// Visitor that applies the Z-Score method to find outliers in a column.
// T: column data type, I: index type, A: memory alignment boundary for
// vectors (0 selects the system default alignment).
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct AnomalyDetectByZScoreVisitor;

// -------------------------------------

// Shorthand alias for AnomalyDetectByZScoreVisitor with the same template
// parameters and defaults.
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
using and_zscr_v = AnomalyDetectByZScoreVisitor<T, I, A>;
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This visitor applies the Z-Score method to find outliers.

This works with both scalar and multidimensional (i.e. vectors and arrays) datasets.

get_result() returns a vector of indices into the original data that were deemed outliers. In the case of a multidimensional input column, the result is a vector of std::pairs: the first integer is the index into the input column and the second integer is the index into the dimension.
explicit
AnomalyDetectByZScoreVisitor(value_type threshold);

threshold: Number of standard deviations from the mean beyond which a data point is considered an outlier
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
static void test_AnomalyDetectByZScoreVisitor()  {

    std::cout << "\nTesting AnomalyDetectByZScoreVisitor{ } ..." << std::endl;

    constexpr std::size_t   item_cnt = 1024;
    MyStdDataFrame          df;

    df.load_index(MyStdDataFrame::gen_sequence_index(0, item_cnt, 1));

    std::vector<double>   sine_col;

    sine_col.reserve(item_cnt);
    for (std::size_t i = 0; i < item_cnt; ++i)  {
        sine_col.push_back(std::sin(2.0 * M_PI * i / 20.0)); // Base sine wave
        if (i % 30 == 0)  sine_col.back() += 2.0;  // Inject anomalies
    }
    df.load_column("sine col", std::move(sine_col));

    and_zscr_v<double>              anomaly1 { 2.0 };
    const std::vector<std::size_t>  result1 =
        { 0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 630, 660, 690, 720, 750, 780, 810, 840, 870, 900, 930, 960, 990, 1020 };

    df.single_act_visit<double>("sine col", anomaly1);
    assert((anomaly1.get_result() == result1));

    // Now do the same thing for IBM market data
    //
    StrDataFrame    ibm;

    try  {
        ibm.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }
    ibm.get_column<double>("IBM_Close")[502] = 800.0;
    ibm.get_column<double>("IBM_Close")[1001] = 900.0;
    ibm.get_column<double>("IBM_Close")[2002] = 850.0;

    and_zscr_v<double>              anomaly2 { 15.0 };
    const std::vector<std::size_t>  result2 = { 502, 1001, 2002 };

    ibm.single_act_visit<double>("IBM_Close", anomaly2);
    assert((anomaly2.get_result() == result2));

    // Now multidimensional data
    //
    constexpr std::size_t   dim { 3 };

    using ary_col_t = std::array<double, dim>;
    using vec_col_t = std::vector<double>;

    std::vector<vec_col_t>  vec_col  {
        {  0.1, 99.8, -50.1 },   // normal
        {  0.2, 100.1, -49.9 },  // normal
        { -0.1, 100.3, -50.2 },  // normal
        {  0.0, 99.7, -50.0 },   // normal
        {  0.3, 100.0, -49.8 },  // normal
        { -0.2, 99.9, -50.3 },   // normal
        {  0.1, 100.2, -50.1 },  // normal
        {  0.0, 100.0, -50.0 },  // normal
        { -0.1, 99.8, -49.9 },   // normal
        {  0.2, 100.1, -50.2 },  // normal
        {  0.1, 99.9, -50.1 },   // normal
        {  0.0, 100.0, -50.0 },  // normal
        { -0.2, 100.2, -49.8 },  // normal
        {  0.1, 99.7, -50.2 },   // normal
        {  0.3, 100.1, -50.0 },  // normal
        {  0.0, 99.9, -49.9 },   // normal
        {  0.2, 100.0, -50.1 },  // normal
        { -0.1, 100.1, -50.0 },  // normal
        { 15.0, 100.2, -50.1 },  // ANOMALY: dim 0 spike
        {  0.1, 99.8, -120.0 },  // ANOMALY: dim 2 spike
    };
    std::vector<ary_col_t>  ary_col  {
        {  0.1, 99.8, -50.1 },   // normal
        {  0.2, 100.1, -49.9 },  // normal
        { -0.1, 100.3, -50.2 },  // normal
        {  0.0, 99.7, -50.0 },   // normal
        {  0.3, 100.0, -49.8 },  // normal
        { -0.2, 99.9, -50.3 },   // normal
        {  0.1, 100.2, -50.1 },  // normal
        {  0.0, 100.0, -50.0 },  // normal
        { -0.1, 99.8, -49.9 },   // normal
        {  0.2, 100.1, -50.2 },  // normal
        {  0.1, 99.9, -50.1 },   // normal
        {  0.0, 100.0, -50.0 },  // normal
        { -0.2, 100.2, -49.8 },  // normal
        {  0.1, 99.7, -50.2 },   // normal
        {  0.3, 100.1, -50.0 },  // normal
        {  0.0, 99.9, -49.9 },   // normal
        {  0.2, 100.0, -50.1 },  // normal
        { -0.1, 100.1, -50.0 },  // normal
        { 15.0, 100.2, -50.1 },  // ANOMALY: dim 0 spike
        {  0.1, 99.8, -120.0 },  // ANOMALY: dim 2 spike
    };

    df.load_column<vec_col_t>("COL VEC", std::move(vec_col), nan_policy::dont_pad_with_nans);
    df.load_column<ary_col_t>("COL ARY", std::move(ary_col), nan_policy::dont_pad_with_nans);

    and_zscr_v<vec_col_t>   vec_and_v { 2.5 };
    and_zscr_v<ary_col_t>   ary_and_v { 2.5 };

    df.single_act_visit<vec_col_t>("COL VEC", vec_and_v);
    df.single_act_visit<ary_col_t>("COL ARY", ary_and_v);
    assert(vec_and_v.get_result().size() == 2);
    assert(vec_and_v.get_result()[0].first == 18);
    assert(vec_and_v.get_result()[0].second == 0);
    assert(vec_and_v.get_result()[1].first == 19);
    assert(vec_and_v.get_result()[1].second == 2);
    assert(ary_and_v.get_result().size() == 2);
    assert(ary_and_v.get_result()[0].first == 18);
    assert(ary_and_v.get_result()[0].second == 0);
    assert(ary_and_v.get_result()[1].first == 19);
    assert(ary_and_v.get_result()[1].second == 2);
}

C++ DataFrame