Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

// Visitor that finds outliers in a column using the Inter-Quartile Range
// (IQR) method.
//   T: column data type
//   I: index type
//   A: memory alignment boundary for vectors; the default uses the system
//      default alignment
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct AnomalyDetectByIQRVisitor;

// -------------------------------------

// Short alias for AnomalyDetectByIQRVisitor taking the same template
// parameters with the same defaults.
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
using and_iqr_v = AnomalyDetectByIQRVisitor<T, I, A>;
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This visitor applies the Inter-Quartile Range (IQR) method to find outliers. The IQR is the difference between the third quartile (Q3) and the first quartile (Q1), essentially representing the spread of the middle 50% of the data; any data points falling significantly outside this range are considered potential outliers.
To detect outliers using the IQR method, you typically calculate "fences" by subtracting 1.5 (default) times the IQR from Q1 (the lower fence) and adding 1.5 (default) times the IQR to Q3 (the upper fence). Any data points falling outside these fences are considered potential outliers.

The result is a vector of indices into the original data, identifying the values that were deemed outliers.
// Constructs the visitor.
//   high_fence: value multiplied by the IQR for the upper fence (default 1.5)
//   low_fence:  value multiplied by the IQR for the lower fence (default 1.5)
explicit
AnomalyDetectByIQRVisitor(T high_fence = T(1.5), T low_fence = T(1.5));

high_fence: Multiplier applied to the IQR value to compute the upper fence.
low_fence: Multiplier applied to the IQR value to compute the lower fence.
    
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
static void test_AnomalyDetectByIQRVisitor()  {

    std::cout << "\nTesting AnomalyDetectByIQRVisitor{ } ..." << std::endl;

    constexpr std::size_t   item_cnt = 1024;
    MyStdDataFrame          df;

    df.load_index(MyStdDataFrame::gen_sequence_index(0, item_cnt, 1));

    std::vector<double>   sine_col;

    sine_col.reserve(item_cnt);
    for (std::size_t i = 0; i < item_cnt; ++i)  {
        sine_col.push_back(std::sin(2.0 * M_PI * i / 20.0)); // Base sine wave
        if (i % 30 == 0)  sine_col.back() += 2.0;  // Inject anomalies
    }
    df.load_column("sine col", std::move(sine_col));

    and_iqr_v<double>               anomaly1(0.5, 0.5);
    const std::vector<std::size_t>  result1 = { 0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570,
                                                600, 630, 660, 690, 720, 750, 780, 810, 840, 870, 900, 930, 960, 990, 1020 };

    df.single_act_visit<double>("sine col", anomaly1);
    assert((anomaly1.get_result() == result1));

    // Now do the same thing for IBM market data
    //
    StrDataFrame    ibm;

    try  {
        ibm.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }
    ibm.get_column<double>("IBM_Close")[502] = 800.0;
    ibm.get_column<double>("IBM_Close")[1001] = 900.0;
    ibm.get_column<double>("IBM_Close")[2002] = 850.0;

    and_iqr_v<double>               anomaly2;
    const std::vector<std::size_t>  result2 = { 502, 1001, 2002 };

    ibm.single_act_visit<double>("IBM_Close", anomaly2);
    assert((anomaly2.get_result() == result2));
}

C++ DataFrame