Back to Documentations

Signature Description
// Bundle of optional tuning parameters passed to detect_and_change().
// The members are grouped by the detection method(s) they are specific to;
// every member has a default, so callers only need to set the ones relevant
// to the method they select (e.g. with designated initializers).
//
// T is the type of the column(s) being examined.
//
// NOTE(review): high_fence, low_fence and num_stdev are brace-initialized
// from floating-point literals (1.5, 3.0). With an integral T that would be
// a narrowing conversion -- confirm T is expected to be a floating-point
// type.
//
template<typename T>
struct  DetectAndChangeParams  {

    using value_type = T;

    // Signature of a callable that returns the distance between two values
    // of type T as a double
    //
    using distance_func = std::function<double(const T &x, const T &y)>;

    // Parameter specific to Z-Score, FFT and LOF. Defaults to 0
    //
    value_type          threshold { 0 };

    // Parameter specific to FFT and LOF. Defaults to no normalization
    //
    normalization_type  norm_type { normalization_type::none };

    // Parameter specific to FFT. Defaults to 0
    //
    std::size_t         freq_num { 0 };

    // Parameters specific to IQR. Both fences default to 1.5
    //
    value_type          high_fence { 1.5 };
    value_type          low_fence { 1.5 };

    // Parameters specific to LOF.
    // k defaults to 0 (presumably the neighbor count -- TODO confirm).
    // dist_fun defaults to the absolute difference |x - y| computed with
    // std::fabs
    //
    std::size_t         k { 0 };
    distance_func       dist_fun =
        [](const value_type &x, const value_type &y) -> double  {
            return (std::fabs(x - y));
        };

    // Parameters specific to Hampel filter. Defaults: window size of 0,
    // median-based Hampel type, and 3.0 for num_stdev
    //
    std::size_t         window_size { 0 };
    hampel_type         htype { hampel_type::median };
    value_type          num_stdev { 3.0 };
};
Parameters to member function detect_and_change()

Signature Description Parameters
// Detects anomalous datapoints in the named columns using the given
// detection method, then changes them according to the given fill policy.
//
// T:         Type of the column(s) named in col_names (must satisfy the
//            arithmetic concept)
// col_names: Names of the columns to examine and change
// d_method:  Method used to detect anomalous data
// f_policy:  Policy used to change the detected anomalous data
// params:    Method-specific parameters; defaults to a default-initialized
//            DetectAndChangeParams<T>
//
template<arithmetic T>
void
detect_and_change(const StlVecType<const char *> &col_names,
                  detect_method d_method,
                  fill_policy f_policy,
                  DetectAndChangeParams<T> params = { });
It detects anomalous data based on the specified method and changes them based on the specified fill policy.
Anomalous data does not mean NaN or missing datapoints; it is best described as unusual datapoints. Once the anomalous datapoints are identified, they are changed in the same way missing data would be filled.
T: Type of the column(s) in col_names vector
col_names: A vector of names specifying the columns to fill.
d_method: Specifies the method to use to detect anomalous data.
f_policy: Specifies the method to use to change anomalous data.
params: Contains parameters for various detection methods
static void test_detect_and_change()  {

    std::cout << "\nTesting detect_and_change( ) ..." << std::endl;

    StrDataFrame    ibm;

    try  {
        ibm.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    auto    &close_col = ibm.get_column<double>("IBM_Close");
    auto    &open_col = ibm.get_column<double>("IBM_Open");

    {
        close_col[502] = 800.0;
        close_col[1001] = 900.0;
        close_col[2002] = 850.0;
        open_col[2] = 1.0;
        open_col[3000] = 2.5;
        open_col[5029] = 850.0;

        ibm.detect_and_change<double>({ "IBM_Close", "IBM_Open" }, detect_method::zscore, fill_policy::fill_forward, { .threshold = 3.0 });

        assert((std::fabs(close_col[502] - 82.5) < 0.01));
        assert((std::fabs(close_col[1001] - 89.5) < 0.01));
        assert((std::fabs(close_col[2002] - 92.51) < 0.01));
        assert((std::fabs(open_col[2] - 99.0) < 0.01));
        assert((std::fabs(open_col[3000] - 210.28) < 0.01));
        assert((std::fabs(open_col[5029] - 108.66) < 0.01));
    }

    {
        close_col[502] = 800.0;
        close_col[1001] = 900.0;
        close_col[2002] = 850.0;
        open_col[2] = 1.0;
        open_col[3000] = 2.5;
        open_col[5029] = 850.0;

        ibm.detect_and_change<double>({ "IBM_Close", "IBM_Open" }, detect_method::hampel, fill_policy::fill_backward, { .window_size = 10, .htype = hampel_type::median, .num_stdev = 2.0 });

        assert((std::fabs(close_col[502] - 81.54) < 0.01));
        assert((std::fabs(close_col[1001] - 90.11) < 0.01));
        assert((std::fabs(close_col[2002] - 83.6) < 0.01));
        assert((std::fabs(open_col[2] - 1.0) < 0.01)); // It didn't catch it
        assert((std::fabs(open_col[3000] - 210.02) < 0.01));
        assert((std::fabs(open_col[5029] - 107.9) < 0.01));
    }

    {
        close_col[502] = 800.0;
        close_col[1001] = 900.0;
        close_col[2002] = 850.0;
        open_col[2] = 1.0;
        open_col[3000] = 2.5;
        open_col[5029] = 850.0;

        ibm.detect_and_change<double>({ "IBM_Close", "IBM_Open" }, detect_method::fft, fill_policy::mid_point, { .threshold = 250.0, .norm_type = normalization_type::z_score, .freq_num = 1000 });

        assert((std::fabs(close_col[502] - 82.02) < 0.01));
        assert((std::fabs(close_col[1001] - 89.805) < 0.01));
        assert((std::fabs(close_col[2002] - 88.055) < 0.01));
        assert((std::fabs(open_col[2] - 1.0) < 0.01));    // It didn't catch it
        assert((std::fabs(open_col[3000] - 2.5) < 0.01)); // It didn't catch it
        assert((std::fabs(open_col[5029] - 108.28) < 0.01));
    }

    // Now we need a DataFrame with a numeric index to be able to use
    // interpolation
    //
    MyDataFrame ford;

    try  {
        ford.read("FORD.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    auto    &fclose_col = ford.get_column<double>("FORD_Close");
    auto    &fopen_col = ford.get_column<double>("FORD_Open");

    {
        fclose_col[502] = 200.0;
        fclose_col[1001] = 300.0;
        fclose_col[2002] = 250.0;
        fopen_col[2] = 0.01;
        fopen_col[3000] = 0.05;
        fopen_col[5029] = 850.0;

        ford.detect_and_change<double>({ "FORD_Close", "FORD_Open" }, detect_method::iqr, fill_policy::linear_interpolate, { .high_fence = 0.5, .low_fence = 0.5 });

        assert((std::fabs(fclose_col[502] - 1.6889) < 0.0001));
        assert((std::fabs(fclose_col[1001] - 1.8146) < 0.0001));
        assert((std::fabs(fclose_col[2002] - 0.9022) < 0.0001));
        assert((std::fabs(fopen_col[2] - 0.01) < 0.01));    // It didn't catch it
        assert((std::fabs(fopen_col[3000] - 0.05) < 0.01)); // It didn't catch it
        assert((std::fabs(fopen_col[5029] - 7.9947) < 0.0001));
    }
}

C++ DataFrame