Back to Documentations

Signature Description Parameters
// Detects outliers in the column named col_name by z-score and removes them
// together with their rows. T is the type of that column; Ts lists the types
// of all columns, each given once.
template<arithmetic T, typename ... Ts>
void
remove_data_by_zscore(const char *col_name, T threshold);
This uses the z-score to detect outliers in the named column, then removes those outliers together with all the corresponding rows in the DataFrame.

NOTE: Type T must support arithmetic operations
T: Type of the named column
Ts: The list of types for all columns. A type should be specified only once
col_name: Name of the data column
threshold: Number of standard deviations beyond which a value is considered an outlier
static void test_remove_data_by_zscore()  {

    std::cout << "\nTesting remove_data_by_zscore( ) ..." << std::endl;

    constexpr std::size_t   item_cnt = 1024;
    MyStdDataFrame          df;

    df.load_index(MyStdDataFrame::gen_sequence_index(0, item_cnt, 1));

    std::vector<double>   sine_col;

    sine_col.reserve(item_cnt);
    for (std::size_t i = 0; i < item_cnt; ++i)  {
        sine_col.push_back(std::sin(2.0 * M_PI * i / 20.0)); // Base sine wave
        if (i % 30 == 0)  sine_col.back() += 2.0;  // Inject anomalies
    }
    df.load_column("sine col", std::move(sine_col));

    MyStdDataFrame  df2 = df;

    auto    lbd = [](const unsigned long &, const double &) -> bool { return (true); };
    auto    view = df2.get_view_by_sel<double, decltype(lbd), double>("sine col", lbd);

    assert((df.get_column<double>("sine col").size() == 1024));
    assert((view.get_column<double>("sine col").size() == 1024));

    df.remove_data_by_zscore<double, double>("sine col", 2.0);
    assert((df.get_column<double>("sine col").size() == (1024 - 35)));

    view.remove_data_by_zscore<double, double>("sine col", 2.0);
    assert((view.get_column<double>("sine col").size() == (1024 - 35)));

    // Now do the same thing for IBM market data
    //
    StrDataFrame    ibm;

    try  {
        ibm.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }
    ibm.get_column<double>("IBM_Close")[502] = 800.0;
    ibm.get_column<double>("IBM_Close")[1001] = 900.0;
    ibm.get_column<double>("IBM_Close")[2002] = 850.0;

    StrDataFrame    ibm2 = ibm;
    auto            ibm_lbd = [](const std::string &, const double &) -> bool { return (true); };
    auto            ibm_view = ibm2.get_view_by_sel<double, decltype(ibm_lbd), double, long>("IBM_Open", ibm_lbd);

    ibm_view.get_column<double>("IBM_Close")[502] = 800.0;
    ibm_view.get_column<double>("IBM_Close")[1001] = 900.0;
    ibm_view.get_column<double>("IBM_Close")[2002] = 850.0;

    assert((ibm.get_column<double>("IBM_Open").size() == 5031));
    assert((ibm_view.get_column<double>("IBM_Open").size() == 5031));

    ibm.remove_data_by_zscore<double, double, long>("IBM_Close", 15.0);
    assert((ibm.get_column<double>("IBM_Open").size() == (5031 - 3)));

    ibm_view.remove_data_by_zscore<double, double, long>("IBM_Close", 15.0);
    assert((ibm_view.get_column<double>("IBM_Open").size() == (5031 - 3)));
}

C++ DataFrame