| Signature | Description | Parameters |
|---|---|---|
template<arithmetic T, typename ... Ts> void remove_data_by_zscore(const char *col_name, T threshold); |
Uses z-scores to detect outliers in the named column, then removes those outliers together with all of their corresponding rows in the DataFrame. NOTE: Type T must support arithmetic operations |
T: Type of the named column. Ts: The list of types for all columns; each type should be specified only once. col_name: Name of the data column. threshold: Number of standard deviations beyond which a value is considered an outlier |
static void test_remove_data_by_zscore() { std::cout << "\nTesting remove_data_by_zscore( ) ..." << std::endl; constexpr std::size_t item_cnt = 1024; MyStdDataFrame df; df.load_index(MyStdDataFrame::gen_sequence_index(0, item_cnt, 1)); std::vector<double> sine_col; sine_col.reserve(item_cnt); for (std::size_t i = 0; i < item_cnt; ++i) { sine_col.push_back(std::sin(2.0 * M_PI * i / 20.0)); // Base sine wave if (i % 30 == 0) sine_col.back() += 2.0; // Inject anomalies } df.load_column("sine col", std::move(sine_col)); MyStdDataFrame df2 = df; auto lbd = [](const unsigned long &, const double &) -> bool { return (true); }; auto view = df2.get_view_by_sel<double, decltype(lbd), double>("sine col", lbd); assert((df.get_column<double>("sine col").size() == 1024)); assert((view.get_column<double>("sine col").size() == 1024)); df.remove_data_by_zscore<double, double>("sine col", 2.0); assert((df.get_column<double>("sine col").size() == (1024 - 35))); view.remove_data_by_zscore<double, double>("sine col", 2.0); assert((view.get_column<double>("sine col").size() == (1024 - 35))); // Now do the same thing for IBM market data // StrDataFrame ibm; try { ibm.read("IBM.csv", io_format::csv2); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; ::exit(-1); } ibm.get_column<double>("IBM_Close")[502] = 800.0; ibm.get_column<double>("IBM_Close")[1001] = 900.0; ibm.get_column<double>("IBM_Close")[2002] = 850.0; StrDataFrame ibm2 = ibm; auto ibm_lbd = [](const std::string &, const double &) -> bool { return (true); }; auto ibm_view = ibm2.get_view_by_sel<double, decltype(ibm_lbd), double, long>("IBM_Open", ibm_lbd); ibm_view.get_column<double>("IBM_Close")[502] = 800.0; ibm_view.get_column<double>("IBM_Close")[1001] = 900.0; ibm_view.get_column<double>("IBM_Close")[2002] = 850.0; assert((ibm.get_column<double>("IBM_Open").size() == 5031)); assert((ibm_view.get_column<double>("IBM_Open").size() == 5031)); ibm.remove_data_by_zscore<double, double, long>("IBM_Close", 
15.0); assert((ibm.get_column<double>("IBM_Open").size() == (5031 - 3))); ibm_view.remove_data_by_zscore<double, double, long>("IBM_Close", 15.0); assert((ibm_view.get_column<double>("IBM_Open").size() == (5031 - 3))); }