| Signature | Description |
|---|---|
template<typename T> struct DetectAndChangeParams { using value_type = T; using distance_func = std::function<double(const T &x, const T &y)>; // Parameter specific to Z-Score, FFT and LOF // value_type threshold { 0 }; // Parameter specific to FFT and LOF // normalization_type norm_type { normalization_type::none }; // Parameter specific to FFT // std::size_t freq_num { 0 }; // Parameters specific to IQR // value_type high_fence { 1.5 }; value_type low_fence { 1.5 }; // Parameter specific to LOF // std::size_t k { 0 }; distance_func dist_fun = [](const value_type &x, const value_type &y) -> double { return (std::fabs(x - y)); }; // Parameters specific to Hampel filter // std::size_t window_size { 0 }; hampel_type htype { hampel_type::median }; value_type num_stdev { 3.0 }; }; |
Parameters to member function detect_and_change() |
| Signature | Description | Parameters |
|---|---|---|
template<arithmetic T> void detect_and_change(const StlVecType<const char *> &col_names, detect_method d_method, fill_policy f_policy, DetectAndChangeParams<T> params = { }); |
It detects anomalous data based on the specified method and changes them based on the specified fill policy. Anomalous data here does not mean NaN or missing datapoints; the best description is unusual datapoints. Once the anomalous datapoints are identified, they are changed in the same way missing data would be filled. |
T: Type of the column(s) in the col_names vector. col_names: A vector of names specifying the columns to fill. d_method: Specifies the method to use to detect anomalous data. f_policy: Specifies the policy to use to change anomalous data. params: Contains parameters for the various detection methods.
static void test_detect_and_change() { std::cout << "\nTesting detect_and_change( ) ..." << std::endl; StrDataFrame ibm; try { ibm.read("IBM.csv", io_format::csv2); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; ::exit(-1); } auto &close_col = ibm.get_column<double>("IBM_Close"); auto &open_col = ibm.get_column<double>("IBM_Open"); { close_col[502] = 800.0; close_col[1001] = 900.0; close_col[2002] = 850.0; open_col[2] = 1.0; open_col[3000] = 2.5; open_col[5029] = 850.0; ibm.detect_and_change<double>({ "IBM_Close", "IBM_Open" }, detect_method::zscore, fill_policy::fill_forward, { .threshold = 3.0 }); assert((std::fabs(close_col[502] - 82.5) < 0.01)); assert((std::fabs(close_col[1001] - 89.5) < 0.01)); assert((std::fabs(close_col[2002] - 92.51) < 0.01)); assert((std::fabs(open_col[2] - 99.0) < 0.01)); assert((std::fabs(open_col[3000] - 210.28) < 0.01)); assert((std::fabs(open_col[5029] - 108.66) < 0.01)); } { close_col[502] = 800.0; close_col[1001] = 900.0; close_col[2002] = 850.0; open_col[2] = 1.0; open_col[3000] = 2.5; open_col[5029] = 850.0; ibm.detect_and_change<double>({ "IBM_Close", "IBM_Open" }, detect_method::hampel, fill_policy::fill_backward, { .window_size = 10, .htype = hampel_type::median, .num_stdev = 2.0 }); assert((std::fabs(close_col[502] - 81.54) < 0.01)); assert((std::fabs(close_col[1001] - 90.11) < 0.01)); assert((std::fabs(close_col[2002] - 83.6) < 0.01)); assert((std::fabs(open_col[2] - 1.0) < 0.01)); // It didn't catch it assert((std::fabs(open_col[3000] - 210.02) < 0.01)); assert((std::fabs(open_col[5029] - 107.9) < 0.01)); } { close_col[502] = 800.0; close_col[1001] = 900.0; close_col[2002] = 850.0; open_col[2] = 1.0; open_col[3000] = 2.5; open_col[5029] = 850.0; ibm.detect_and_change<double>({ "IBM_Close", "IBM_Open" }, detect_method::fft, fill_policy::mid_point, { .threshold = 250.0, .norm_type = normalization_type::z_score, .freq_num = 1000 }); assert((std::fabs(close_col[502] - 82.02) < 0.01)); 
assert((std::fabs(close_col[1001] - 89.805) < 0.01)); assert((std::fabs(close_col[2002] - 88.055) < 0.01)); assert((std::fabs(open_col[2] - 1.0) < 0.01)); // It didn't catch it assert((std::fabs(open_col[3000] - 2.5) < 0.01)); // It didn't catch it assert((std::fabs(open_col[5029] - 108.28) < 0.01)); } // Now we need a DataFrame with a numeric index to be able to use // interpolation // MyDataFrame ford; try { ford.read("FORD.csv", io_format::csv2); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; ::exit(-1); } auto &fclose_col = ford.get_column<double>("FORD_Close"); auto &fopen_col = ford.get_column<double>("FORD_Open"); { fclose_col[502] = 200.0; fclose_col[1001] = 300.0; fclose_col[2002] = 250.0; fopen_col[2] = 0.01; fopen_col[3000] = 0.05; fopen_col[5029] = 850.0; ford.detect_and_change<double>({ "FORD_Close", "FORD_Open" }, detect_method::iqr, fill_policy::linear_interpolate, { .high_fence = 0.5, .low_fence = 0.5 }); assert((std::fabs(fclose_col[502] - 1.6889) < 0.0001)); assert((std::fabs(fclose_col[1001] - 1.8146) < 0.0001)); assert((std::fabs(fclose_col[2002] - 0.9022) < 0.0001)); assert((std::fabs(fopen_col[2] - 0.01) < 0.01)); // It didn't catch it assert((std::fabs(fopen_col[3000] - 0.05) < 0.01)); // It didn't catch it assert((std::fabs(fopen_col[5029] - 7.9947) < 0.0001)); } }