| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameMLVisitors.h>
template<typename T, typename I = unsigned long, std::size_t A = 0>
struct AnomalyDetectByFFTVisitor;
// -------------------------------------
template<typename T, typename I = unsigned long, std::size_t A = 0>
using and_fft_v = AnomalyDetectByFFTVisitor<T, I, A>; |
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This visitor applies the Fast Fourier Transform (FFT), which is an implementation of the discrete Fourier transform, to find outliers in the given column. It is easy to find anomalies in data if the data is a repeating pattern such as a sine wave. It is more difficult (you must really tune the two parameters) to find anomalies in more random data such as a stock’s market data – see the code sample below. This visitor goes through the following steps: 1. It optionally normalizes the data. 2. It converts either the original column or the normalized data to the frequency domain by running FFT. 3. It zeros out the frequency spectrum of all frequencies beyond freq_num. 4. It runs an inverse FFT (IFFT) on the modified frequency spectrum. 5. It compares the original data with the data coming out of IFFT. Any data point whose difference is greater than anomaly_threshold is considered an outlier. This works with both scalar and multidimensional (i.e. vectors and arrays) datasets. get_result() returns a vector of indices into the original data that were deemed outliers. In the case of a multidimensional input column, the result is a vector of std::pairs. The first integer is the index into the input column and the second integer is the index into the dimension.
// Constructor. Declared explicit, so a lone freq_num argument is never
// implicitly converted into a visitor.
// anomaly_threshold defaults to T(1); norm_type defaults to
// normalization_type::none.
explicit
AnomalyDetectByFFTVisitor(size_type freq_num,
value_type anomaly_threshold = T(1),
normalization_type norm_type = normalization_type::none);
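freq_num: Number of dominant frequencies to keep when performing IFFT on the result of FFT
anomaly_threshold: The difference threshold between original data and the result of IFFT
norm_type: The normalization optionally applied to the data before running FFT. The default is no normalization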
|
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment |
static void test_AnomalyDetectByFFTVisitor() { std::cout << "\nTesting AnomalyDetectByFFTVisitor{ } ..." << std::endl; constexpr std::size_t item_cnt = 1024; MyStdDataFrame df; df.load_index(MyStdDataFrame::gen_sequence_index(0, item_cnt, 1)); std::vector<double> sine_col; sine_col.reserve(item_cnt); for (std::size_t i = 0; i < item_cnt; ++i) { sine_col.push_back(std::sin(2.0 * M_PI * i / 20.0)); // Base sine wave if (i % 30 == 0) sine_col.back() += 2.0; // Inject anomalies } df.load_column("sine col", std::move(sine_col)); // Keep at least 10% of the frequencies as dominant frequencies. // and_fft_v<double> anomaly1(100, 1.0); const std::vector<std::size_t> result1 = { 0, 30, 60, 90, 120, 150, 180, 210, 240, 270, 300, 330, 360, 390, 420, 450, 480, 510, 540, 570, 600, 630, 660, 690, 720, 750, 780, 810, 840, 870, 900, 930, 960, 990, 1020 }; df.single_act_visit<double>("sine col", anomaly1); assert((anomaly1.get_result() == result1)); and_fft_v<double> anomaly2(10, 1.5); df.single_act_visit<double>("sine col", anomaly2); assert((anomaly2.get_result() == result1)); and_fft_v<double> anomaly3(100, 1.0, normalization_type::z_score); df.single_act_visit<double>("sine col", anomaly3); assert((anomaly3.get_result() == result1)); and_fft_v<double> anomaly4(10, 1.5, normalization_type::z_score); df.single_act_visit<double>("sine col", anomaly4); assert((anomaly4.get_result() == result1)); // Now do the same thing for IBM market data // StrDataFrame ibm; try { ibm.read("IBM.csv", io_format::csv2); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; ::exit(-1); } ibm.get_column<double>("IBM_Close")[502] = 800.0; ibm.get_column<double>("IBM_Close")[1001] = 900.0; ibm.get_column<double>("IBM_Close")[2002] = 850.0; // Keep at least 10% of the frequencies as dominant frequencies. 
// In case of IBM market data, I had to keep more // and_fft_v<double, std::string> anomaly5(1000, 80.0); const std::vector<std::size_t> result2 = { 501, 502, 503, 1000, 1001, 1002, 2001, 2002, 2003 }; ibm.single_act_visit<double>("IBM_Close", anomaly5); assert((anomaly5.get_result() == result2)); and_fft_v<double, std::string> anomaly6(1000, 250.0, normalization_type::z_score); const std::vector<std::size_t> result3 = { 502, 1001, 2002 }; ibm.single_act_visit<double>("IBM_Close", anomaly6); assert((anomaly6.get_result() == result3)); // Now multidimensional data // constexpr std::size_t dim { 3 }; constexpr double two_pi { 2.0 * M_PI }; constexpr std::size_t col_s { 128 }; // power-of-2 - radix-2 FFT constexpr std::size_t freq_num { 8 }; // keep only 8 low frequencies constexpr double amplitude { 1.0 }; // sine amplitude per channel constexpr double spike_mag { 15.0 }; // anomaly magnitude constexpr std::size_t spike_idx { 64 }; // time step of the spike constexpr double threshold { 2.0 }; // anomaly_threshold using ary_col_t = std::array<double, dim>; using vec_col_t = std::vector<double>; std::vector<vec_col_t> no_anomaly_vec(col_s, vec_col_t(dim)); std::vector<ary_col_t> no_anomaly_ary(col_s); for (std::size_t i { 0 }; i < col_s; ++i) { for (std::size_t d { 0 }; d < dim; ++d) { no_anomaly_vec[i][d] = amplitude * std::sin(two_pi * double(d + 1) * double(i) / double(col_s)); no_anomaly_ary[i][d] = no_anomaly_vec[i][d]; } } df.load_column<vec_col_t>("NO ANOMALY VEC", std::move(no_anomaly_vec), nan_policy::dont_pad_with_nans); df.load_column<ary_col_t>("NO ANOMALY ARY", std::move(no_anomaly_ary), nan_policy::dont_pad_with_nans); std::vector<vec_col_t> spiked_vec = df.get_column<vec_col_t>("NO ANOMALY VEC"); std::vector<ary_col_t> spiked_ary = df.get_column<ary_col_t>("NO ANOMALY ARY"); for (std::size_t d { 0 }; d < dim; ++d) { spiked_vec[spike_idx][d] += spike_mag; spiked_ary[spike_idx][d] += spike_mag; } df.load_column<vec_col_t>("SPIKED VEC", 
std::move(spiked_vec), nan_policy::dont_pad_with_nans); df.load_column<ary_col_t>("SPIKED ARY", std::move(spiked_ary), nan_policy::dont_pad_with_nans); and_fft_v<vec_col_t, std::string> fft_vec { freq_num, threshold }; and_fft_v<ary_col_t, std::string> fft_ary { freq_num, threshold }; df.single_act_visit<vec_col_t>("NO ANOMALY VEC", fft_vec); df.single_act_visit<ary_col_t>("NO ANOMALY ARY", fft_ary); assert(fft_vec.get_result().empty()); assert(fft_ary.get_result().empty()); df.single_act_visit<vec_col_t>("SPIKED VEC", fft_vec); df.single_act_visit<ary_col_t>("SPIKED ARY", fft_ary); assert(fft_vec.get_result().size() == dim); assert(fft_vec.get_result()[0].first == spike_idx); assert(fft_vec.get_result()[0].second == 0); assert(fft_vec.get_result()[1].first == spike_idx); assert(fft_vec.get_result()[1].second == 1); assert(fft_vec.get_result()[2].first == spike_idx); assert(fft_vec.get_result()[2].second == 2); assert(fft_ary.get_result().size() == dim); assert(fft_ary.get_result()[0].first == spike_idx); assert(fft_ary.get_result()[0].second == 0); assert(fft_ary.get_result()[1].first == spike_idx); assert(fft_ary.get_result()[1].second == 1); assert(fft_ary.get_result()[2].first == spike_idx); assert(fft_ary.get_result()[2].second == 2); }