| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameMLVisitors.h> template<typename T, typename I = unsigned long, std::size_t A = 0> struct AnomalyDetectByKNNVisitor; // ------------------------------------- template<typename T, typename I = unsigned long, std::size_t A = 0> using and_knn_v = AnomalyDetectByKNNVisitor<T, I, A>; |
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This visitor does anomaly detection by using K-Nearest Neighbors (KNN) and by implementing a kd-tree. Time Series Anomaly Detection with KNN works by identifying data points that are far from their neighbors, treating large distances as anomalies; it's an unsupervised method where an outlier score (often distance to the k-th neighbor) indicates abnormality, with higher scores signaling anomalies, making it useful for detecting sudden spikes or deviations in patterns like in predictive maintenance or process monitoring. This works with both scalar and multidimensional (i.e. vectors and arrays) datasets. get_result(): Returns a vector of scores for each datapoint in the column. Higher values mean more abnormality. get_anomalous_indices(T threshold = T(1e-7)): Returns the index of anomalous datapoints by analyzing the scores. You can do more elaborate analysis be looking at the scores.
// One KD-tree point: a window of consecutive column values.
using win_type = std::vector<T>;
// Per-datapoint anomaly scores, as returned by get_result().
using result_type = std::vector<T, typename allocator_declare<T, A>::type>;
// Indices of datapoints judged anomalous, as returned by get_anomalous_indices().
using index_vec_t = std::vector<size_type, typename allocator_declare<size_type, A>::type>;
// User-supplied distance between two points.
// NOTE(review): the default, def_dist, is declared on tree_point_t rather than
// win_type — presumably the two are interchangeable; confirm against the header.
using distance_func = std::function<T(const win_type &x, const win_type &y)>;
// window:    KD-tree dimension; each datapoint is viewed as a window of values.
//            Keep it small — a large dimension makes the KD-tree less effective.
// k:         Number of nearest neighbors used to estimate local rarity/density.
// norm_type: Optional normalization applied to the data (default: none).
// f:         Distance function (default: def_dist, Euclidean distance).
AnomalyDetectByKNNVisitor(size_type window,
size_type k,
normalization_type norm_type = normalization_type::none,
distance_func &&f = def_dist);
window: This is the KD-tree dimension. A larger dimension makes the KD-tree less effective,
so the window should stay small to keep the KD-tree viable.
The window decides what a point looks like.
k: This is the number of neighbors used to estimate rarity. K is about local density,
and is independent of the time series length or the window size.
K grows with the dataset size, not the dimension. K decides how lonely a point is.
norm_type: Normalization type. The default is no normalization.
f: Function to calculate the distance between two datapoints.
// Default distance: Euclidean (L2) distance between two equal-size points.
static double def_dist(const tree_point_t &x, const tree_point_t &y) { double sum { 0 }; const size_type sz { x.size() }; for (size_type i { 0 }; i < sz; ++i) { const double diff { double(x[i]) - double(y[i]) }; sum += diff * diff; } return (std::sqrt(sum)); } |
T: Column data type. I: Index type. A: Memory alignment boundary for vectors. The default is the system default alignment. |
static void test_AnomalyDetectByKNNVisitor() { std::cout << "\nTesting AnomalyDetectByKNNVisitor{ } ..." << std::endl; constexpr std::size_t item_cnt = 1024; ULDataFrame df; df.load_index(ULDataFrame::gen_sequence_index(0, item_cnt, 1)); std::vector<double> sine_col; sine_col.reserve(item_cnt); for (std::size_t i = 0; i < item_cnt; ++i) { sine_col.push_back(std::sin(2.0 * M_PI * i / 20.0)); // Base sine wave if (i % 31 == 0) sine_col.back() += 10.0; // Inject anomalies } df.load_column("sine col", std::move(sine_col)); and_knn_v<double> anomaly1 { 3, 4 }; df.single_act_visit<double>("sine col", anomaly1); const auto anomalous_indices1 = anomaly1.get_anomalous_indices(); assert(anomaly1.get_result().size() == 1024); assert(anomalous_indices1.size() == 34); assert(anomalous_indices1[0] == 0); assert(anomalous_indices1[1] == 31); assert(anomalous_indices1[2] == 62); assert(anomalous_indices1[17] == 527); assert(anomalous_indices1[22] == 682); assert(anomalous_indices1[32] == 992); assert(anomalous_indices1[33] == 1023); // Now do the same thing for IBM market data // StrDataFrame ibm; try { ibm.read("IBM.csv", io_format::csv2); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; ::exit(-1); } ibm.get_column<double>("IBM_Adj_Close")[502] = 800.0; ibm.get_column<double>("IBM_Adj_Close")[1001] = 900.0; ibm.get_column<double>("IBM_Adj_Close")[2002] = 850.0; and_knn_v<double, std::string> anomaly2 { 3, 4, normalization_type::z_score }; ibm.single_act_visit<double>("IBM_Adj_Close", anomaly2); const auto anomalous_indices2 = anomaly2.get_anomalous_indices(0.9); assert(anomalous_indices2.size() == 3); assert(anomalous_indices2[0] == 502); assert(anomalous_indices2[1] == 1001); assert(anomalous_indices2[2] == 2002); // Now multidimensional data // constexpr std::size_t dim { 3 }; constexpr std::size_t n { 120 }; using ary_col_t = std::array<double, dim>; using vec_col_t = std::vector<double>; std::vector<vec_col_t> vec_col; vec_col.reserve(n); // Normal 
region 1: cluster near (1, 2, 3) // for (std::size_t i { 0 }; i < 40; ++i) { double t { i * 0.15 }; vec_col.push_back({ 1.0 + 0.1 * std::sin(t), 2.0 + 0.1 * std::cos(t), 3.0 + 0.05 * t }); } // *** Anomaly A: sudden spike on all 3 axes *** // vec_col.push_back({ 9.5, -7.0, 15.0 }); // idx 40 vec_col.push_back({ 10.0, -8.0, 16.0 }); // idx 41 vec_col.push_back({ 9.8, -7.5, 15.5 }); // idx 42 // Normal region 2: cluster near (-1, 0, 1) // for (std::size_t i { 0 }; i < 35; ++i) { double t { i * 0.15 }; vec_col.push_back({ -1.0 + 0.1 * std::sin(t), 0.0 + 0.1 * std::cos(t), 1.0 + 0.05 * t }); } // *** Anomaly B: one axis goes wild, others stay normal *** // vec_col.push_back({ -1.0, 0.0, 50.0 }); // idx 78 — z-axis outlier vec_col.push_back({ -1.1, 0.1, 52.0 }); // idx 79 // Normal region 3: cluster near (3, 3, 3) // for (std::size_t i { 0 }; i < 38; ++i) { double t { i * 0.15 }; vec_col.push_back({ 3.0 + 0.1 * std::sin(t), 3.0 + 0.1 * std::cos(t), 3.0 + 0.05 * t }); } // *** Anomaly C: isolated single point far from everything *** // vec_col.push_back({ -20.0, 20.0, -20.0 }); // idx 118 // Final normal cap // vec_col.push_back({ 3.0, 3.0, 3.0 }); std::vector<ary_col_t> ary_col(n); // Copy the vector of vectors to vector of arrays // for (std::size_t i { 0 }; i < vec_col.size(); ++i) { const auto &vec = vec_col[i]; ary_col_t ary; for (std::size_t j { 0 }; j < vec.size(); ++j) ary[j] = vec[j]; ary_col[i] = std::move(ary); } df.load_column<ary_col_t>("ARY COL", std::move(ary_col), nan_policy::dont_pad_with_nans); df.load_column<vec_col_t>("VEC COL", std::move(vec_col), nan_policy::dont_pad_with_nans); and_knn_v<vec_col_t> vec_knn { 4, 5 }; and_knn_v<ary_col_t> ary_knn { 4, 5 }; df.single_act_visit<vec_col_t>("VEC COL", vec_knn); df.single_act_visit<ary_col_t>("ARY COL", ary_knn); const auto &anom_idxs_vec { vec_knn.get_anomalous_indices(0.5) }; const auto &anom_idxs_ary { ary_knn.get_anomalous_indices(0.5) }; // Why didn't all anomalies get caught: // 1. 
Anomaly A (indices 40–42) is a cluster of 3 similar points. When the // tree scores index 41, its nearest neighbors are indices 40 and 42 — // they're close to each other, so the average KNN distance is small. // A cluster of outliers looks locally dense to KNN. This is a known // fundamental limitation of KNN anomaly detection. // 2. Anomaly C (index 118) is a single point but near the end of the // series. With window=4, the buckets containing index 118 have very // few neighbors in the tree overall, and the score gets averaged down // by expand_scores_ across the window, diluting the signal. // 3. The three normal clusters are far apart from each other — (1,2,3), // (-1,0,1), (3,3,3). This raises the baseline KNN distances for normal // points near cluster boundaries, compressing the relative contrast // with anomalies. // assert(anom_idxs_vec.size() == 3); assert(anom_idxs_ary.size() == 3); assert(anom_idxs_vec[0] == 41); assert(anom_idxs_vec[1] == 79); assert(anom_idxs_vec[2] == 118); assert(anom_idxs_ary[0] == 41); assert(anom_idxs_ary[1] == 79); assert(anom_idxs_ary[2] == 118); }