| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameMLVisitors.h> template<typename T, typename I = unsigned long, std::size_t A = 0> struct AnomalyDetectByKNNVisitor; // ------------------------------------- template<typename T, typename I = unsigned long, std::size_t A = 0> using and_knn_v = AnomalyDetectByKNNVisitor<T, I, A>; |
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This visitor does anomaly detection by using K-Nearest Neighbors (KNN) and by implementing a kd-tree. Time Series Anomaly Detection with KNN works by identifying data points that are far from their neighbors, treating large distances as anomalies; it's an unsupervised method where an outlier score (often distance to the k-th neighbor) indicates abnormality, with higher scores signaling anomalies, making it useful for detecting sudden spikes or deviations in patterns like in predictive maintenance or process monitoring. This works with both scalar and multidimensional (i.e. vectors and arrays) datasets. get_result(): Returns a vector of scores for each datapoint in the column. Higher values mean more abnormality. get_anomalous_indices(T threshold = T(1e-7)): Returns the index of anomalous datapoints by analyzing the scores. You can do more elaborate analysis be looking at the scores.
// One KD-tree point: a window of consecutive column values.
using win_type = std::vector<T>;
// Per-datapoint anomaly scores, as returned by get_result().
using result_type = std::vector<T, typename allocator_declare<T, A>::type>;
// Indices of datapoints judged anomalous, as returned by get_anomalous_indices().
using index_vec_t = std::vector<size_type, typename allocator_declare<size_type, A>::type>;
// User-supplied distance between two points.
// NOTE(review): the default, def_dist, is declared on tree_point_t rather than
// win_type — presumably the two are interchangeable; confirm against the header.
using distance_func = std::function<T(const win_type &x, const win_type &y)>;
// window:    KD-tree dimension; each datapoint is viewed as a window of values.
//            Keep it small — a large dimension makes the KD-tree less effective.
// k:         Number of nearest neighbors used to estimate local rarity/density.
// norm_type: Optional normalization applied to the data (default: none).
// f:         Distance function (default: def_dist, Euclidean distance).
AnomalyDetectByKNNVisitor(size_type window,
size_type k,
normalization_type norm_type = normalization_type::none,
distance_func &&f = def_dist);
window: This is the KD-tree dimension. A larger dimension makes the KD-tree less effective,
so the window should stay small to keep the KD-tree viable.
The window decides what a point looks like.
k: This is the number of neighbors used to estimate rarity. K is about local density,
and is independent of the time series length or the window size.
K grows with the dataset size, not the dimension. K decides how lonely a point is.
norm_type: Normalization type. The default is no normalization.
f: Function to calculate the distance between two datapoints.
// Default distance: Euclidean (L2) distance between two equal-size points.
static double def_dist(const tree_point_t &x, const tree_point_t &y) { double sum { 0 }; const size_type sz { x.size() }; for (size_type i { 0 }; i < sz; ++i) { const double diff { double(x[i]) - double(y[i]) }; sum += diff * diff; } return (std::sqrt(sum)); } |
T: Column data type. I: Index type. A: Memory alignment boundary for vectors. The default is the system default alignment. |
static void test_AnomalyDetectByKNNVisitor() { std::cout << "\nTesting AnomalyDetectByKNNVisitor{ } ..." << std::endl; constexpr std::size_t item_cnt = 1024; ULDataFrame df; df.load_index(ULDataFrame::gen_sequence_index(0, item_cnt, 1)); std::vector<double> sine_col; sine_col.reserve(item_cnt); for (std::size_t i = 0; i < item_cnt; ++i) { sine_col.push_back(std::sin(2.0 * M_PI * i / 20.0)); // Base sine wave if (i % 31 == 0) sine_col.back() += 10.0; // Inject anomalies } df.load_column("sine col", std::move(sine_col)); and_knn_v<double> anomaly1 { 3, 4 }; df.single_act_visit<double>("sine col", anomaly1); const auto anomalous_indices1 = anomaly1.get_anomalous_indices(); assert(anomaly1.get_result().size() == 1024); assert(anomalous_indices1.size() == 34); assert(anomalous_indices1[0] == 0); assert(anomalous_indices1[1] == 31); assert(anomalous_indices1[2] == 62); assert(anomalous_indices1[17] == 527); assert(anomalous_indices1[22] == 682); assert(anomalous_indices1[32] == 992); assert(anomalous_indices1[33] == 1023); // Now do the same thing for IBM market data // StrDataFrame ibm; try { ibm.read("IBM.csv", io_format::csv2); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; ::exit(-1); } ibm.get_column<double>("IBM_Adj_Close")[502] = 800.0; ibm.get_column<double>("IBM_Adj_Close")[1001] = 900.0; ibm.get_column<double>("IBM_Adj_Close")[2002] = 850.0; and_knn_v<double, std::string> anomaly2 { 3, 4, normalization_type::z_score }; ibm.single_act_visit<double>("IBM_Adj_Close", anomaly2); const auto anomalous_indices2 = anomaly2.get_anomalous_indices(0.9); assert(anomalous_indices2.size() == 3); assert(anomalous_indices2[0] == 502); assert(anomalous_indices2[1] == 1001); assert(anomalous_indices2[2] == 2002); // Now multidimensional data // constexpr std::size_t dim { 3 }; constexpr std::size_t n { 120 }; using ary_col_t = std::array<double, dim>; using vec_col_t = std::vector<double>; std::vector<vec_col_t> vec_col; vec_col.reserve(n); // Normal 
region 1: cluster near (1, 2, 3) // for (std::size_t i { 0 }; i < 40; ++i) { double t { i * 0.15 }; vec_col.push_back({ 1.0 + 0.1 * std::sin(t), 2.0 + 0.1 * std::cos(t), 3.0 + 0.05 * t }); } // *** Anomaly A: sudden spike on all 3 axes *** // vec_col.push_back({ 9.5, -7.0, 15.0 }); // idx 40 vec_col.push_back({ 10.0, -8.0, 16.0 }); // idx 41 vec_col.push_back({ 9.8, -7.5, 15.5 }); // idx 42 // Normal region 2: cluster near (-1, 0, 1) // for (std::size_t i { 0 }; i < 35; ++i) { double t { i * 0.15 }; vec_col.push_back({ -1.0 + 0.1 * std::sin(t), 0.0 + 0.1 * std::cos(t), 1.0 + 0.05 * t }); } // *** Anomaly B: one axis goes wild, others stay normal *** // vec_col.push_back({ -1.0, 0.0, 50.0 }); // idx 78 — z-axis outlier vec_col.push_back({ -1.1, 0.1, 52.0 }); // idx 79 // Normal region 3: cluster near (3, 3, 3) // for (std::size_t i { 0 }; i < 38; ++i) { double t { i * 0.15 }; vec_col.push_back({ 3.0 + 0.1 * std::sin(t), 3.0 + 0.1 * std::cos(t), 3.0 + 0.05 * t }); } // *** Anomaly C: isolated single point far from everything *** // vec_col.push_back({ -20.0, 20.0, -20.0 }); // idx 118 // Final normal cap // vec_col.push_back({ 3.0, 3.0, 3.0 }); std::vector<ary_col_t> ary_col(n); // Copy the vector of vectors to vector of arrays // for (std::size_t i { 0 }; i < vec_col.size(); ++i) { const auto &vec = vec_col[i]; ary_col_t ary; for (std::size_t j { 0 }; j < vec.size(); ++j) ary[j] = vec[j]; ary_col[i] = std::move(ary); } df.load_column<ary_col_t>("ARY COL", std::move(ary_col), nan_policy::dont_pad_with_nans); df.load_column<vec_col_t>("VEC COL", std::move(vec_col), nan_policy::dont_pad_with_nans); and_knn_v<vec_col_t> vec_knn { 4, 5 }; and_knn_v<ary_col_t> ary_knn { 4, 5 }; df.single_act_visit<vec_col_t>("VEC COL", vec_knn); df.single_act_visit<ary_col_t>("ARY COL", ary_knn); const auto &anom_idxs_vec { vec_knn.get_anomalous_indices(0.5) }; const auto &anom_idxs_ary { ary_knn.get_anomalous_indices(0.5) }; // Why didn't all anomalies get caught: // 1. 
Anomaly A (indices 40–42) is a cluster of 3 similar points. When the // tree scores index 41, its nearest neighbors are indices 40 and 42 — // they're close to each other, so the average KNN distance is small. // A cluster of outliers looks locally dense to KNN. This is a known // fundamental limitation of KNN anomaly detection. // 2. Anomaly C (index 118) is a single point but near the end of the // series. With window=4, the buckets containing index 118 have very // few neighbors in the tree overall, and the score gets averaged down // by expand_scores_ across the window, diluting the signal. // 3. The three normal clusters are far apart from each other — (1,2,3), // (-1,0,1), (3,3,3). This raises the baseline KNN distances for normal // points near cluster boundaries, compressing the relative contrast // with anomalies. // assert(anom_idxs_vec.size() == 3); assert(anom_idxs_ary.size() == 3); assert(anom_idxs_vec[0] == 41); assert(anom_idxs_vec[1] == 79); assert(anom_idxs_vec[2] == 118); assert(anom_idxs_ary[0] == 41); assert(anom_idxs_ary[1] == 79); assert(anom_idxs_ary[2] == 118); }