Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

// Visitor that scores each datapoint of a column for abnormality using
// K-Nearest Neighbors.
//
// T: Column data type
// I: Index type
// A: Memory alignment boundary for vectors. 0 (the default) means system
//    default alignment
//
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct AnomalyDetectByKNNVisitor;

// -------------------------------------

// Short alias for AnomalyDetectByKNNVisitor; takes the same template
// parameters with the same defaults.
//
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
using and_knn_v = AnomalyDetectByKNNVisitor<T, I, A>;
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This visitor performs anomaly detection using K-Nearest Neighbors (KNN), with a kd-tree used for the neighbor search. Time-series anomaly detection with KNN works by identifying data points that are far from their neighbors and treating large distances as anomalies. It is an unsupervised method: each point gets an outlier score (often the distance to its k-th neighbor), and higher scores signal greater abnormality. This makes it useful for detecting sudden spikes or deviations from a pattern, for example in predictive maintenance or process monitoring.

get_result(): Returns a vector of scores for each datapoint in the column. Higher values mean more abnormality.
get_anomalous_indices(T threshold = T(1e-7)): Returns the indices of anomalous datapoints by analyzing the scores. You can do more elaborate analysis by looking at the scores directly.

// A plain vector of T; this is the argument type handed to the distance
// function.
using win_type = std::vector<T>;
// Vector of T using the allocator selected by A.
// NOTE(review): presumably the type returned by get_result() -- confirm
using result_type = std::vector<T, typename allocator_declare<T, A>::type>;
// Vector of size_type using the allocator selected by A.
// NOTE(review): presumably the type returned by get_anomalous_indices() --
// confirm
using index_vec_t = std::vector<size_type, typename allocator_declare<size_type, A>::type>;
// Callable that returns the distance between two win_type values.
using distance_func = std::function<T(const win_type &x, const win_type &y)>;

// window:    KDTree dimension
// k:         Number of neighbors used to estimate rarity
// norm_type: Normalization type; defaults to no normalization
// f:         Distance function between two datapoints. The default computes
//            the Euclidean distance (square root of the sum of squared
//            element-wise differences). It iterates over a.size() elements
//            and indexes b with the same positions, so it relies on b having
//            at least as many elements as a.
//
AnomalyDetectByKNNVisitor(
    size_type window,
    size_type k,
    normalization_type norm_type = normalization_type::none,
    distance_func &&f = [](const win_type &a, const win_type &b) -> T {
                            T                sum { 0 };
                            const size_type  sz { a.size() };

                            for (size_type i { 0 }; i < sz; ++i)  {
                                const T diff { a[i] - b[i] };

                                sum += diff * diff;
                            }
                            return (std::sqrt(sum));
                        });

window: This is the KDTree dimension. A larger dimension makes the KDTree less effective,
        so keep the window small to keep the KD-tree viable.
        The window decides what a point looks like.
k: This is the number of neighbors used to estimate rarity. K is about local density
   and is independent of the time series length or window size.
   K grows with dataset size, not dimension. K decides how lonely a point is.
norm_type: Normalization type. The default is no normalization.
f: Function to calculate distance between two datapoints
    
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
static void test_AnomalyDetectByKNNVisitor()  {

    std::cout << "\nTesting AnomalyDetectByKNNVisitor{ } ..." << std::endl;

    constexpr std::size_t   item_cnt = 1024;
    ULDataFrame             df;

    df.load_index(ULDataFrame::gen_sequence_index(0, item_cnt, 1));

    std::vector<double>   sine_col;

    sine_col.reserve(item_cnt);
    for (std::size_t i = 0; i < item_cnt; ++i)  {
        sine_col.push_back(std::sin(2.0 * M_PI * i / 20.0)); // Base sine wave
        if (i % 31 == 0)  sine_col.back() += 10.0;  // Inject anomalies
    }
    df.load_column("sine col", std::move(sine_col));

    and_knn_v<double>   anomaly1 { 3, 4 };

    df.single_act_visit<double>("sine col", anomaly1);

    const auto  anomalous_indices1 = anomaly1.get_anomalous_indices();

    assert(anomaly1.get_result().size() == 1024);
    assert(anomalous_indices1.size() == 34);
    assert(anomalous_indices1[0] == 0);
    assert(anomalous_indices1[1] == 31);
    assert(anomalous_indices1[2] == 62);
    assert(anomalous_indices1[17] == 527);
    assert(anomalous_indices1[22] == 682);
    assert(anomalous_indices1[32] == 992);
    assert(anomalous_indices1[33] == 1023);

    // Now do the same thing for IBM market data
    //
    StrDataFrame    ibm;

    try  {
        ibm.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }
    ibm.get_column<double>("IBM_Adj_Close")[502] = 800.0;
    ibm.get_column<double>("IBM_Adj_Close")[1001] = 900.0;
    ibm.get_column<double>("IBM_Adj_Close")[2002] = 850.0;

    and_knn_v<double, std::string>  anomaly2 { 3, 4, normalization_type::z_score };

    ibm.single_act_visit<double>("IBM_Adj_Close", anomaly2);

    const auto  anomalous_indices2 = anomaly2.get_anomalous_indices(0.9);

    assert(anomalous_indices2.size() == 3);
    assert(anomalous_indices2[0] == 502);
    assert(anomalous_indices2[1] == 1001);
    assert(anomalous_indices2[2] == 2002);
}

C++ DataFrame