
Signature:
template<typename T>
using KNNPair = std::pair<std::vector<T>, std::size_t>;  // neighbor's values, 0-based index

template<typename T>
using KNNResult = std::vector<KNNPair<T>>;

template<typename T>
using KNNDistFunc =
    std::function<T(const std::vector<T> &X, const std::vector<T> &y)>;

// ---------------------------

template<typename T>
KNNResult<T>
knn(std::vector<const char *> &&col_names,
    const std::vector<T> &target,
    size_type k,
    KNNDistFunc<T> &&dfunc =
        // Default distance: Euclidean
        [](const std::vector<T> &X, const std::vector<T> &y) -> T  {
            T   dist { 0 };

            for (std::size_t i { 0 }; const auto &xval : X)  {
                const T &yval = y[i++];

                dist += (xval - yval) * (xval - yval);
            }
            return (std::sqrt(dist));
        }
    ) const;
Description:
This method implements the K-Nearest Neighbors (KNN) algorithm. KNN is a supervised machine-learning technique that uses proximity between data points to classify or predict them.
KNN can be used for both classification and regression; this method is agnostic about the user's intent. It simply finds the k nearest neighbors of the given target point and returns them in a vector of pairs, sorted from nearest to furthest. Each pair contains the values of the neighbor (first element) and the 0-based index of the neighbor in your dataset (second element).
It is up to the user to process this result for their purpose. They can calculate an average or weighted average of the neighbors for prediction, or use the indices into their categorical data for classification, as sketched below.
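For example, once the result is obtained (as in the test at the bottom of this page), it can be consumed like the following sketch. The "IBM_Close" column comes from that test's data; the "IBM_Label" string column is purely hypothetical and only illustrates classification by majority vote (it would need <map> and <string>):

// Regression sketch: predict by averaging a dependent column over
// the k returned neighbors; idx is the 0-based row index.
const auto  &closes = df.get_column<double>("IBM_Close");
double      prediction { 0 };

for (const auto &[neighbor_vals, idx] : result)
    prediction += closes[idx];
prediction /= double(result.size());

// Classification sketch: majority vote over a hypothetical
// categorical column named "IBM_Label".
const auto                  &labels = df.get_column<std::string>("IBM_Label");
std::map<std::string, int>  votes;

for (const auto &[neighbor_vals, idx] : result)
    votes[labels[idx]] += 1;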
Parameters:
T: Type of the named columns
col_names: Vector of column names of the independent features
target: The dependent feature, i.e. the point whose nearest neighbors are sought; it must contain one value per named column
k: The number of nearest neighbors to return (the K parameter)
dfunc: A function to calculate the distance between two feature vectors. The default is Euclidean distance; see the sketch after this list for a custom alternative
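Any callable matching KNNDistFunc<T> can be passed as dfunc. As a sketch, the following call replaces the default Euclidean distance with Manhattan (taxicab) distance; the column names and target values match the test below:

const auto  result =
    df.knn<double>({ "IBM_Open", "IBM_High", "IBM_Low", "IBM_Close" },
                   { 78.95, 80.48, 78.35, 80.48 },
                   4,
                   [](const std::vector<double> &X,
                      const std::vector<double> &y) -> double  {
                       double  dist { 0 };

                       // Manhattan distance: sum of absolute differences
                       for (std::size_t i { 0 }; i < X.size(); ++i)
                           dist += std::fabs(X[i] - y[i]);
                       return dist;
                   });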
static void test_knn()  {

    std::cout << "\nTesting knn( ) ..." << std::endl;

    StrDataFrame    df;

    try  {
        df.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    const auto  result =
        df.knn<double>({ "IBM_Open", "IBM_High", "IBM_Low", "IBM_Close" },
                       { 78.95, 80.48, 78.35, 80.48 }, 4);

    assert(result.size() == 4);

    assert(result[0].second == 500);  // Index into the IBM data columns
    assert(result[0].first.size() == 4);
    assert((std::fabs(result[0].first[0] - 78.9) < 0.01));
    assert((std::fabs(result[0].first[2] - 78.32) < 0.01));
    assert((std::fabs(result[0].first[3] - 80.4) < 0.01));

    assert(result[1].second == 541);  // Index into the IBM data columns
    assert(result[1].first.size() == 4);
    assert((std::fabs(result[1].first[0] - 78.8) < 0.01));
    assert((std::fabs(result[1].first[2] - 78.19) < 0.01));
    assert((std::fabs(result[1].first[3] - 80.57) < 0.01));

    assert(result[2].second == 558);  // Index into the IBM data columns
    assert(result[2].first.size() == 4);
    assert((std::fabs(result[2].first[0] - 78.5) < 0.01));
    assert((std::fabs(result[2].first[2] - 78.36) < 0.01));
    assert((std::fabs(result[2].first[3] - 80.11) < 0.01));

    assert(result[3].second == 1232);  // Index into the IBM data columns
    assert(result[3].first.size() == 4);
    assert((std::fabs(result[3].first[0] - 79.25) < 0.01));
    assert((std::fabs(result[3].first[2] - 78.87) < 0.01));
    assert((std::fabs(result[3].first[3] - 80.36) < 0.01));
}
