Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

// K: Number of means to find
// T: Column data type
// I: Index type (defaults to unsigned long)
// A: Memory alignment boundary for vectors; the default (0) is the system
//    default alignment
template<size_t K, typename T, typename I = unsigned long,
         std::size_t A = 0>
struct KMeansVisitor;
This is a single action visitor, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This functor class finds the K means in the data. It can also cluster the data around the means. It works with both scalar and multidimensional (i.e. vectors and arrays) datasets.
The constructor takes 3 parameters
  1. Number of iterations
  2. If calc_clusters is true -- default -- the actual clusters are also calculated in addition to the means
  3. A function to calculate distance between two data points of type T (with default)
// num_of_iter:   Number of iterations
// calc_clusters: If true (the default), the actual clusters are calculated in
//                addition to the means
// f:             Function that calculates the distance between two data points
//                of type T; the default is the squared difference (x - y)^2
KMeansVisitor(std::size_t num_of_iter,
              bool calc_clusters = true,
              distance_func f = [](const T &x, const T &y) -> double {
                                    return ((x - y) * (x - y));
                                });
get_result() returns an array of K means.
get_clusters() returns an array of K VectorPtrViews which contain the data clustered around the K-Means. The first element in each VectorPtrView is the mean and the rest are the data points belonging to that cluster.
get_clusters_idxs() returns an array of K std::vector<size_type>'s which contains indices of the data in each cluster.
K: Number of means to find
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
// A minimal 2-D point used as a user-defined column type in the k-means
// sample. It provides point addition, division by a scalar, and stream
// output (presumably the arithmetic the visitor needs to average points --
// the visitor implementation is not shown here).
//
struct  Point  {

    double  x { 0.0 };
    double  y { 0.0 };

    Point() = default;
    Point(double xx, double yy) : x(xx), y(yy)  {   }
    Point(const Point &) = default;
    Point &operator = (const Point &) = default;

    // Component-wise sum of two points
    //
    friend Point operator + (const Point &lhs, const Point &rhs)  {

        return (Point(lhs.x + rhs.x, lhs.y + rhs.y));
    }

    // Divides both coordinates by the scalar rhs
    //
    friend Point operator / (const Point &lhs, double rhs)  {

        return (Point(lhs.x / rhs, lhs.y / rhs));
    }

    // Writes the point as "x, y" and returns the stream that was passed in.
    // The stream object itself is returned rather than the result of the
    // insertion chain: the chain yields a reference to the base ostream,
    // which cannot bind to S & when S is a derived stream type such as
    // std::ostringstream or std::ofstream.
    //
    template<typename S>
    friend S &operator << (S &s, const Point &rhs)  {

        s << rhs.x << ", " << rhs.y;
        return (s);
    }
};

// Distance function for Points: the sum of the squared coordinate
// differences (no square root is taken).
//
static double point_distance(const Point &lhs, const Point &rhs)  {

    const double    dx = lhs.x - rhs.x;
    const double    dy = lhs.y - rhs.y;

    return (dx * dx + dy * dy);
}

// -------------------------------------

static void test_k_means()  {

    std::cout << "\nTesting k-means visitor ..." << std::endl;

    const size_t            item_cnt = 1024;
    MyDataFrame             df;
    RandGenParams<double>   p;

    p.mean = 1.0;  // Default
    p.std = 0.005;
    p.seed = 10;

    df.load_data(MyDataFrame::gen_sequence_index(0, item_cnt, 1), std::make_pair("col1", gen_lognormal_dist<double, 128>(item_cnt, p)));

    KMeansVisitor<5, double, unsigned long, 128>  km_visitor(1000, true,
                                                             [](const double &x, const double &y)  {
                                                                 return ((x - y) * (x - y));
                                                             },
                                                             10);

    df.single_act_visit<double>("col1", km_visitor);
    std::cout << "Means of clusters are: ";
    for (const auto citer : km_visitor.get_result())
        std::cout << citer << ", ";
    std::cout << std::endl;

    // Using the calculated means, separate the given column into clusters
    // const auto  &clusters = km_visitor.get_clusters();
    // bool        found = false;

    // for (auto iter : clusters)  {
    //     if (::fabs(iter[0] - 1.89348) < 0.00001)  {
    //         if (::fabs(iter[6] - 1.44231) < 0.00001)  {
    //             found = true;
    //             break;
    //         }
    //     }
    // }
    // assert(found);
    // found = false;
    // for (auto iter : clusters)  {
    //     if (::fabs(iter[0] - 0.593126) < 0.00001)  {
    //         if (::fabs(iter[2] - 0.950026) < 0.00001)  {
    //             found = true;
    //             break;
    //         }
    //     }
    // }
    // assert(found);
    // found = false;
    // for (auto iter : clusters)  {
    //     if (::fabs(iter[0] - 14.2245) < 0.0001)  {
    //         found = true;
    //         break;
    //     }
    // }
    // assert(found);
    // found = false;
    // for (auto iter : clusters)  {
    //     if (::fabs(iter[0] - 6.90427) < 0.00001)  {
    //         found = true;
    //         break;
    //     }
    // }
    // assert(found);
    // found = false;
    // for (auto iter : clusters)  {
    //     if (::fabs(iter[0] - 3.8146) < 0.00001)  {
    //         found = true;
    //         break;
    //     }
    // }
    // assert(found);

    // Now try with Points
    //
    p.seed = 200;

    auto    x_vec = gen_lognormal_dist<double, 128>(item_cnt, p);

    p.seed = 4356;

    auto                y_vec = gen_lognormal_dist<double, 128>(item_cnt, p);
    StlVecType<Point>   points;

    points.reserve(item_cnt);
    for (size_t i = 0; i < item_cnt; ++i)
        points.push_back(Point(x_vec[i], y_vec[i]));
    df.load_column<Point>("point_col", std::move(points));

    KMeansVisitor<5, Point, unsigned long, 128> km_visitor2(1000, true, point_distance, 10);

    df.single_act_visit<Point>("point_col", km_visitor2);

    // Using the calculated means, separate the given column into clusters
    //
    const auto  &clusters2 = km_visitor2.get_clusters();

    for (auto iter : clusters2)  {
        for (auto iter2 : iter)  {
            std::cout << iter2.x << " | " << iter2.y << ", ";
        }
        std::cout << "\n\n" << std::endl;
    }

    // found = false;
    // for (auto iter : clusters2)  {
    //     if (::fabs(iter[0].x - 18.9556) < 0.1 &&
    //         ::fabs(iter[0].y - 2.17537) < 0.1)  {
    //         if (::fabs(iter[6].x - 16.7309) < 0.1 &&
    //             ::fabs(iter[6].y - 0.872376) < 0.1)  {
    //             found = true;
    //             break;
    //         }
    //     }
    // }
    // assert(found);

    // found = false;
    // for (auto iter : clusters2)  {
    //     if (::fabs(iter[0].x - 0.943977) < 0.1 &&
    //         ::fabs(iter[0].y - 0.910989) < 0.1)  {
    //         if (::fabs(iter[2].x - 0.30509) < 0.1 &&
    //             ::fabs(iter[2].y - 1.69017) < 0.1)  {
    //             found = true;
    //             break;
    //         }
    //     }
    // }
    // assert(found);
    // found = false;
    // for (auto iter : clusters2)  {
    //     if (::fabs(iter[0].x - 4.31973) < 0.1 &&
    //         ::fabs(iter[0].y - 1.24214) < 0.1)  {
    //         if (::fabs(iter[3].x - 4.68381) < 0.1 &&
    //             ::fabs(iter[3].y - 0.453632) < 0.1)  {
    //             found = true;
    //             break;
    //         }
    //     }
    // }
    // assert(found);
    // found = false;
    // for (auto iter : clusters2)  {
    //     if (::fabs(iter[0].x - 1.5694) < 0.1 &&
    //         ::fabs(iter[0].y - 15.3338) < 0.1)  {
    //         found = true;
    //         break;
    //     }
    // }
    // assert(found);
    // found = false;
    // for (auto iter : clusters2)  {
    //     if (::fabs(iter[0].x - 1.29624) < 0.1 &&
    //         ::fabs(iter[0].y - 4.13919) < 0.1)  {
    //         found = true;
    //         break;
    //     }
    // }
    // assert(found);

    // Now try with multidimensional dataset (vector of arrays)
    //
    RandGenParams<double>   p2;

    p2.seed = 123;
    p2.min_value = -20.0;
    p2.max_value = 20.0;

    using col_t = std::array<double, 3>;

    auto                rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * 3, p2);
    StlVecType<col_t>   multi_dimen_col(df.get_index().size());
    auto                dist_func = [](const col_t &x, const col_t &y) -> double  {
                                        double  sum { 0 };

                                        for (std::size_t i { 0 }; i < x.size(); ++i)  {
                                            const double    diff { x[i] - y[i] };

                                            sum += diff * diff;
                                        }
                                        return (std::sqrt(sum));
                                    };

    for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i)  {
        multi_dimen_col[i][0] = rand_vec[j++];
        multi_dimen_col[i][1] = rand_vec[j++];
        multi_dimen_col[i][2] = rand_vec[j++];
    }
    df.load_column<col_t>("multi_dimen_col", std::move(multi_dimen_col));

    KMeansVisitor<4, col_t, unsigned long, 128> kmean(1000, true, dist_func);

    df.single_act_visit<col_t>("multi_dimen_col", kmean);

    assert(kmean.get_clusters_idxs().size() == 4);
    for (const auto &mean : kmean.get_result())
        std::cout << mean << "\n\n";
}

C++ DataFrame