| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameMLVisitors.h> template<size_t K, typename T, typename I = unsigned long, std::size_t A = 0> struct KMeansVisitor; |
This is a single action visitor, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This functor class finds the K means in the data. It can also cluster the data around the means. This works with both scalar and multidimensional (i.e. vectors and arrays) datasets. The constructor takes 3 parameters:
KMeansVisitor(std::size_t num_of_iter, bool calc_clusters = true, distance_func f = [](const T &x, const T &y) -> double { return ((x - y) * (x - y)); }); get_result() returns an array of K means. get_clusters() returns an array of K VectorPtrView's which contain the data clustered around the K-Means. The first element in each VectorPtrView is the mean and the rest are the data points belonging to that cluster. get_clusters_idxs() returns an array of K std::vector<size_type>'s which contain the indices of the data in each cluster. |
K: Number of means to find. T: Column data type. I: Index type. A: Memory alignment boundary for vectors. Default is system default alignment. |
struct Point { double x { 0.0 }; double y { 0.0 }; Point() = default; Point(double xx, double yy) : x(xx), y(yy) { } Point(const Point &) = default; Point &operator = (const Point &) = default; friend Point operator + (const Point &lhs, const Point &rhs) { return (Point(lhs.x + rhs.x, lhs.y + rhs.y)); } friend Point operator / (const Point &lhs, double rhs) { return (Point(lhs.x / rhs, lhs.y / rhs)); } template<typename S> friend S &operator << (S &s, const Point &rhs) { return (s << rhs.x << ", " << rhs.y); } }; static double point_distance(const Point &lhs, const Point &rhs) { return ((lhs.x - rhs.x) * (lhs.x - rhs.x) + (lhs.y - rhs.y) * (lhs.y - rhs.y)); } // ------------------------------------- static void test_k_means() { std::cout << "\nTesting k-means visitor ..." << std::endl; const size_t item_cnt = 1024; MyDataFrame df; RandGenParams<double> p; p.mean = 1.0; // Default p.std = 0.005; p.seed = 10; df.load_data(MyDataFrame::gen_sequence_index(0, item_cnt, 1), std::make_pair("col1", gen_lognormal_dist<double, 128>(item_cnt, p))); KMeansVisitor<5, double, unsigned long, 128> km_visitor(1000, true, [](const double &x, const double &y) { return ((x - y) * (x - y)); }, 10); df.single_act_visit<double>("col1", km_visitor); std::cout << "Means of clusters are: "; for (const auto citer : km_visitor.get_result()) std::cout << citer << ", "; std::cout << std::endl; // Using the calculated means, separate the given column into clusters // const auto &clusters = km_visitor.get_clusters(); // bool found = false; // for (auto iter : clusters) { // if (::fabs(iter[0] - 1.89348) < 0.00001) { // if (::fabs(iter[6] - 1.44231) < 0.00001) { // found = true; // break; // } // } // } // assert(found); // found = false; // for (auto iter : clusters) { // if (::fabs(iter[0] - 0.593126) < 0.00001) { // if (::fabs(iter[2] - 0.950026) < 0.00001) { // found = true; // break; // } // } // } // assert(found); // found = false; // for (auto iter : clusters) { // if (::fabs(iter[0] - 
14.2245) < 0.0001) { // found = true; // break; // } // } // assert(found); // found = false; // for (auto iter : clusters) { // if (::fabs(iter[0] - 6.90427) < 0.00001) { // found = true; // break; // } // } // assert(found); // found = false; // for (auto iter : clusters) { // if (::fabs(iter[0] - 3.8146) < 0.00001) { // found = true; // break; // } // } // assert(found); // Now try with Points // p.seed = 200; auto x_vec = gen_lognormal_dist<double, 128>(item_cnt, p); p.seed = 4356; auto y_vec = gen_lognormal_dist<double, 128>(item_cnt, p); StlVecType<Point> points; points.reserve(item_cnt); for (size_t i = 0; i < item_cnt; ++i) points.push_back(Point(x_vec[i], y_vec[i])); df.load_column<Point>("point_col", std::move(points)); KMeansVisitor<5, Point, unsigned long, 128> km_visitor2(1000, true, point_distance, 10); df.single_act_visit<Point>("point_col", km_visitor2); // Using the calculated means, separate the given column into clusters // const auto &clusters2 = km_visitor2.get_clusters(); for (auto iter : clusters2) { for (auto iter2 : iter) { std::cout << iter2.x << " | " << iter2.y << ", "; } std::cout << "\n\n" << std::endl; } // found = false; // for (auto iter : clusters2) { // if (::fabs(iter[0].x - 18.9556) < 0.1 && // ::fabs(iter[0].y - 2.17537) < 0.1) { // if (::fabs(iter[6].x - 16.7309) < 0.1 && // ::fabs(iter[6].y - 0.872376) < 0.1) { // found = true; // break; // } // } // } // assert(found); // found = false; // for (auto iter : clusters2) { // if (::fabs(iter[0].x - 0.943977) < 0.1 && // ::fabs(iter[0].y - 0.910989) < 0.1) { // if (::fabs(iter[2].x - 0.30509) < 0.1 && // ::fabs(iter[2].y - 1.69017) < 0.1) { // found = true; // break; // } // } // } // assert(found); // found = false; // for (auto iter : clusters2) { // if (::fabs(iter[0].x - 4.31973) < 0.1 && // ::fabs(iter[0].y - 1.24214) < 0.1) { // if (::fabs(iter[3].x - 4.68381) < 0.1 && // ::fabs(iter[3].y - 0.453632) < 0.1) { // found = true; // break; // } // } // } // assert(found); // 
found = false; // for (auto iter : clusters2) { // if (::fabs(iter[0].x - 1.5694) < 0.1 && // ::fabs(iter[0].y - 15.3338) < 0.1) { // found = true; // break; // } // } // assert(found); // found = false; // for (auto iter : clusters2) { // if (::fabs(iter[0].x - 1.29624) < 0.1 && // ::fabs(iter[0].y - 4.13919) < 0.1) { // found = true; // break; // } // } // assert(found); // Now try with multidimensional dataset (vector of arrays) // RandGenParams<double> p2; p2.seed = 123; p2.min_value = -20.0; p2.max_value = 20.0; using col_t = std::array<double, 3>; auto rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * 3, p2); StlVecType<col_t> multi_dimen_col(df.get_index().size()); auto dist_func = [](const col_t &x, const col_t &y) -> double { double sum { 0 }; for (std::size_t i { 0 }; i < x.size(); ++i) { const double diff { x[i] - y[i] }; sum += diff * diff; } return (std::sqrt(sum)); }; for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i) { multi_dimen_col[i][0] = rand_vec[j++]; multi_dimen_col[i][1] = rand_vec[j++]; multi_dimen_col[i][2] = rand_vec[j++]; } df.load_column<col_t>("multi_dimen_col", std::move(multi_dimen_col)); KMeansVisitor<4, col_t, unsigned long, 128> kmean(1000, true, dist_func); df.single_act_visit<col_t>("multi_dimen_col", kmean); assert(kmean.get_clusters_idxs().size() == 4); for (const auto &mean : kmean.get_result()) std::cout << mean << "\n\n"; }