| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameMLVisitors.h> template<typename T, typename I = unsigned long, std::size_t A = 0> struct BIRCHVisitor; |
This is a single action visitor, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This functor implements the BIRCH (Balanced Iterative Reducing and Clustering using Hierarchies) clustering algorithm. BIRCH is particularly suitable for very large datasets. It ultimately uses the K-Means algorithm to calculate the centroids and clusters, but first it reduces the problem to a much smaller version by using a CFTree ( (Clustering Feature Tree). In other words, it is an unsupervised data mining method designed for efficiently clustering very large, multi-dimensional datasets, particularly when memory resources are limited. It achieves speed and memory efficiency by summarizing data into a compact data structure called a Clustering Feature (CF) tree, typically in a single scan of the dataset. This works with both scalar and multidimensional (i.e. vector and arrays) datasets. The constructor takes 6 parameters:
using distance_func = std::function<double(const T &x, const T &y)>; BIRCHVisitor(size_type k, double threshold, size_type max_entries = 1000, bool calc_clusters = true, size_type max_iter = 1000); // Default distance function for scalar datasets // [](const T &x, const T &y) -> double { return (static_cast<double>(std::abs(x - y))); } // Default distance function for multidimensional datasets(vectors/arrays) // [](const T &x, const T &y) -> double { double sum { 0 }; for (size_type i { 0 }; i < size_type(x.size()); ++i) { const double diff { x[i] - y[i] }; sum += diff * diff; } return (std::sqrt(sum)); }get_results() Returns an array of K means (centroids). get_clusters_idxs() Returns a vector of K std::vector<size_type>'s which contains indices of the data column for each cluster. set_dist_func(distance_func &&f) Resets the default distance function. |
T: Column data type I: Index type A: Memory alignment boundary for vectors. Default is system default alignment |
static void test_BIRCHVisitor() { std::cout << "\nTesting BIRCHVisitor{ } ..." << std::endl; ULDataFrame df; try { df.read("FORD.csv", io_format::csv2); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; ::exit(-1); } BIRCHVisitor<double> birch(4, 2.5); df.single_act_visit<double>("FORD_Close", birch); // The centroids // assert(birch.get_result().size() == 4); assert(std::fabs(birch.get_result()[0] - 3.02152) < 0.00001); assert(std::fabs(birch.get_result()[1] - 9.35769) < 0.00001); assert(std::fabs(birch.get_result()[2] - 14.2126) < 0.0001); assert(std::fabs(birch.get_result()[3] - 28.1059) < 0.0001); const auto &close = df.get_column<double>("FORD_Close"); // Print the clusters // for (const auto &vec : birch.get_clusters_idxs()) { for (const auto &idx : vec) std::cout << close[idx] << ", "; std::cout << "\n\n\n"; } std::cout << std::endl; // Now multidimensional data // RandGenParams<double> p; p.seed = 123; p.min_value = -20.0; p.max_value = 20.0; using col_t = std::array<double, 3>; auto rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * 3, p); std::vector<col_t> multi_dimen_col(df.get_index().size()); for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i) { multi_dimen_col[i][0] = rand_vec[j++]; multi_dimen_col[i][1] = rand_vec[j++]; multi_dimen_col[i][2] = rand_vec[j++]; } df.load_column<col_t>("multi_dimen_col", std::move(multi_dimen_col)); BIRCHVisitor<col_t> birch2(4, 2.5); // This is the default distance function for multidimensional data. // But I am setting it here for testing and illustration. // birch2.set_dist_func( [](const col_t &x, const col_t &y) -> double { double sum { 0 }; for (std::size_t i { 0 }; i < x.size(); ++i) { const double diff { x[i] - y[i] }; sum += diff * diff; } return (std::sqrt(sum)); }); df.single_act_visit<col_t>("multi_dimen_col", birch2); assert(birch2.get_result().size() == 4); assert(std::fabs(birch2.get_result()[0][0] - -10.9757) < 0.0001); assert(std::fabs(birch2.get_result()[0][1] - -1.08913) < 0.00001); assert(std::fabs(birch2.get_result()[0][2] - 9.23815) < 0.00001); assert(std::fabs(birch2.get_result()[1][0] - -0.544063) < 0.000001); assert(std::fabs(birch2.get_result()[1][1] - 10.899) < 0.001); assert(std::fabs(birch2.get_result()[1][2] - -9.79122) < 0.00001); assert(std::fabs(birch2.get_result()[2][0] - 0.911082) < 0.000001); assert(std::fabs(birch2.get_result()[2][1] - -10.8084) < 0.0001); assert(std::fabs(birch2.get_result()[2][2] - -9.55188) < 0.00001); assert(std::fabs(birch2.get_result()[3][0] - 10.4331) < 0.0001); assert(std::fabs(birch2.get_result()[3][1] - 1.78207) < 0.00001); assert(std::fabs(birch2.get_result()[3][2] - 9.60288) < 0.00001); }