Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct  BIRCHVisitor;
This is a single action visitor, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This functor implements the BIRCH (Balanced Iterative Reducing and Clustering using Hierarchies) clustering algorithm. BIRCH is particularly suitable for very large datasets. It ultimately uses the K-Means algorithm to calculate the centroids and clusters, but first it reduces the problem to a much smaller version by using a CFTree ( (Clustering Feature Tree). In other words, it is an unsupervised data mining method designed for efficiently clustering very large, multi-dimensional datasets, particularly when memory resources are limited. It achieves speed and memory efficiency by summarizing data into a compact data structure called a Clustering Feature (CF) tree, typically in a single scan of the dataset. This works with both scalar and multidimensional (i.e. vector and arrays) datasets.
The constructor takes 6 parameters:
  1. k: Number of clusters used by KMeans algorithm
  2. threshold: It is the maximum radius (standard deviation) allowed for a CF (Clustering Feature) entry
  3. max_entries: It is the maximum number of CF (Clustering Feature) entries that can be stored in the flat CF-Tree before it needs to be rebuilt
  4. calc_clusters: If it is true, in addition to centroids, it also calculates the indices to the original data column for each cluster
  5. max_iter: It is the maximum number of iterations for KMeans before it converges
using distance_func = std::function<double(const T &x, const T &y)>;

BIRCHVisitor(size_type k,
             double threshold,
             size_type max_entries = 1000,
             bool calc_clusters = true,
             size_type max_iter = 1000);

// Default distance function for scalar datasets
//
[](const T &x, const T &y) -> double  {
   return (static_cast<double>(std::abs(x - y)));
}

// Default distance function for multidimensional datasets(vectors/arrays)
//
[](const T &x, const T &y) -> double  {
   double  sum { 0 };

   for (size_type i { 0 }; i < size_type(x.size()); ++i)  {
       const double    diff { x[i] - y[i] };

       sum += diff * diff;
   }
   return (std::sqrt(sum));
}
get_results() Returns an array of K means (centroids).
get_clusters_idxs() Returns a vector of K std::vector<size_type>'s which contains indices of the data column for each cluster.
set_dist_func(distance_func &&f) Resets the default distance function.
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
static void test_BIRCHVisitor()  {

    std::cout << "\nTesting BIRCHVisitor{ } ..." << std::endl;

    ULDataFrame df;

    try  {
        df.read("FORD.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    BIRCHVisitor<double>    birch(4, 2.5);

    df.single_act_visit<double>("FORD_Close", birch);

    // The centroids
    //
    assert(birch.get_result().size() == 4);
    assert(std::fabs(birch.get_result()[0] - 3.02152) < 0.00001);
    assert(std::fabs(birch.get_result()[1] - 9.35769) < 0.00001);
    assert(std::fabs(birch.get_result()[2] - 14.2126) < 0.0001);
    assert(std::fabs(birch.get_result()[3] - 28.1059) < 0.0001);

    const auto  &close = df.get_column<double>("FORD_Close");

    // Print the clusters
    //
    for (const auto &vec : birch.get_clusters_idxs())  {
        for (const auto &idx : vec)
            std::cout << close[idx] << ", ";
        std::cout << "\n\n\n";
    }
    std::cout << std::endl;

    // Now multidimensional data
    //
    RandGenParams<double>   p;

    p.seed = 123;
    p.min_value = -20.0;
    p.max_value = 20.0;

    using col_t = std::array<double, 3>;

    auto    rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * 3, p);

    std::vector<col_t>  multi_dimen_col(df.get_index().size());

    for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i)  {
        multi_dimen_col[i][0] = rand_vec[j++];
        multi_dimen_col[i][1] = rand_vec[j++];
        multi_dimen_col[i][2] = rand_vec[j++];
    }
    df.load_column<col_t>("multi_dimen_col", std::move(multi_dimen_col));

    BIRCHVisitor<col_t> birch2(4, 2.5);

    // This is the default distance function for multidimensional data.
    // But I am setting it here for testing and illustration.
    //
    birch2.set_dist_func(
        [](const col_t &x, const col_t &y) -> double  {
            double  sum { 0 };

            for (std::size_t i { 0 }; i < x.size(); ++i)  {
                const double    diff { x[i] - y[i] };

                sum += diff * diff;
            }
            return (std::sqrt(sum));
        });
    df.single_act_visit<col_t>("multi_dimen_col", birch2);

    assert(birch2.get_result().size() == 4);
    assert(std::fabs(birch2.get_result()[0][0] - -10.9757) < 0.0001);
    assert(std::fabs(birch2.get_result()[0][1] - -1.08913) < 0.00001);
    assert(std::fabs(birch2.get_result()[0][2] - 9.23815) < 0.00001);

    assert(std::fabs(birch2.get_result()[1][0] - -0.544063) < 0.000001);
    assert(std::fabs(birch2.get_result()[1][1] - 10.899) < 0.001);
    assert(std::fabs(birch2.get_result()[1][2] - -9.79122) < 0.00001);

    assert(std::fabs(birch2.get_result()[2][0] - 0.911082) < 0.000001);
    assert(std::fabs(birch2.get_result()[2][1] - -10.8084) < 0.0001);
    assert(std::fabs(birch2.get_result()[2][2] - -9.55188) < 0.00001);

    assert(std::fabs(birch2.get_result()[3][0] - 10.4331) < 0.0001);
    assert(std::fabs(birch2.get_result()[3][1] - 1.78207) < 0.00001);
    assert(std::fabs(birch2.get_result()[3][2] - 9.60288) < 0.00001);
}

C++ DataFrame