Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

template<size_t K, typename T, typename I = unsigned long,
         std::size_t A = 0>
struct SpectralClusteringVisitor;
This is a single action visitor, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

Spectral clustering is a graph-based algorithm for finding k arbitrarily shaped clusters in data. The technique involves representing the data in a low dimension. In the low dimension, clusters in the data are more widely separated, enabling you to use KMeans clustering. This low dimension is based on eigenvectors of a Laplacian matrix. A Laplacian matrix is one way of representing a similarity graph that models the local neighborhood relationships between data points as an undirected graph. You can use spectral clustering when you know the number of clusters, but the algorithm also provides a way to estimate the number of clusters in your data.
Spectral clustering is very interesting and different from other clustering algorithms. Other clustering algorithms cluster the data, one way or another, based on proximity of data. But spectral algorithm clusters based on patterns. It doesn’t cluster the actual data. It clusters the Eigenvectors of the Laplacian matrix. The downside is it requires very intensive calculations and it might be slow for large datasets.
This works with both scalar and multidimensional (i.e. vector and arrays) datasets.

The constructor takes 4 parameters
  1. Number of iterations for KMeans calculations
  2. sigma -- used in a Gaussian kernel function, that controls the "width" of the neighborhood considered when calculating similarities between datapoints; essentially, how far apart two points can be while still being considered "similar" to each other
  3. A function to calculate similarity between two datapoints of type T (with default)
  4. Seed for random number generation. The defualt is to generate different set each time.
using similarity_func = std::function<double(const T &x, const T &y, double sigma)>;

SpectralClusteringVisitor(size_type num_of_iter,
                          double sigma,
                          seed_t seed = seed_t(-1));

// Default similarity function for scalar datasets
//
[](const T &x, const T &y, double sigma) -> double  {
    const double    diff { x - y };

    return (std::exp(-(diff * diff) / (2 * sigma * sigma)));
};

// Default similarity function for multidimensional datasets (vectors/arrays)
//
[](const T &x, const T &y, double sigma) -> double  {
    double sum { 0 };

    for (size_type i { 0 }; i < x.size(); ++i)  {
        const double    diff { x[i] - y[i] };

        sum += diff * diff;
    }
    return (std::exp(-sum / (2 * sigma * sigma)));
};
get_results() Returns a vector of vectors containing datapoint values of each cluster.
get_clusters_idxs() Returns an array of K std::vector<size_type>'s which contains indices of the data in each cluster.
set_sim_func(similarity_func &&f) Resets the default similarity function.

NOTE: There is no interface to return the K means/centroids. The means are the means of the eigenvectors of the Laplacian matrix which are not particularly useful.
K: Number of centroids to find
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
static void test_SpectralClusteringVisitor()  {

    std::cout << "\nTesting SpectralClusteringVisitor{ } ..." << std::endl;

    StrDataFrame    df;

    try  {
        df.read("SHORT_IBM.csv", io_format::csv2, { .starting_row = 0, .num_rows = 1000 });
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    spect_v<4, double, std::string> spc(1000, 7, 123);

    df.single_act_visit<double>("IBM_Close", spc);

    const auto  &clusters = spc.get_result();

    assert(clusters.size() == 4);

    assert(clusters[0].size() == 89);
    assert(clusters[1].size() == 688);
    assert(clusters[2].size() == 208);
    assert(clusters[3].size() == 15);

    assert(std::fabs(clusters[0][0] - 177.9) < 0.001);
    assert(std::fabs(clusters[0][23] - 171.12) < 0.001);
    assert(std::fabs(clusters[0][61] - 177.18) < 0.001);
    assert(std::fabs(clusters[0][88] - 170.05) < 0.001);

    assert(std::fabs(clusters[1][0] - 169.1) < 0.001);
    assert(std::fabs(clusters[1][300] - 140.19) < 0.001);
    assert(std::fabs(clusters[1][542] - 152.51) < 0.001);
    assert(std::fabs(clusters[1][687] - 153.23) < 0.001);

    assert(std::fabs(clusters[2][0] - 185.53) < 0.001);
    assert(std::fabs(clusters[2][100] - 181.22) < 0.001);
    assert(std::fabs(clusters[2][200] - 179.4) < 0.001);
    assert(std::fabs(clusters[2][207] - 179.45) < 0.001);

    assert(std::fabs(clusters[3][0] - 121.86) < 0.001);
    assert(std::fabs(clusters[3][8] - 124.83) < 0.001);
    assert(std::fabs(clusters[3][12] - 120.19) < 0.001);
    assert(std::fabs(clusters[3][14] - 122.74) < 0.001);

    // Now multidimensional data
    //
    RandGenParams<double>   p;

    p.seed = 123;
    p.min_value = -20.0;
    p.max_value = 20.0;

    using col_t = std::array<double, 3>;

    auto               rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * 3, p);
   std::vector<col_t>  multi_dimen_col(df.get_index().size());

    for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i)  {
        multi_dimen_col[i][0] = rand_vec[j++];
        multi_dimen_col[i][1] = rand_vec[j++];
        multi_dimen_col[i][2] = rand_vec[j++];
    }
    df.load_column<col_t>("multi_dimen_col", std::move(multi_dimen_col));

    spect_v<4, col_t, std::string>  ml_spc(1000, 7, 123);

    // This is the default similarity function for multidimensional data.
    // But I am setting it here for testing and illustration.
    //
    ml_spc.set_sim_func([](const col_t &x, const col_t &y, double sigma) -> double  {
                            double sum { 0 };

                            for (std::size_t i { 0 }; i < x.size(); ++i)  {
                                const double    diff { x[i] - y[i] };

                                sum += diff * diff;
                            }
                            return (std::exp(-sum / (2 * sigma * sigma)));
                        });
    df.single_act_visit<col_t>("multi_dimen_col", ml_spc);

    const auto  &ml_clusters = ml_spc.get_result();

    assert(ml_clusters.size() == 4);
    assert(ml_spc.get_clusters_idxs()[0].size() == 327);
    assert(ml_spc.get_clusters_idxs()[1].size() == 206);
    assert(ml_spc.get_clusters_idxs()[2].size() == 253);
    assert(ml_spc.get_clusters_idxs()[3].size() == 214);
    assert((ml_spc.get_clusters_idxs()[0].size() +
            ml_spc.get_clusters_idxs()[1].size() +
            ml_spc.get_clusters_idxs()[2].size() +
            ml_spc.get_clusters_idxs()[3].size() == df.get_index().size()));

    assert(std::fabs(ml_spc.get_result()[0][5][0] - 16.4466) < 0.0001);
    assert(std::fabs(ml_spc.get_result()[0][5][1] - -2.12685) < 0.00001);
    assert(std::fabs(ml_spc.get_result()[0][5][2] - 8.58622) < 0.00001);
    assert(std::fabs(ml_spc.get_result()[2][5][0] - 5.05986) < 0.00001);
    assert(std::fabs(ml_spc.get_result()[2][5][1] - -1.57193) < 0.00001);
    assert(std::fabs(ml_spc.get_result()[2][5][2] - -7.64881) < 0.00001);
    assert(std::fabs(ml_spc.get_result()[3][7][0] - -19.7594) < 0.0001);
    assert(std::fabs(ml_spc.get_result()[3][7][1] - 8.27358) < 0.00001);
    assert(std::fabs(ml_spc.get_result()[3][7][2] - -6.11494) < 0.00001);
}

C++ DataFrame