Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct DBSCANVisitor;
This is a single action visitor, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

Density-Based Spatial Clustering of Applications with Noise (DBSCAN) is a data clustering algorithm proposed by Martin Ester, Hans-Peter Kriegel, Jörg Sander, and Xiaowei Xu in 1996. It is a non-parametric, density-based clustering algorithm: given a set of points in some space, it groups together points that are closely packed (points with many nearby neighbors), and marks as outliers points that lie alone in low-density regions (those whose nearest neighbors are too far away). DBSCAN is one of the most commonly used and cited clustering algorithms.
This works with both scalar and multidimensional (i.e. vectors and arrays) datasets.

The constructor takes 2 parameters
  1. Minimum number of datapoints to constitute a cluster
  2. The distance used to determine if a data point is in the same area as other data points
using distance_func = std::function<double(const T &x, const T &y)>;

DBSCANVisitor(id_t min_mems, double max_dist);

// Default distance function for scalar datasets.
// Returns the squared difference of the two values, i.e. (x - y)^2,
// converted to double.
//
[](const T &x, const T &y) -> double  {
   return (static_cast<double>((x - y) * (x - y)));
}

// Default distance function for multidimensional datasets(vectors/arrays).
// Returns the Euclidean distance: the square root of the sum of squared
// element-wise differences. The loop runs over x.size() elements, so y is
// expected to have at least as many elements as x.
//
[](const T &x, const T &y) -> double  {
   double  sum { 0 };

   for (size_type i { 0 }; i < size_type(x.size()); ++i)  {
       const double    diff { x[i] - y[i] };

       sum += diff * diff;
   }
   return (std::sqrt(sum));
};
get_result() Returns a vector of vectors containing datapoint values of each cluster.
get_clusters_idxs() Returns a vector of vectors containing indices to datapoints of each cluster.
get_noisey_idxs() Returns a vector containing indices to datapoints that could not be placed in any cluster. Ideally you want this to be empty.
set_dist_func(distance_func &&f) Replaces the default distance function with the given function f.
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
static void test_DBSCANVisitor()  {

    std::cout << "\nTesting DBSCANVisitor{ } ..." << std::endl;

    StrDataFrame    df;

    try  {
        df.read("SHORT_IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    auto    lbd = [](const std::string &, const double &) -> bool { return (true); };
    auto    view = df.get_view_by_sel<double, decltype(lbd), double, long>("IBM_Open", lbd);

    DBSCANVisitor<double, std::string>  dbscan(10, 4);

    // Set a different distance function than default
    //
    dbscan.set_dist_func([](const double &x, const double &y)  {
                             return (std::fabs(x - y));
                         });

    view.single_act_visit<double>("IBM_Close", dbscan);

    assert(dbscan.get_noisey_idxs().size() == 2);
    assert(dbscan.get_noisey_idxs()[0] == 1564);
    assert(dbscan.get_noisey_idxs()[1] == 1565);

    assert(dbscan.get_result().size() == 19);
    assert(dbscan.get_result()[0].size() == 11);
    assert(dbscan.get_result()[4].size() == 31);
    assert(dbscan.get_result()[10].size() == 294);
    assert(dbscan.get_result()[14].size() == 82);
    assert(dbscan.get_result()[18].size() == 10);
    assert(dbscan.get_result()[0][6] == 185.679993);
    assert(dbscan.get_result()[4][18] == 167.330002);
    assert(dbscan.get_result()[10][135] == 145.160004);
    assert(dbscan.get_result()[18][3] == 103.550003);

    // Now multidimensional data
    //
    RandGenParams<double>   p;

    p.seed = 123;
    p.min_value = -20.0;
    p.max_value = 20.0;

    using col_t = std::array<double, 3>;

    auto    rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * 3, p);

    std::vector<col_t>  multi_dimen_col(df.get_index().size());

    for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i)  {
        multi_dimen_col[i][0] = rand_vec[j++];
        multi_dimen_col[i][1] = rand_vec[j++];
        multi_dimen_col[i][2] = rand_vec[j++];
    }
    df.load_column<col_t>("multi_dimen_col", std::move(multi_dimen_col));

    DBSCANVisitor<col_t, std::string>   md_dbscan(10, 4);

    df.single_act_visit<col_t>("multi_dimen_col", md_dbscan);

    const auto  &md_clusters = md_dbscan.get_result();

    assert(md_clusters.size() == 102); // Number of clusters

    assert(md_clusters[0].size() == 14);
    assert(std::fabs(md_clusters[0][6][1] - -19.9438) < 0.0001);

    assert(md_clusters[58].size() == 12);
    assert(std::fabs(md_clusters[58][3][0] - -6.41034) < 0.00001);

    assert(md_clusters[101].size() == 10);
    assert(std::fabs(md_clusters[101][9][2] - -5.92195) < 0.00001);
}

C++ DataFrame