Back to Documentations

Signature Description
// Kernel (weighting) functions that can be selected for mean-shift
// clustering. In the formulas below, d is the distance value the kernel
// is applied to and e^(...) denotes the exponential function.
//
enum class  mean_shift_kernel : unsigned char  {

    // if d <= 1 then 1 else 0
    //
    uniform = 1,

    // if d <= 1 then 1 - abs(d) else 0
    //
    triangular = 2,

    // if d <= 1 then 1 - d * d else 0
    //
    parabolic = 3,

    // x = 1 - d * d
    // if d <= 1 then x * x else 0
    //
    biweight = 4,

    // x = 1 - d * d
    // if d <= 1 then x * x * x else 0
    //
    triweight = 5,

    // x = 1 - d * d * d
    // if d <= 1 then x * x * x else 0
    //
    tricube = 6,

    // e^(-0.5 * d * d)
    //
    gaussian = 7,

    // if d <= 1 then cos(M_PI_2 * d) else 0
    //
    cosin = 8,

    // 1 / (2 + e^d + e^(-d))
    //
    logistic = 9,

    // 1.0 / (e^d + e^(-d))
    //
    sigmoid = 10,

    // x = M_SQRT1_2 * abs(d)
    // e^(-x) * sin(x + M_PI_4)
    //
    silverman = 11,
};
Kernel is a fancy mathematical name for a weight assigned to a distance between datapoints.

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

// Mean-shift clustering visitor.
//
// T: Column data type
// I: Index type (defaults to unsigned long)
// A: Memory alignment boundary for vectors. The default of 0 means the
//    system default alignment
//
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct MeanShiftVisitor;
This is a single action visitor, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

Mean-Shift is an unsupervised-learning clustering algorithm that assigns data points to clusters iteratively by shifting points towards the mode (in the context of Mean-Shift, the mode is the region with the highest density of data points). As such, it is also known as the mode-seeking algorithm.
Runtime complexity is O(I * n^2), where I is the number of iterations.
This works with both scalar and multidimensional (i.e. vector and arrays) datasets.

The constructor takes 4 parameters
  1. Kernel bandwidth refers to the width or spread of the kernel function used in mean shift clustering
  2. The distance used to determine if a datapoint is in the same area as other datapoints
  3. Kernel method specified above. Kernel is a fancy mathematical name for a weight assigned to a distance between datapoints
  4. Maximum number of iterations before it converges
// Signature of the function used to measure the distance between two
// datapoints x and y; it returns the distance as a double.
//
using distance_func = std::function<double(const T &x, const T &y)>;

// kernel_bandwidth: Width or spread of the kernel function used in mean
//                   shift clustering
// max_dist:         Distance used to determine if a datapoint is in the
//                   same area as other datapoints
// kernel:           Kernel method; defaults to mean_shift_kernel::gaussian
// max_iteration:    Maximum number of iterations; defaults to 50
//
MeanShiftVisitor(double kernel_bandwidth,
                 double max_dist,
                 mean_shift_kernel kernel = mean_shift_kernel::gaussian,
                 size_type max_iteration = 50);

// Default distance function for scalar datasets.
// Returns the squared difference of x and y converted to double; note that
// no square root is taken here.
//
[](const T &x, const T &y) -> double  {
   return (static_cast<double>((x - y) * (x - y)));
}

// Default distance function for multidimensional datasets(vectors/arrays).
// Returns the square root of the sum of squared element-wise differences
// between x and y (i.e. the Euclidean distance).
// Only x.size() bounds the loop, so y must have at least as many elements
// as x.
//
[](const T &x, const T &y) -> double  {
   double  sum { 0 };

   for (size_type i { 0 }; i < size_type(x.size()); ++i)  {
       const double    diff { x[i] - y[i] };

       // Accumulate the squared difference of each dimension
       sum += diff * diff;
   }
   return (std::sqrt(sum));
};
get_result() Returns a vector of vectors containing datapoint values of each cluster.
get_clusters_idxs() Returns a vector of vectors containing indices to datapoints of each cluster.
set_dist_func(distance_func &&f) Replaces the default distance function with f.
T: Column data type
I: Index type
A: Memory alignment boundary for vectors. Default is system default alignment
static void test_MeanShiftVisitor()  {

    std::cout << "\nTesting MeanShiftVisitor{ } ..." << std::endl;

    StrDataFrame    df;

    try  {
        df.read("SHORT_IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    MeanShiftVisitor<double, std::string>   mshift(1.0, 4, mean_shift_kernel::gaussian);

    mshift.set_dist_func([](const double &x, const double &y) -> double  {
                             return (std::fabs(x - y));
                         });
    df.single_act_visit<double>("IBM_Close", mshift);

    assert(mshift.get_result().size() == 19);
    assert(mshift.get_result()[0].size() == 106);
    assert(mshift.get_result()[4].size() == 19);
    assert(mshift.get_result()[6].size() == 274);
    assert(mshift.get_result()[10].size() == 180);
    assert(mshift.get_result()[14].size() == 29);
    assert(mshift.get_result()[18].size() == 2);
    assert(std::fabs(mshift.get_result()[0][6] - 184.16) < 0.001);
    assert(std::fabs(mshift.get_result()[4][18] - 194.0) < 0.001);
    assert(std::fabs(mshift.get_result()[6][273] - 154.31) < 0.001);
    assert(std::fabs(mshift.get_result()[10][135] - 137.61) < 0.001);
    assert(std::fabs(mshift.get_result()[18][1] - 94.77) < 0.001);

    // Now multidimensional data
    //
    RandGenParams<double>   p;

    p.seed = 123;
    p.min_value = -20.0;
    p.max_value = 20.0;

    using col_t = std::array<double, 3>;

    auto                rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * 3, p);
    std::vector<col_t>  multi_dimen_col(df.get_index().size());

    for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i)  {
        multi_dimen_col[i][0] = rand_vec[j++];
        multi_dimen_col[i][1] = rand_vec[j++];
        multi_dimen_col[i][2] = rand_vec[j++];
    }
    df.load_column<col_t>("multi_dimen_col", std::move(multi_dimen_col));

    MeanShiftVisitor<col_t, std::string>    md_mshift(1.0, 10, mean_shift_kernel::sigmoid);

    df.single_act_visit<col_t>("multi_dimen_col", md_mshift);

    const auto  &md_clusters = md_mshift.get_result();

    assert(md_clusters.size() == 53); // Number of clusters

    assert(md_clusters[0].size() == 74);
    assert(std::fabs(md_clusters[0][6][1] - -1.8807) < 0.0001);

    assert(md_clusters[28].size() == 36);
    assert(std::fabs(md_clusters[28][3][0] - 12.6347) < 0.0001);

    assert(md_clusters[52].size() == 1);
    assert(std::fabs(md_clusters[52][0][2] - 19.2094) < 0.0001);
}

C++ DataFrame