| Signature | Description |
|---|---|
// Kernel (weighting) functions that can be selected for the mean-shift
// algorithm. In the formulas below, d is the distance for which the weight is
// being computed, and e is the base of the natural logarithm.
enum class mean_shift_kernel : unsigned char {

    // if d <= 1 then 1 else 0
    //
    uniform = 1,

    // if d <= 1 then 1 - abs(d) else 0
    //
    triangular = 2,

    // if d <= 1 then 1 - d * d else 0
    //
    parabolic = 3,

    // x = 1 - d * d
    // if d <= 1 then x * x else 0
    //
    biweight = 4,

    // x = 1 - d * d
    // if d <= 1 then x * x * x else 0
    //
    triweight = 5,

    // x = 1 - d * d * d
    // if d <= 1 then x * x * x else 0
    //
    tricube = 6,

    // e^(-0.5 * d * d)
    //
    gaussian = 7,

    // if d <= 1 then cos(M_PI_2 * d) else 0
    //
    cosin = 8,

    // 1 / (2 + e^d + e^(-d))
    //
    logistic = 9,

    // 1.0 / (e^d + e^(-d))
    //
    sigmoid = 10,

    // x = M_SQRT1_2 * abs(d)
    // e^(-x) * sin(x + M_PI_4)
    //
    silverman = 11,
};
Kernel is a fancy mathematical name for a weight assigned to a distance between datapoints. |
| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameMLVisitors.h> template<typename T, typename I = unsigned long, std::size_t A = 0> struct MeanShiftVisitor; |
This is a single action visitor, meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. Mean-Shift is an unsupervised-learning clustering algorithm that assigns the data points to clusters iteratively by shifting points towards the mode (in the context of Mean-Shift, the mode is the highest density of data points in the region). As such, it is also known as the mode-seeking algorithm. Runtime complexity is O(I * n^2), where I is the number of iterations. This works with both scalar and multidimensional (i.e. vectors and arrays) datasets. The constructor takes 4 parameters:
using distance_func = std::function<double(const T &x, const T &y)>; MeanShiftVisitor(double kernel_bandwidth, double max_dist, mean_shift_kernel kernel = mean_shift_kernel::gaussian, size_type max_iteration = 50); // Default distance function for scalar datasets // [](const T &x, const T &y) -> double { return (static_cast<double>((x - y) * (x - y))); } // Default distance function for multidimensional datasets (vectors/arrays) // [](const T &x, const T &y) -> double { double sum { 0 }; for (size_type i { 0 }; i < size_type(x.size()); ++i) { const double diff { x[i] - y[i] }; sum += diff * diff; } return (std::sqrt(sum)); }; get_result(): Returns a vector of vectors containing the datapoint values of each cluster. get_clusters_idxs(): Returns a vector of vectors containing the indices of the datapoints in each cluster. set_dist_func(distance_func &&f): Replaces the default distance function with f. |
T: Column data type. I: Index type. A: Memory alignment boundary for vectors; the default is the system default alignment. |
static void test_MeanShiftVisitor() { std::cout << "\nTesting MeanShiftVisitor{ } ..." << std::endl; StrDataFrame df; try { df.read("SHORT_IBM.csv", io_format::csv2); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; ::exit(-1); } MeanShiftVisitor<double, std::string> mshift(1.0, 4, mean_shift_kernel::gaussian); mshift.set_dist_func([](const double &x, const double &y) -> double { return (std::fabs(x - y)); }); df.single_act_visit<double>("IBM_Close", mshift); assert(mshift.get_result().size() == 19); assert(mshift.get_result()[0].size() == 106); assert(mshift.get_result()[4].size() == 19); assert(mshift.get_result()[6].size() == 274); assert(mshift.get_result()[10].size() == 180); assert(mshift.get_result()[14].size() == 29); assert(mshift.get_result()[18].size() == 2); assert(std::fabs(mshift.get_result()[0][6] - 184.16) < 0.001); assert(std::fabs(mshift.get_result()[4][18] - 194.0) < 0.001); assert(std::fabs(mshift.get_result()[6][273] - 154.31) < 0.001); assert(std::fabs(mshift.get_result()[10][135] - 137.61) < 0.001); assert(std::fabs(mshift.get_result()[18][1] - 94.77) < 0.001); // Now multidimensional data // RandGenParams<double> p; p.seed = 123; p.min_value = -20.0; p.max_value = 20.0; using col_t = std::array<double, 3>; auto rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * 3, p); std::vector<col_t> multi_dimen_col(df.get_index().size()); for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i) { multi_dimen_col[i][0] = rand_vec[j++]; multi_dimen_col[i][1] = rand_vec[j++]; multi_dimen_col[i][2] = rand_vec[j++]; } df.load_column<col_t>("multi_dimen_col", std::move(multi_dimen_col)); MeanShiftVisitor<col_t, std::string> md_mshift(1.0, 10, mean_shift_kernel::sigmoid); df.single_act_visit<col_t>("multi_dimen_col", md_mshift); const auto &md_clusters = md_mshift.get_result(); assert(md_clusters.size() == 53); // Number of clusters assert(md_clusters[0].size() == 74); assert(std::fabs(md_clusters[0][6][1] - -1.8807) < 
0.0001); assert(md_clusters[28].size() == 36); assert(std::fabs(md_clusters[28][3][0] - 12.6347) < 0.0001); assert(md_clusters[52].size() == 1); assert(std::fabs(md_clusters[52][0][2] - 19.2094) < 0.0001); }