← Back to Documentations

Signature	Description	Parameters
template<typename T, typename ... Ts> std::vector<DataFrame> get_data_by_mshift(const char *col_name, double kernel_bandwidth, double max_distance, mean_shift_kernel kernel = mean_shift_kernel::gaussian, size_type num_of_iter = 50) const;	This uses the Mean-Shift algorithm to divide the named column into clusters. It returns a std::vector of DataFrames, each containing one of the clusters of data based on the named column. Unlike K-Means clustering, you do not have to specify the number of clusters. Self is unchanged. This works for both scalar and multidimensional (i.e. vectors/arrays) data types. NOTE: Currently this only uses the default distance functions in the MeanShift visitor.	T: Type of the named column Ts: The list of types for all columns. A type should be specified only once col_name: Name of the data column kernel_bandwidth: The width or spread of the kernel function used max_distance: Maximum distance between two data points in the same cluster kernel: Kernel type used num_of_iter: Maximum number of iterations for the Mean-Shift algorithm to converge
template<typename T, typename ... Ts> std::vector<PtrView> get_view_by_mshift(const char *col_name, double kernel_bandwidth, double max_distance, mean_shift_kernel kernel = mean_shift_kernel::gaussian, size_type num_of_iter = 50);	This is identical to get_data_by_mshift() above, but: The result is a std::vector of views. Since the result is a view, you cannot call make_consistent() on the result. NOTE: There are certain operations that you cannot do with a view. For example, you cannot add/delete columns, etc.	T: Type of the named column Ts: The list of types for all columns. A type should be specified only once col_name: Name of the data column kernel_bandwidth: The width or spread of the kernel function used max_distance: Maximum distance between two data points in the same cluster kernel: Kernel type used num_of_iter: Maximum number of iterations for the Mean-Shift algorithm to converge
template<typename T, typename ... Ts> std::vector<ConstPtrView> get_view_by_mshift(const char *col_name, double kernel_bandwidth, double max_distance, mean_shift_kernel kernel = mean_shift_kernel::gaussian, size_type num_of_iter = 50) const;	Same as the view version above, but it returns a std::vector of const views. You cannot change data in const views. But if the data is changed in the original DataFrame or through another view, it is reflected in the const view.	T: Type of the named column Ts: The list of types for all columns. A type should be specified only once col_name: Name of the data column kernel_bandwidth: The width or spread of the kernel function used max_distance: Maximum distance between two data points in the same cluster kernel: Kernel type used num_of_iter: Maximum number of iterations for the Mean-Shift algorithm to converge

void test_get_data_by_mshift()  {

    std::cout << "\nTesting get_data_by_mshift( ) ..." << std::endl;

    typedef StdDataFrame64<std::string> StrDataFrame;

    StrDataFrame    df;

    try  {
        df.read("SHORT_IBM.dat", io_format::binary);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    StrDataFrame    df2 = df;

    auto    lbd = [](const std::string &, const double &) -> bool { return (true); };
    auto    view = df2.get_view_by_sel<double, decltype(lbd), double, long>("IBM_Open", lbd);

    // I am using both views and dataframes to make sure both work
    //
    auto    views = view.get_view_by_mshift<double, double, long>("IBM_Close", 1, 4, mean_shift_kernel::gaussian);
    auto    dfs = df.get_data_by_mshift<double, double, long>("IBM_Close", 1, 4, mean_shift_kernel::gaussian);
   
    assert(views.size() == 38);
    assert(dfs.size() == 38);
    assert(views[0].get_index().size() == 56);
    assert(dfs[0].get_index().size() == 56);
    assert(views[4].get_index().size() == 20);
    assert(views[6].get_index().size() == 3);
    assert(views[10].get_index().size() == 45);
    assert(views[14].get_index().size() == 101);
    assert(views[18].get_index().size() == 164);
    assert(dfs[18].get_index().size() == 164);

    assert((std::fabs(views[0].get_column<double>("IBM_Close")[7] - 183.69) < 0.001));
    assert((std::fabs(dfs[5].get_column<double>("IBM_Open")[15] - 173.91) < 0.001));
    assert((std::fabs(views[16].get_column<double>("IBM_High")[3] - 166.02) < 0.001));
    assert(dfs[18].get_column<long>("IBM_Volume")[0] == 10189700);
    assert(views[18].get_index()[1] == "2015-09-01");
}