Back to Documentations

Signature Description Parameters
template<std::size_t K, arithmetic T, typename ... Ts>
std::array<DataFrame, K>
get_data_by_kmeans(const char *col_name,
                   std::function<double(const T &x, const T &y)> &&dfunc =
                       [](const T &x, const T &y) -> double  {
                           return ((x - y) * (x - y));
                       },
                   size_type num_of_iter = 1000,
                   seed_t seed = seed_t(-1)) const;
This uses the k-means clustering algorithm to divide the named column into K clusters. It returns a std::array of K DataFrames, each containing one of the clusters of data based on the named column.
Self is unchanged.

NOTE: Type T must support arithmetic operations.
K: Number of clusters for k-means clustering algorithm
T: Type of the named column
Ts: The list of types for all columns. A type should be specified only once
col_name: Name of the data column
dfunc: A function to calculate the distance between two data points in the named column
num_of_iter: Maximum number of iterations for k-means clustering algorithm before converging
seed: Seed for random number generator to initialize k-means clustering algorithm. Default is a random number for each call.
template<std::size_t K, arithmetic T, typename ... Ts>
std::array<PtrView, K>
get_view_by_kmeans(const char *col_name,
                   std::function<double(const T &x, const T &y)> &&dfunc =
                       [](const T &x, const T &y) -> double  {
                           return ((x - y) * (x - y));
                       },
                   size_type num_of_iter = 1000,
                   seed_t seed = seed_t(-1));
This is identical to get_data_by_kmeans() above, but:
  1. The result is a std::array of K views
  2. Since the result is a view, you cannot call make_consistent() on the result.
NOTE: There are certain operations that you cannot do with a view. For example, you cannot add/delete columns, etc.
K: Number of clusters for k-means clustering algorithm
T: Type of the named column
Ts: The list of types for all columns. A type should be specified only once
col_name: Name of the data column
dfunc: A function to calculate the distance between two data points in the named column
num_of_iter: Maximum number of iterations for k-means clustering algorithm before converging
seed: Seed for random number generator to initialize k-means clustering algorithm. Default is a random number for each call.
template<std::size_t K, arithmetic T, typename ... Ts>
std::array<ConstPtrView, K>
get_view_by_kmeans(const char *col_name,
                   std::function<double(const T &x, const T &y)> &&dfunc =
                       [](const T &x, const T &y) -> double  {
                           return ((x - y) * (x - y));
                       },
                   size_type num_of_iter = 1000,
                   seed_t seed = seed_t(-1)) const;
Same as the view version above, but it returns a std::array of K const views. You cannot change data in const views. But if the data is changed in the original DataFrame or through another view, it is reflected in the const view.
K: Number of clusters for k-means clustering algorithm
T: Type of the named column
Ts: The list of types for all columns. A type should be specified only once
col_name: Name of the data column
dfunc: A function to calculate the distance between two data points in the named column
num_of_iter: Maximum number of iterations for k-means clustering algorithm before converging
seed: Seed for random number generator to initialize k-means clustering algorithm. Default is a random number for each call.
static void test_get_data_by_kmeans()  {

    std::cout << "\nTesting get_data_by_kmeans( ) ..." << std::endl;

    typedef StdDataFrame64<std::string> StrDataFrame;

    StrDataFrame    df;

    try  {
        df.read("SHORT_IBM.dat", io_format::binary);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
    }

    StrDataFrame    df2 = df;

    auto    lbd = [](const std::string &, const double &) -> bool { return (true); };
    auto    view = df2.get_view_by_sel<double, decltype(lbd), double, long>("IBM_Open", lbd);

    auto    result_df =
        df.get_data_by_kmeans <4, double, double, long>("IBM_Close",
                                                        [](const double &x, const double &y) -> double  {
                                                            return (std::fabs(x - y));
                                                        },
                                                        1000,  // Number of iterations
                                                        1234); // Random number seed
    auto    result_view =
        view.get_view_by_kmeans<4, double, double, long>("IBM_Close",
                                                         [](const double &x, const double &y) -> double  {
                                                             return (std::fabs(x - y));
                                                         },
                                                         1000,  //  Number of iterations
                                                         1234); // Random number seed

    assert(result_df.size() == 4);
    assert(result_df.size() == result_view.size());

    assert(result_df[0].get_index().size() == 272);
    assert(result_df[0].get_column<double>("IBM_Open").size() == 272);
    assert(result_df[0].get_index()[0] == "2014-01-02");
    assert(result_df[0].get_index()[271] == "2017-04-07");
    assert(result_df[0].get_column<double>("IBM_High")[200] == 182.839996);
    assert(result_df[0].get_column<long>("IBM_Volume")[100] == 3721600);
    assert(result_view[0].get_index().size() == 272);
    assert(result_view[0].get_column<double>("IBM_Open").size() == 272);
    assert(result_view[0].get_index()[0] == "2014-01-02");
    assert(result_view[0].get_index()[271] == "2017-04-07");
    assert(result_view[0].get_column<double>("IBM_High")[200] == 182.839996);
    assert(result_view[0].get_column<long>("IBM_Volume")[100] == 3721600);

    assert(result_df[1].get_index().size() == 585);
    assert(result_df[1].get_column<double>("IBM_Open").size() == 585);
    assert(result_df[1].get_index()[0] == "2014-10-20");
    assert(result_df[1].get_index()[584] == "2020-02-21");
    assert(result_df[1].get_column<double>("IBM_High")[200] == 153.100006);
    assert(result_df[1].get_column<long>("IBM_Volume")[100] == 3749600);
    assert(result_view[1].get_index().size() == 585);
    assert(result_view[1].get_column<double>("IBM_Open").size() == 585);
    assert(result_view[1].get_index()[0] == "2014-10-20");
    assert(result_view[1].get_index()[584] == "2020-02-21");
    assert(result_view[1].get_column<double>("IBM_High")[200] == 153.100006);
    assert(result_view[1].get_column<long>("IBM_Volume")[100] == 3749600);

    assert(result_df[2].get_index().size() == 258);
    assert(result_df[2].get_column<double>("IBM_Open").size() == 258);
    assert(result_df[2].get_index()[0] == "2016-01-15");
    assert(result_df[2].get_index()[257] == "2020-10-30");
    assert(result_df[2].get_column<double>("IBM_High")[200] == 127.239998);
    assert(result_df[2].get_column<long>("IBM_Volume")[100] == 12502100);
    assert(result_view[2].get_index().size() == 258);
    assert(result_view[2].get_column<double>("IBM_Open").size() == 258);
    assert(result_view[2].get_index()[0] == "2016-01-15");
    assert(result_view[2].get_index()[257] == "2020-10-30");
    assert(result_view[2].get_column<double>("IBM_High")[200] == 127.239998);
    assert(result_view[2].get_column<long>("IBM_Volume")[100] == 12502100);

    assert(result_df[3].get_index().size() == 606);
    assert(result_df[3].get_column<double>("IBM_Open").size() == 606);
    assert(result_df[3].get_index()[0] == "2015-08-21");
    assert(result_df[3].get_index()[605] == "2020-10-08");
    assert(result_df[3].get_column<double>("IBM_High")[200] == 145.880005);
    assert(result_df[3].get_column<long>("IBM_Volume")[100] == 4386200);
    assert(result_view[3].get_index().size() == 606);
    assert(result_view[3].get_column<double>("IBM_Open").size() == 606);
    assert(result_view[3].get_index()[0] == "2015-08-21");
    assert(result_view[3].get_index()[605] == "2020-10-08");
    assert(result_view[3].get_column<double>("IBM_High")[200] == 145.880005);
    assert(result_view[3].get_column<long>("IBM_Volume")[100] == 4386200);
}


C++ DataFrame