| Signature | Description | Parameters |
|---|---|---|
template<std::size_t K, arithmetic T, typename ... Ts> std::array<DataFrame, K> get_data_by_kmeans(const char *col_name, std::function<double(const T &x, const T &y)> &&dfunc = [](const T &x, const T &y) -> double { return ((x - y) * (x - y)); }, size_type num_of_iter = 1000, seed_t seed = seed_t(-1)) const; |
This uses the k-means clustering algorithm to divide the named column into K clusters. It returns a std::array of K DataFrames, each containing one of the clusters of data based on the named column. Self is unchanged. NOTE: Type T must support arithmetic operations. |
K: Number of clusters for the k-means clustering algorithm<br>T: Type of the named column<br>Ts: The list of types for all columns. A type should be specified only once<br>col_name: Name of the data column<br>dfunc: A function to calculate the distance between two data points in the named column<br>num_of_iter: Maximum number of iterations for the k-means clustering algorithm before converging<br>seed: Seed for the random number generator used to initialize the k-means clustering algorithm. Default is a random number for each call. |
template<std::size_t K, arithmetic T, typename ... Ts> std::array<PtrView, K> get_view_by_kmeans(const char *col_name, std::function<double(const T &x, const T &y)> &&dfunc = [](const T &x, const T &y) -> double { return ((x - y) * (x - y)); }, size_type num_of_iter = 1000, seed_t seed = seed_t(-1)); |
This is identical to get_data_by_kmeans() above, but it returns a std::array of K views (PtrView) instead of K DataFrames.
|
K: Number of clusters for the k-means clustering algorithm<br>T: Type of the named column<br>Ts: The list of types for all columns. A type should be specified only once<br>col_name: Name of the data column<br>dfunc: A function to calculate the distance between two data points in the named column<br>num_of_iter: Maximum number of iterations for the k-means clustering algorithm before converging<br>seed: Seed for the random number generator used to initialize the k-means clustering algorithm. Default is a random number for each call. |
template<std::size_t K, arithmetic T, typename ... Ts> std::array<ConstPtrView, K> get_view_by_kmeans(const char *col_name, std::function<double(const T &x, const T &y)> &&dfunc = [](const T &x, const T &y) -> double { return ((x - y) * (x - y)); }, size_type num_of_iter = 1000, seed_t seed = seed_t(-1)) const; |
Same as the view version above, but it returns a std::array of K const views. You cannot change data through a const view. But if the data is changed in the original DataFrame or through another view, the change is reflected in the const view. |
K: Number of clusters for the k-means clustering algorithm<br>T: Type of the named column<br>Ts: The list of types for all columns. A type should be specified only once<br>col_name: Name of the data column<br>dfunc: A function to calculate the distance between two data points in the named column<br>num_of_iter: Maximum number of iterations for the k-means clustering algorithm before converging<br>seed: Seed for the random number generator used to initialize the k-means clustering algorithm. Default is a random number for each call. |
static void test_get_data_by_kmeans() { std::cout << "\nTesting get_data_by_kmeans( ) ..." << std::endl; typedef StdDataFrame64<std::string> StrDataFrame; StrDataFrame df; try { df.read("SHORT_IBM.dat", io_format::binary); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; } StrDataFrame df2 = df; auto lbd = [](const std::string &, const double &) -> bool { return (true); }; auto view = df2.get_view_by_sel<double, decltype(lbd), double, long>("IBM_Open", lbd); auto result_df = df.get_data_by_kmeans <4, double, double, long>("IBM_Close", [](const double &x, const double &y) -> double { return (std::fabs(x - y)); }, 1000, // Number of iterations 1234); // Random number seed auto result_view = view.get_view_by_kmeans<4, double, double, long>("IBM_Close", [](const double &x, const double &y) -> double { return (std::fabs(x - y)); }, 1000, // Number of iterations 1234); // Random number seed assert(result_df.size() == 4); assert(result_df.size() == result_view.size()); assert(result_df[0].get_index().size() == 272); assert(result_df[0].get_column<double>("IBM_Open").size() == 272); assert(result_df[0].get_index()[0] == "2014-01-02"); assert(result_df[0].get_index()[271] == "2017-04-07"); assert(result_df[0].get_column<double>("IBM_High")[200] == 182.839996); assert(result_df[0].get_column<long>("IBM_Volume")[100] == 3721600); assert(result_view[0].get_index().size() == 272); assert(result_view[0].get_column<double>("IBM_Open").size() == 272); assert(result_view[0].get_index()[0] == "2014-01-02"); assert(result_view[0].get_index()[271] == "2017-04-07"); assert(result_view[0].get_column<double>("IBM_High")[200] == 182.839996); assert(result_view[0].get_column<long>("IBM_Volume")[100] == 3721600); assert(result_df[1].get_index().size() == 585); assert(result_df[1].get_column<double>("IBM_Open").size() == 585); assert(result_df[1].get_index()[0] == "2014-10-20"); assert(result_df[1].get_index()[584] == "2020-02-21"); 
assert(result_df[1].get_column<double>("IBM_High")[200] == 153.100006); assert(result_df[1].get_column<long>("IBM_Volume")[100] == 3749600); assert(result_view[1].get_index().size() == 585); assert(result_view[1].get_column<double>("IBM_Open").size() == 585); assert(result_view[1].get_index()[0] == "2014-10-20"); assert(result_view[1].get_index()[584] == "2020-02-21"); assert(result_view[1].get_column<double>("IBM_High")[200] == 153.100006); assert(result_view[1].get_column<long>("IBM_Volume")[100] == 3749600); assert(result_df[2].get_index().size() == 258); assert(result_df[2].get_column<double>("IBM_Open").size() == 258); assert(result_df[2].get_index()[0] == "2016-01-15"); assert(result_df[2].get_index()[257] == "2020-10-30"); assert(result_df[2].get_column<double>("IBM_High")[200] == 127.239998); assert(result_df[2].get_column<long>("IBM_Volume")[100] == 12502100); assert(result_view[2].get_index().size() == 258); assert(result_view[2].get_column<double>("IBM_Open").size() == 258); assert(result_view[2].get_index()[0] == "2016-01-15"); assert(result_view[2].get_index()[257] == "2020-10-30"); assert(result_view[2].get_column<double>("IBM_High")[200] == 127.239998); assert(result_view[2].get_column<long>("IBM_Volume")[100] == 12502100); assert(result_df[3].get_index().size() == 606); assert(result_df[3].get_column<double>("IBM_Open").size() == 606); assert(result_df[3].get_index()[0] == "2015-08-21"); assert(result_df[3].get_index()[605] == "2020-10-08"); assert(result_df[3].get_column<double>("IBM_High")[200] == 145.880005); assert(result_df[3].get_column<long>("IBM_Volume")[100] == 4386200); assert(result_view[3].get_index().size() == 606); assert(result_view[3].get_column<double>("IBM_Open").size() == 606); assert(result_view[3].get_index()[0] == "2015-08-21"); assert(result_view[3].get_index()[605] == "2020-10-08"); assert(result_view[3].get_column<double>("IBM_High")[200] == 145.880005); assert(result_view[3].get_column<long>("IBM_Volume")[100] == 4386200); }