| Signature | Description | Parameters |
|---|---|---|
template<std::size_t K, arithmetic T, typename ... Ts> std::array<DataFrame, K> get_data_by_spectral(const char *col_name, double sigma, seed_t seed = seed_t(-1), std::function<double(const T &x, const T &y, double sigma)> &&sfunc = [](const T &x, const T &y, double sigma) -> double { return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma))); }, size_type num_of_iter = 1000) const; |
This uses the spectral clustering algorithm to divide the named column into K clusters. It returns a std::array of K DataFrames, each containing one of the clusters of data based on the named column. Self is unchanged. NOTE: Type T must support arithmetic operations. |
K: Number of clusters for the k-means clustering algorithm. T: Type of the named column. Ts: The list of types for all columns. A type should be specified only once. col_name: Name of the data column. sigma: Scale parameter for the similarity function (the sigma argument of sfunc); in the default sfunc it is the width of the Gaussian kernel exp(-(x - y)^2 / (2 * sigma^2)). sfunc: A function to calculate the similarity matrix between data points in the named column. num_of_iter: Maximum number of iterations for the k-means clustering algorithm before converging. seed: Seed for the random number generator that initializes the k-means clustering algorithm. Default is a random number for each call. |
template<std::size_t K, arithmetic T, typename ... Ts> std::array<PtrView, K> get_view_by_spectral(const char *col_name, double sigma, seed_t seed = seed_t(-1), std::function<double(const T &x, const T &y, double sigma)> &&sfunc = [](const T &x, const T &y, double sigma) -> double { return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma))); }, size_type num_of_iter = 1000); |
This is identical to get_data_by_spectral() above, but it returns a std::array of K views (PtrView) instead of K DataFrames.
|
K: Number of clusters for the k-means clustering algorithm. T: Type of the named column. Ts: The list of types for all columns. A type should be specified only once. col_name: Name of the data column. sigma: Scale parameter for the similarity function (the sigma argument of sfunc); in the default sfunc it is the width of the Gaussian kernel exp(-(x - y)^2 / (2 * sigma^2)). sfunc: A function to calculate the similarity matrix between data points in the named column. num_of_iter: Maximum number of iterations for the k-means clustering algorithm before converging. seed: Seed for the random number generator that initializes the k-means clustering algorithm. Default is a random number for each call. |
template<std::size_t K, arithmetic T, typename ... Ts> std::array<ConstPtrView, K> get_view_by_spectral(const char *col_name, double sigma, seed_t seed = seed_t(-1), std::function<double(const T &x, const T &y, double sigma)> &&sfunc = [](const T &x, const T &y, double sigma) -> double { return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma))); }, size_type num_of_iter = 1000) const; |
Same as the above view, but it returns a std::array of K const views. You cannot change data in const views. But if the data is changed in the original DataFrame or through another view, it is reflected in the const view. |
K: Number of clusters for the k-means clustering algorithm. T: Type of the named column. Ts: The list of types for all columns. A type should be specified only once. col_name: Name of the data column. sigma: Scale parameter for the similarity function (the sigma argument of sfunc); in the default sfunc it is the width of the Gaussian kernel exp(-(x - y)^2 / (2 * sigma^2)). sfunc: A function to calculate the similarity matrix between data points in the named column. num_of_iter: Maximum number of iterations for the k-means clustering algorithm before converging. seed: Seed for the random number generator that initializes the k-means clustering algorithm. Default is a random number for each call. |
static void test_get_data_by_spectral() { std::cout << "\nTesting get_data_by_spectral( ) ..." << std::endl; typedef StdDataFrame64<std::string> StrDataFrame; StrDataFrame df; try { df.read("SHORT_IBM.csv", io_format::csv2, { .starting_row = 1000, .num_rows = 500 }); } catch (const DataFrameError &ex) { std::cout << ex.what() << std::endl; ::exit(-1); } StrDataFrame df2 = df; auto lbd = [](const std::string &, const double &) -> bool { return (true); }; auto view = df2.get_view_by_sel<double, decltype(lbd), double, long>("IBM_Open", lbd); auto result_df = df.get_data_by_spectral <3, double, double, long>("IBM_Close", 8, 89); auto result_view = view.get_view_by_spectral<3, double, double, long>("IBM_Close", 8, 89); assert(result_df.size() == 3); assert(result_df.size() == result_view.size()); assert(result_df[0].get_index().size() == 1); assert(result_df[0].get_column<double>("IBM_Open").size() == 1); assert(result_df[0].get_index()[0] == "2018-12-24"); assert(result_df[0].get_column<double>("IBM_High")[0] == 111.0); assert(result_df[0].get_column<long>("IBM_Volume")[0] == 3821400); assert(result_view[0].get_column<double>("IBM_High")[0] == 111.0); assert(result_view[0].get_column<long>("IBM_Volume")[0] == 3821400); assert(result_df[1].get_index().size() == 47); assert(result_df[1].get_column<double>("IBM_Open").size() == 47); assert(result_df[1].get_index()[0] == "2018-10-29"); assert(result_df[1].get_index()[46] == "2019-01-22"); assert(result_df[1].get_column<double>("IBM_High")[20] == 121.68); assert(result_df[1].get_column<long>("IBM_Volume")[35] == 4346700); assert(result_view[1].get_index().size() == 47); assert(result_view[1].get_column<double>("IBM_Open").size() == 47); assert(result_view[1].get_index()[0] == "2018-10-29"); assert(result_view[1].get_index()[46] == "2019-01-22"); assert(result_view[1].get_column<double>("IBM_High")[20] == 121.68); assert(result_view[1].get_column<long>("IBM_Volume")[35] == 4346700); assert(result_df[2].get_index().size() == 
452); assert(result_df[2].get_column<double>("IBM_Open").size() == 452); assert(result_df[2].get_index()[0] == "2017-12-20"); assert(result_df[2].get_index()[451] == "2019-12-16"); assert(result_df[2].get_column<double>("IBM_High")[200] == 149.070007); assert(result_df[2].get_column<long>("IBM_Volume")[300] == 4958000); assert(result_view[2].get_index().size() == 452); assert(result_view[2].get_column<double>("IBM_Open").size() == 452); assert(result_view[2].get_index()[0] == "2017-12-20"); assert(result_view[2].get_index()[451] == "2019-12-16"); assert(result_view[2].get_column<double>("IBM_High")[200] == 149.070007); assert(result_view[2].get_column<long>("IBM_Volume")[300] == 4958000); }