Back to Documentations

Signature Description Parameters
template<std::size_t K, arithmetic T, typename ... Ts>
std::array<DataFrame, K>
get_data_by_spectral(const char *col_name,
                     double sigma,
                     seed_t seed = seed_t(-1),
                     std::function<double(const T &x, const T &y, double sigma)>  &&sfunc =
                         [](const T &x, const T &y, double sigma) -> double  {
                             return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma)));
                         },
                     size_type num_of_iter = 1000) const;
This uses the spectral clustering algorithm to divide the named column into K clusters. It returns a std::array of K DataFrames, each containing one of the clusters of data based on the named column.
Self is unchanged.

NOTE: Type T must support arithmetic operations.
K: Number of clusters for k-means clustering algorithm
T: Type of the named column
Ts: The list of types for all columns. A type should be specified only once
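col_name: Name of the data column
sigma: Value passed to sfunc as its third argument. In the default sfunc it is the width (standard deviation) of the Gaussian similarity kernel exp(-(x - y)^2 / (2 * sigma^2))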
sfunc: A function to calculate the similarity matrix between data points in the named column
num_of_iter: Maximum number of iterations for k-means clustering algorithm before converging
seed: Seed for random number generator to initialize k-means clustering algorithm. Default is a random number for each call.
template<std::size_t K, arithmetic T, typename ... Ts>
std::array<PtrView, K>
get_view_by_spectral(const char *col_name,
                     double sigma,
                     seed_t seed = seed_t(-1),
                     std::function<double(const T &x, const T &y, double sigma)>  &&sfunc =
                         [](const T &x, const T &y, double sigma) -> double  {
                             return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma)));
                         },
                     size_type num_of_iter = 1000);
This is identical to get_data_by_spectral() above, but:
  1. The result is a std::array of K views
  2. Since the result is a view, you cannot call make_consistent() on the result.
NOTE: There are certain operations that you cannot do with a view. For example, you cannot add/delete columns, etc.
K: Number of clusters for k-means clustering algorithm
T: Type of the named column
Ts: The list of types for all columns. A type should be specified only once
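col_name: Name of the data column
sigma: Value passed to sfunc as its third argument. In the default sfunc it is the width (standard deviation) of the Gaussian similarity kernel exp(-(x - y)^2 / (2 * sigma^2))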
sfunc: A function to calculate the similarity matrix between data points in the named column
num_of_iter: Maximum number of iterations for k-means clustering algorithm before converging
seed: Seed for random number generator to initialize k-means clustering algorithm. Default is a random number for each call.
template<std::size_t K, arithmetic T, typename ... Ts>
std::array<ConstPtrView, K>
get_view_by_spectral(const char *col_name,
                     double sigma,
                     seed_t seed = seed_t(-1),
                     std::function<double(const T &x, const T &y, double sigma)>  &&sfunc =
                         [](const T &x, const T &y, double sigma) -> double  {
                             return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma)));
                         },
                     size_type num_of_iter = 1000) const;
Same as the view version above, but it returns a std::array of K const views. You cannot change data in const views. But if the data is changed in the original DataFrame or through another view, it is reflected in the const view.
K: Number of clusters for k-means clustering algorithm
T: Type of the named column
Ts: The list of types for all columns. A type should be specified only once
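col_name: Name of the data column
sigma: Value passed to sfunc as its third argument. In the default sfunc it is the width (standard deviation) of the Gaussian similarity kernel exp(-(x - y)^2 / (2 * sigma^2))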
sfunc: A function to calculate the similarity matrix between data points in the named column
num_of_iter: Maximum number of iterations for k-means clustering algorithm before converging
seed: Seed for random number generator to initialize k-means clustering algorithm. Default is a random number for each call.
static void test_get_data_by_spectral()  {

    std::cout << "\nTesting get_data_by_spectral( ) ..." << std::endl;

    typedef StdDataFrame64<std::string> StrDataFrame;

    StrDataFrame    df;

    try  {
        df.read("SHORT_IBM.csv", io_format::csv2, { .starting_row = 1000, .num_rows = 500 });
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    StrDataFrame    df2 = df;

    auto    lbd = [](const std::string &, const double &) -> bool { return (true); };
    auto    view = df2.get_view_by_sel<double, decltype(lbd), double, long>("IBM_Open", lbd);

    auto    result_df = df.get_data_by_spectral <3, double, double, long>("IBM_Close", 8, 89);
    auto    result_view = view.get_view_by_spectral<3, double, double, long>("IBM_Close", 8, 89);

    assert(result_df.size() == 3);
    assert(result_df.size() == result_view.size());

    assert(result_df[0].get_index().size() == 1);
    assert(result_df[0].get_column<double>("IBM_Open").size() == 1);
    assert(result_df[0].get_index()[0] == "2018-12-24");
    assert(result_df[0].get_column<double>("IBM_High")[0] == 111.0);
    assert(result_df[0].get_column<long>("IBM_Volume")[0] == 3821400);
    assert(result_view[0].get_column<double>("IBM_High")[0] == 111.0);
    assert(result_view[0].get_column<long>("IBM_Volume")[0] == 3821400);

    assert(result_df[1].get_index().size() == 47);
    assert(result_df[1].get_column<double>("IBM_Open").size() == 47);
    assert(result_df[1].get_index()[0] == "2018-10-29");
    assert(result_df[1].get_index()[46] == "2019-01-22");
    assert(result_df[1].get_column<double>("IBM_High")[20] == 121.68);
    assert(result_df[1].get_column<long>("IBM_Volume")[35] == 4346700);
    assert(result_view[1].get_index().size() == 47);
    assert(result_view[1].get_column<double>("IBM_Open").size() == 47);
    assert(result_view[1].get_index()[0] == "2018-10-29");
    assert(result_view[1].get_index()[46] == "2019-01-22");
    assert(result_view[1].get_column<double>("IBM_High")[20] == 121.68);
    assert(result_view[1].get_column<long>("IBM_Volume")[35] == 4346700);

    assert(result_df[2].get_index().size() == 452);
    assert(result_df[2].get_column<double>("IBM_Open").size() == 452);
    assert(result_df[2].get_index()[0] == "2017-12-20");
    assert(result_df[2].get_index()[451] == "2019-12-16");
    assert(result_df[2].get_column<double>("IBM_High")[200] == 149.070007);
    assert(result_df[2].get_column<long>("IBM_Volume")[300] == 4958000);
    assert(result_view[2].get_index().size() == 452);
    assert(result_view[2].get_column<double>("IBM_Open").size() == 452);
    assert(result_view[2].get_index()[0] == "2017-12-20");
    assert(result_view[2].get_index()[451] == "2019-12-16");
    assert(result_view[2].get_column<double>("IBM_High")[200] == 149.070007);
    assert(result_view[2].get_column<long>("IBM_Volume")[300] == 4958000);
}

C++ DataFrame