Back to Documentations

Signature Description
// Bundle of optional settings accepted by kshape_groups().
// T is the type of the convergence threshold (epsilon).
//
template<typename T>
struct  KShapeParams  {

    using seed_t = std::random_device::result_type;

    // Normalization applied to the columns
    //
    normalization_type  norm_t = normalization_type::z_score;

    // Upper bound on the iterations spent extracting the centroid of a shape
    //
    long                shape_iter = 10L;

    // Upper bound on the iterations of the main algorithm
    //
    long                max_iter = 1000L;

    // Minimum difference in the main loop at which it breaks out
    //
    T                   epsilon = T(1e-8);

    // Seed for the random number generator.
    // The default value, seed_t(-1), means a random seed is used
    //
    seed_t              seed = seed_t(-1);
};
A structure containing the parameters for the kshape_groups() call

Signature Description Parameters
// Clusters the named columns into k groups using the k-Shape algorithm and
// returns a vector of k vectors of column names, one vector per cluster.
//
// T: Type of the named columns
// col_names: Names of the columns to cluster. All of them must be of the
//            same length
// k: Number of expected clusters
// params: Rest of the parameters necessary for this operation
//
template<typename T>
std::vector<std::vector<std::string>>
kshape_groups(const std::vector<const char *> &col_names,
              long k,
              const KShapeParams<T> params = { }) const;
K-shape is a powerful, unsupervised time-series clustering algorithm that groups sequences based on their shape, not just their magnitude. It uses a distance measure derived from normalized cross-correlation, which makes it well suited to finding similar patterns (such as energy use, heartbeats, or sensor data) regardless of shifts or scaling. It works like k-means, but with shape-specific logic.
Instead of Euclidean distance (which focuses on absolute values), k-Shape uses Normalized Cross-Correlation (NCC) to find the best alignment (shift) between two time series, measuring how similar their patterns are.
Like k-means, you must specify the number of clusters.
The return result is a vector of k vectors of column names. Each vector contains a specific cluster.

NOTE: All specified columns must be of the same length.
T: Type of the named columns
col_names: Names of the columns to be clustered
k: Number of expected clusters
params: The rest of the parameters necessary for this operation. See KShapeParams above
static void test_kshape_groups()  {

    std::cout << "\nTesting kshape_groups( ) ..." << std::endl;

    StrDataFrame    df;

    try  {
        df.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    const std::size_t   data_s = df.get_index().size();

    // Cluster 1: Sine wave pattern
    //
    for (std::size_t i = 0; i < 5; ++i)  {
        std::vector<double> series(data_s);
        const std::string   col_name { "Sine Wave " };

        for (std::size_t j = 0; j < data_s; ++j)
            series[j] = std::sin(2 * M_PI * j / 25.0) + (std::rand() % 100) / 500.0;

        df.load_column((col_name + std::to_string(i)).c_str(), std::move(series), nan_policy::dont_pad_with_nans);
    }

    // Cluster 2: Exponential growth pattern
    //
    for (std::size_t i = 0; i < 5; ++i)  {
        std::vector<double> series(data_s);
        const std::string   col_name { "Exponential Inc " };

        for (std::size_t j = 0; j < data_s; ++j)
            series[j] = std::exp(j / 25.0) - 1.0 + (std::rand() % 100) / 500.0;

        df.load_column((col_name + std::to_string(i)).c_str(), std::move(series), nan_policy::dont_pad_with_nans);
    }

    // Cluster 3: Linear increasing pattern
    //
    for (std::size_t i = 0; i < 5; ++i) {
        std::vector<double> series(data_s);
        const std::string   col_name { "Linear Inc " };

        for (std::size_t j = 0; j < data_s; ++j)
            series[j] = j / 25.0 + (std::rand() % 100) / 500.0;

        df.load_column((col_name + std::to_string(i)).c_str(), std::move(series), nan_policy::dont_pad_with_nans);
    }

    const auto result =
        df.kshape_groups<double>({ "IBM_Open", "IBM_High", "IBM_Low", "IBM_Close",
                                   "Sine Wave 0", "Sine Wave 1", "Sine Wave 2", "Sine Wave 3", "Sine Wave 4",
                                   "Exponential Inc 0", "Exponential Inc 1", "Exponential Inc 2", "Exponential Inc 3", "Exponential Inc 4",
                                   "Linear Inc 0", "Linear Inc 1", "Linear Inc 2", "Linear Inc 3", "Linear Inc 4" },
                                 4L,
                                 { .seed = 123 });

    assert(result.size() == 4);
    assert((result[0] == std::vector<std::string> { "Linear Inc 0", "Linear Inc 1", "Linear Inc 2", "Linear Inc 3", "Linear Inc 4" }));
    assert((result[1] == std::vector<std::string> { "Exponential Inc 0", "Exponential Inc 1", "Exponential Inc 2", "Exponential Inc 3", "Exponential Inc 4" }));
    assert((result[2] == std::vector<std::string> { "IBM_Open", "IBM_High", "IBM_Low", "IBM_Close" }));
    assert((result[3] == std::vector<std::string> { "Sine Wave 0", "Sine Wave 1", "Sine Wave 2", "Sine Wave 3", "Sine Wave 4" }));
}

C++ DataFrame