Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameStatsVisitors.h>

template<typename T, typename I = unsigned long>
struct VarVisitor;
This functor class calculates the variance of a given column.
This works with both scalar and multidimensional (i.e. vectors and arrays) datasets. For multidimensional datasets, you must use the single_act_visit() interface.
          explicit
          VarVisitor(bool bias = false,
                     bool skipnan = false,
                     bool stable_algo = false);
        
bias: If true it divides by n - 1, otherwise by n.
skipnan: If true, it skips over NaN values as if they didn't exist.
stable_algo: If true, it uses a version of Kahan summation that is numerically stable for data with very large values. Kahan summation is slower than regular summation, so only use it if your data contains very large values.

There are also the following member functions:
get_result(): Returns the variance.
              In case of scalar dataset, the variance is a single number. In case of multidimensional dataset,
              the variance is a square matrix of data dimension rows and columns.
get_count(): Returns the number of valid (non-NaN) datapoints
get_mean(): Returns the mean of the time-series. In case of scalar dataset,
            the mean is a single number. In case of multidimensional dataset, the mean is
            a vector of size data dimension.
        
T: Column data type.
I: Index type.

void test_md_stats()  {

    std::cout << "\nTesting get_md_stats( ) ..." << std::endl;

    constexpr std::size_t   item_cnt = 1024;
    ULDataFrame             df;

    df.load_index(ULDataFrame::gen_sequence_index(0, item_cnt, 1));

    RandGenParams<double>   p;

    p.seed = 123;
    p.min_value = 0.5;
    p.max_value = 2.0;

    constexpr std::size_t   dim { 3 };

    using ary_col_t = std::array<double, dim>;
    using vec_col_t = std::vector<double>;

    // Generate and load 3 random columns
    //
    auto    rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * dim, p);

    std::vector<ary_col_t>  array_col(df.get_index().size());
    std::vector<ary_col_t>  array_col2(df.get_index().size());
    std::vector<vec_col_t>  vector_col(df.get_index().size());

    for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i)  {
        vector_col[i].resize(dim);
        for (std::size_t d { 0 }; d < dim; ++d)
            array_col[i][d] = vector_col[i][d] = rand_vec[j++];
    }
    df.load_column<ary_col_t>("array_col", std::move(array_col));
    df.load_column<vec_col_t>("vector_col", std::move(vector_col));

    p.seed = 1024;
    rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * dim, p);
    for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i)  {
        for (std::size_t d { 0 }; d < dim; ++d)
            array_col2[i][d] = rand_vec[j++];
    }
    df.load_column<ary_col_t>("array_col2", std::move(array_col2));

    // Standard Deviation / Variance
    //
    VarVisitor<vec_col_t>   var;
    StdVisitor<ary_col_t>   stdev;

    df.single_act_visit<vec_col_t>("vector_col", var);
    df.single_act_visit<ary_col_t>("array_col", stdev);

    const auto  &var_result = var.get_result();

    assert(var_result.rows() == dim);
    assert(var_result.cols() == dim);
    assert(std::fabs(var_result(0, 0) - 0.188617) < 0.000001);
    assert(std::fabs(var_result(1, 2) - 0.004326) < 0.000001);
    assert(std::fabs(var_result(2, 1) - 0.004326) < 0.000001);

    const auto  &var_mean = var.get_mean();

    assert(var_mean.size() == dim);
    assert(std::fabs(var_mean[1] - 1.2646) < 0.0001);
    assert(std::fabs(var_mean[2] - 1.24829) < 0.00001);

    const auto  &std_result = stdev.get_result();

    assert(std_result.size() == dim);
    assert(std::fabs(std_result[1] - 0.428274) < 0.000001);
    assert(std::fabs(std_result[2] - 0.431151) < 0.000001);

    const auto  &std_mean = stdev.get_mean();

    assert(std_mean.size() == dim);
    assert(std::fabs(std_mean[1] - 1.2646) < 0.0001);
    assert(std::fabs(std_mean[2] - 1.24829) < 0.00001);
}

C++ DataFrame