| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameStatsVisitors.h> template<typename T, typename I = unsigned long> struct CovVisitor; |
This functor class calculates the covariance of two given columns. In addition, it provides the variances of both columns. This works with both scalar and multidimensional (i.e. vectors or arrays) datasets. For multidimensional datasets, you must use the single_act_visit() interface.
explicit
CovVisitor(bool bias = false,
bool skipnan = false,
bool stable_algo = false);
bias: If true it divides by n - 1, otherwise by n. skipnan: If true it skips over nan numbers as if they didn't exist. stable_algo: If true, it uses a version of Kahan summation that is numerically stable for data with very large values. Kahan summation is slower than regular summation, so use it only if your data contains very large values. There are also the following member functions:
get_result(): Returns the covariance.
In the case of a scalar dataset, the covariance is a single number. In the case of a multidimensional dataset,
the covariance is a matrix with as many rows as the first column's dimension and as many columns as the second column's dimension.
get_count(): Returns the number of valid (non-NaN) datapoints, only when dealing with scalar datasets.
get_mean1(): Returns the mean of the first time-series. In case of scalar dataset,
the mean is a single number. In case of multidimensional dataset, the mean is
a vector of size first column dimension.
get_mean2(): Returns the mean of the second time-series. In case of scalar dataset,
the mean is a single number. In case of multidimensional dataset, the mean is
a vector of size second column dimension.
|
T: Column data type. T must be an arithmetic-enabled type. I: Index type. |
std::cout << "\nTesting Covariance Visitor ..." << std::endl;

// Launch the covariance visitor asynchronously over the two double columns.
CovVisitor<double>  cov_visitor;
auto                cov_future =
    df.visit_async<double, double>("dbl_col", "dbl_col_2", cov_visitor);

// Wait for the asynchronous visit to complete, then read the covariance
// from the visitor handed back by the future.
auto                &&finished_visitor = cov_future.get();
const double        cov_value = finished_visitor.get_result();

assert(fabs(cov_value + 0.358381) < 0.000001);
// ---------------------------------------------------------------------------- void test_md_stats() { std::cout << "\nTesting get_md_stats( ) ..." << std::endl; constexpr std::size_t item_cnt = 1024; ULDataFrame df; df.load_index(ULDataFrame::gen_sequence_index(0, item_cnt, 1)); RandGenParams<double> p; p.seed = 123; p.min_value = 0.5; p.max_value = 2.0; constexpr std::size_t dim { 3 }; using ary_col_t = std::array<double, dim>; using vec_col_t = std::vector<double>; // Generate and load 3 random columns // auto rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * dim, p); std::vector<ary_col_t> array_col(df.get_index().size()); std::vector<ary_col_t> array_col2(df.get_index().size()); std::vector<vec_col_t> vector_col(df.get_index().size()); for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i) { vector_col[i].resize(dim); for (std::size_t d { 0 }; d < dim; ++d) array_col[i][d] = vector_col[i][d] = rand_vec[j++]; } df.load_column<ary_col_t>("array_col", std::move(array_col)); df.load_column<vec_col_t>("vector_col", std::move(vector_col)); p.seed = 1024; rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * dim, p); for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i) { for (std::size_t d { 0 }; d < dim; ++d) array_col2[i][d] = rand_vec[j++]; } df.load_column<ary_col_t>("array_col2", std::move(array_col2)); // Covariance // CovVisitor<ary_col_t> cov; df.single_act_visit<ary_col_t, ary_col_t>("array_col", "array_col2", cov); const auto &cov_result { cov.get_result() }; const auto &mean1_result { cov.get_mean1() }; const auto &mean2_result { cov.get_mean2() }; assert(cov_result.rows() == dim); assert(cov_result.cols() == dim); assert(std::fabs(cov_result(0, 0) - 0.00187) < 0.00001); assert(std::fabs(cov_result(1, 2) - 0.00420) < 0.00001); assert(std::fabs(cov_result(2, 1) - -0.000457) < 0.000001); assert(mean1_result.size() == dim); assert(std::fabs(mean1_result[1] - 1.2646) < 0.0001); assert(std::fabs(mean1_result[2] - 1.24829) 
< 0.00001); assert(mean2_result.size() == dim); assert(std::fabs(mean2_result[1] - 1.25428) < 0.00001); assert(std::fabs(mean2_result[2] - 1.25122) < 0.00001); CovVisitor<vec_col_t> cov2; df.single_act_visit<vec_col_t, vec_col_t>("vector_col", "vector_col", cov2); const auto &cov_result2 { cov2.get_result() }; assert(cov_result2.rows() == dim); assert(cov_result2.cols() == dim); assert(std::fabs(cov_result2(0, 0) - 0.188617) < 0.000001); assert(std::fabs(cov_result2(1, 2) - 0.004326) < 0.000001); assert(std::fabs(cov_result2(2, 1) - 0.004326) < 0.000001); }