| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameStatsVisitors.h> template<arithmetic T, typename I = unsigned long> struct ChiSquaredTestVisitor; // ------------------------------------- template<typename T, typename I = unsigned long> using chis_test_v = ChiSquaredTestVisitor<T, I>; |
This functor class calculates the Chi-Squared Test. A chi-squared test (χ2) is a statistical hypothesis test that determines if there is a significant difference between expected and observed frequencies in one or more categories. It is used to test for a relationship between two categorical variables (test of independence) or to see if the observed distribution of a single categorical variable matches a specific distribution (goodness of fit test). The basic idea is to compare actual and expected values, and if the calculated χ2 value is greater than the critical value, you reject the null hypothesis that there is no difference. In this implementation, the test takes two time-series in this order: observed and expected. This works with both scalar and multidimensional (i.e. vectors or arrays) datasets. In case of multidimensional data, the analysis is done per dimension (channel). get_result() returns the test statistic, the so-called χ2. In case of multidimensional inputs, the result is a vector of dimension size. In the multidimensional (MD) case, it applies the Wilson–Hilferty approximation per dimension. get_p_value(size_type degree_of_freedom) returns the p-value, which represents the probability of obtaining a test statistic as extreme as, or more extreme than, the one calculated from the data, assuming the null hypothesis is true. It is used to determine the statistical significance of the result: if the p-value is less than a pre-determined significance level (commonly 0.05), the null hypothesis is rejected. In case of multidimensional inputs, the result is a vector of dimension size. get_p_value(const std::vector<size_type> &dfdoms) is an MD-only overload for when each dimension has a different number of degrees of freedom, which is the statistically correct usage in the general case.
ChiSquaredTestVisitor();
|
T: Column data type. I: Index type. |
static void test_ChiSquaredTestVisitor() { std::cout << "\nTesting ChiSquaredTestVisitor{ } ..." << std::endl; std::vector<unsigned long> idx = { 123450, 123451, 123452, 123453, 123454 }; std::vector<double> ob1 = { 1, 2, 3, 4, 5 }; std::vector<double> ex1 = { 8, 9, 10, 11, 12 }; std::vector<double> ob2 = { 5, 18, 42, 27, 8 }; std::vector<double> ex2 = { 8, 20, 36, 24, 12 }; std::vector<double> ob3 = { 2, 5, 6, 8, 4 }; std::vector<double> ex3 = { 5, 5, 5, 5, 5 }; ULDataFrame df; df.load_data(std::move(idx), std::make_pair("observation 1", ob1), std::make_pair("expected 1", ex1), std::make_pair("observation 2", ob2), std::make_pair("expected 2", ex2), std::make_pair("observation 3", ob3), std::make_pair("expected 3", ex3)); ChiSquaredTestVisitor<double> chi; df.single_act_visit<double, double>("observation 1", "expected 1", chi); assert(std::fabs(chi.get_result() - 25.0073) < 0.0001); assert(chi.get_p_value(4) < 0.0000000001); df.single_act_visit<double, double>("observation 2", "expected 2", chi); assert(std::fabs(chi.get_result() - 4.0333) < 0.0001); assert(std::fabs(chi.get_p_value(4) - 0.4953) < 0.0001); df.single_act_visit<double, double>("observation 3", "expected 3", chi); assert(std::fabs(chi.get_result() - 4.0) < 0.0001); assert(std::fabs(chi.get_p_value(4) - 0.5) < 0.0001); // Now multidimensional data // constexpr std::size_t dim { 3 }; using ary_col_t = std::array<double, dim>; using vec_col_t = std::vector<double>; std::vector<ary_col_t> ary_observed { { 18.0, 10.0, 30.0 }, { 22.0, 20.0, 20.0 }, { 19.0, 15.0, 33.0 }, { 21.0, 18.0, 16.0 }, { 20.0, 20.0, 41.0 }, }; std::vector<vec_col_t> vec_observed { { 18.0, 10.0, 30.0 }, { 22.0, 20.0, 20.0 }, { 19.0, 15.0, 33.0 }, { 21.0, 18.0, 16.0 }, { 20.0, 20.0, 41.0 }, }; std::vector<ary_col_t> ary_expected { { 20.0, 16.6667, 28.0 }, { 20.0, 16.6667, 21.0 }, { 20.0, 16.6667, 35.0 }, { 20.0, 16.6667, 14.0 }, { 20.0, 16.6667, 42.0 }, }; std::vector<vec_col_t> vec_expected { { 20.0, 16.6667, 28.0 }, { 20.0, 16.6667, 
21.0 }, { 20.0, 16.6667, 35.0 }, { 20.0, 16.6667, 14.0 }, { 20.0, 16.6667, 42.0 }, }; df.load_column<ary_col_t>("ARY OBSV", std::move(ary_observed), nan_policy::dont_pad_with_nans); df.load_column<vec_col_t>("VEC OBSV", std::move(vec_observed), nan_policy::dont_pad_with_nans); df.load_column<ary_col_t>("ARY EXPT", std::move(ary_expected), nan_policy::dont_pad_with_nans); df.load_column<vec_col_t>("VEC EXPT", std::move(vec_expected), nan_policy::dont_pad_with_nans); ChiSquaredTestVisitor<ary_col_t> ary_chi; ChiSquaredTestVisitor<vec_col_t> vec_chi; df.single_act_visit<ary_col_t, ary_col_t>("ARY OBSV", "ARY EXPT", ary_chi); df.single_act_visit<vec_col_t, vec_col_t>("VEC OBSV", "VEC EXPT", vec_chi); assert(ary_chi.get_result().size() == dim); assert(ary_chi.get_p_value(4).size() == dim); assert(ary_chi.get_p_value({ 4, 4, 4 }).size() == dim); assert(vec_chi.get_result().size() == dim); assert(vec_chi.get_p_value(4).size() == dim); assert(vec_chi.get_p_value({ 4, 4, 4 }).size() == dim); assert(std::fabs(ary_chi.get_result()[0] - 0.5) < 0.01); assert(std::fabs(ary_chi.get_result()[2] - 0.614286) < 0.000001); assert(std::fabs(ary_chi.get_p_value(4)[0] - 0.892038) < 0.000001); assert(std::fabs(ary_chi.get_p_value(4)[1] - 0.461508) < 0.000001); assert(std::fabs(ary_chi.get_p_value(4)[2] - 0.884353) < 0.000001); assert(std::fabs(ary_chi.get_p_value({4,4,4})[0] - 0.892038) < 0.000001); assert(std::fabs(ary_chi.get_p_value({4,4,4})[1] - 0.461508) < 0.000001); assert(std::fabs(ary_chi.get_p_value({4,4,4})[2] - 0.884353) < 0.000001); assert(std::fabs(vec_chi.get_result()[0] - 0.5) < 0.01); assert(std::fabs(vec_chi.get_result()[2] - 0.614286) < 0.000001); assert(std::fabs(vec_chi.get_p_value(4)[0] - 0.892038) < 0.000001); assert(std::fabs(vec_chi.get_p_value(4)[1] - 0.461508) < 0.000001); assert(std::fabs(vec_chi.get_p_value(4)[2] - 0.884353) < 0.000001); assert(std::fabs(vec_chi.get_p_value({ 4, 4, 4 })[0] - 0.892038) < 0.000001); assert(std::fabs(vec_chi.get_p_value({ 4, 4, 4 
})[1] - 0.461508) < 0.000001); assert(std::fabs(vec_chi.get_p_value({ 4, 4, 4 })[2] - 0.884353) < 0.000001); }