Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameStatsVisitors.h>

// Functor (visitor) that performs the two-sample Kolmogorov-Smirnov test.
//   T: column data type
//   I: index type (defaults to unsigned long)
template<typename T, typename I = unsigned long>
struct KolmoSmirnovTestVisitor;

// -------------------------------------

// Shorthand alias for KolmoSmirnovTestVisitor with the same template
// parameters.
template<typename T, typename I = unsigned long>
using ks_test_v = KolmoSmirnovTestVisitor<T, I>;
This functor class performs the two-sample Kolmogorov-Smirnov test.

The two-sample Kolmogorov-Smirnov test finds the largest vertical distance between the empirical cumulative distribution functions (ECDFs) of two samples. Unusually large distances indicate that the sample is not consistent with the hypothesized distribution (or, in the two-sample case, that the two samples are not consistent with having come from the same distribution).
These tests are nonparametric in the sense that the distribution of the test statistic under the null doesn't depend on which specific distribution was specified under the null (or which common distribution the two samples are drawn from).

The get_result() member function returns the KS test statistic, the D-value (the maximum absolute difference between the two ECDFs). You can compare the D-value against an approximate critical value to decide whether to reject the null hypothesis at a given significance level (e.g. α = 0.05).
There is also a get_p_value() member function that returns the p-value, i.e. the probability that a D-value at least as extreme as the observed one could occur if the null hypothesis is true.
    // Default constructor; no arguments are required to create the visitor.
    KolmoSmirnovTestVisitor();
        
T: Column data type.
I: Index type.
static void test_KolmoSmirnovTestVisitor()  {

    std::cout << "\nTesting KolmoSmirnovTestVisitor{ } ..." << std::endl;

    StrDataFrame    ibm;

    try  {
        ibm.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    MinVisitor<double, std::string> min_val;
    MaxVisitor<double, std::string> max_val;

    ibm.single_act_visit<double>("IBM_Close", min_val);
    ibm.single_act_visit<double>("IBM_Close", max_val);

    const auto              col_s = ibm.get_index().size();
    RandGenParams<double>   p1 { .min_value = min_val.get_result(), .max_value = max_val.get_result() };
    RandGenParams<double>   p2 { .min_value = min_val.get_result(), .max_value = max_val.get_result() };

    p1.mean = 4;
    p1.std = 2.0;
    p1.seed = 1056;
    ibm.load_column("normal 1", gen_normal_dist<double>(col_s, p1));

    p1.mean = 0;
    p1.std = 1.0;
    p1.seed = 123;
    ibm.load_column("normal 2", gen_normal_dist<double>(col_s, p1));

    p2.seed = 123;
    ibm.load_column("uniform", gen_uniform_real_dist<double>(col_s, p2));
    ibm.load_column("exponential", gen_exponential_dist<double>(col_s, p2));
    ibm.load_column("lognormal", gen_lognormal_dist<double>(col_s, p2));

    ks_test_v<double, std::string>  ks_test;

    ibm.single_act_visit<double, double>("IBM_Close", "IBM_Open", ks_test);
    assert((std::fabs(ks_test.get_result() - 0.0034) < 0.0001));
    assert((std::fabs(ks_test.get_p_value() - 1.0) < 0.0001));

    ibm.single_act_visit<double, double>("IBM_Low", "IBM_High", ks_test);
    assert((std::fabs(ks_test.get_result() - 0.0296) < 0.0001));
    assert((std::fabs(ks_test.get_p_value() - 0.0242) < 0.0001));

    ibm.single_act_visit<double, double>("IBM_Close", "uniform", ks_test);
    assert((std::fabs(ks_test.get_result() - 0.1224) < 0.0001));
    assert((std::fabs(ks_test.get_p_value() - 0.0) < 0.0001));

    ibm.single_act_visit<double, double>("IBM_Close", "normal 1", ks_test);
    assert((std::fabs(ks_test.get_result() - 1.0) < 0.0001));
    assert((std::fabs(ks_test.get_p_value() - 0.0) < 0.0001));

    ibm.single_act_visit<double, double>("IBM_Close", "normal 2", ks_test);
    assert((std::fabs(ks_test.get_result() - 1.0) < 0.0001));
    assert((std::fabs(ks_test.get_p_value() - 0.0) < 0.0001));

    ibm.single_act_visit<double, double>("IBM_Close", "lognormal", ks_test);
    assert((std::fabs(ks_test.get_result() - 0.9998) < 0.0001));
    assert((std::fabs(ks_test.get_p_value() - 0.0) < 0.0001));

    ibm.single_act_visit<double, double>("normal 2", "normal 1", ks_test);
    assert((std::fabs(ks_test.get_result() - 0.8326) < 0.0001));
    assert((std::fabs(ks_test.get_p_value() - 0.0) < 0.0001));

    ibm.single_act_visit<double, double>("normal 2", "lognormal", ks_test);
    assert((std::fabs(ks_test.get_result() - 0.5353) < 0.0001));
    assert((std::fabs(ks_test.get_p_value() - 0.0) < 0.0001));

    ibm.single_act_visit<double, double>("IBM_Close", "IBM_Close", ks_test);
    assert((std::fabs(ks_test.get_result() - 0.0) < 0.0001));
    assert((std::fabs(ks_test.get_p_value() - 1.0) < 0.0001));
}

C++ DataFrame