Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameStatsVisitors.h>

// Functor (visitor) that runs the Shapiro-Wilk test on a column.
// T: column data type; I: index type (defaults to unsigned long).
template<typename T, typename I = unsigned long>
struct ShapiroWilkTestVisitor;

// -------------------------------------

// Short alias for ShapiroWilkTestVisitor; takes the same template parameters.
template<typename T, typename I = unsigned long>
using swilk_test_v = ShapiroWilkTestVisitor<T, I>;
This functor class performs the Shapiro-Wilk test.

The Shapiro-Wilk test is a statistical test used to determine if a sample of data comes from a normally distributed population. It assesses how closely the data distribution matches a normal distribution with the same mean and standard deviation. A low p-value (typically less than 0.05) suggests the data is likely not normally distributed. It's particularly useful when you have a small to moderate sample size (although it can handle larger samples as well) and before using parametric statistical methods that assume normality.

get_result() returns the test statistic (W). This value ranges from 0 to 1, with values closer to 1 suggesting a better fit to a normal distribution.
get_p_value() returns the p-value. This value represents the probability of observing a test statistic as extreme as, or more extreme than, the one computed from your data, assuming the data actually comes from a normally distributed population.
    ShapiroWilkTestVisitor();
        
T: Column data type.
I: Index type.
static void test_ShapiroWilkTestVisitor()  {

    std::cout << "\nTesting ShapiroWilkTestVisitor{ } ..." << std::endl;

    StrDataFrame    ibm;

    try  {
        ibm.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    const auto              col_s = ibm.get_index().size();
    RandGenParams<double>   p1 { .min_value = 99, .max_value = 200, .seed = 123 };

    ibm.load_column("uniform", gen_uniform_real_dist<double>(col_s, p1));
    ibm.load_column("exponential", gen_exponential_dist<double>(col_s, p1));
    ibm.load_column("lognormal", gen_lognormal_dist<double>(col_s, p1));
    ibm.load_column("normal", gen_normal_dist<double>(col_s, p1));

    RandGenParams<double>   p2 { .seed = 123, .mean = 0, .std = 1.0 };

    ibm.load_column("std_normal", gen_normal_dist<double>(col_s, p2));

    ShapiroWilkTestVisitor<double, std::string> swt;

    ibm.single_act_visit<double>("IBM_Close", swt);
    assert((std::fabs(swt.get_result() - 0.953874) < 0.000001));
    assert((std::fabs(swt.get_p_value() - 3.56659e-37) < 0.0000000000001));

    ibm.single_act_visit<double>("uniform", swt);
    assert((std::fabs(swt.get_result() - 0.954293) < 0.000001));
    assert((std::fabs(swt.get_p_value() - 4.8411e-37) < 0.0000000000001));

    ibm.single_act_visit<double>("exponential", swt);
    assert((std::fabs(swt.get_result() - 0.819813) < 0.000001));
    assert((std::fabs(swt.get_p_value() - 9.28713e-60) < 0.0000000000001));

    ibm.single_act_visit<double>("lognormal", swt);
    assert((std::fabs(swt.get_result() - 0.524102) < 0.000001));
    assert(swt.get_p_value() == 0.0);

    ibm.single_act_visit<double>("normal", swt);
    assert((std::fabs(swt.get_result() - 0.99974) < 0.00001));
    assert((std::fabs(swt.get_p_value() - 0.816855) < 0.000001));

    ibm.single_act_visit<double>("std_normal", swt);
    assert((std::fabs(swt.get_result() - 0.99974) < 0.00001));
    assert((std::fabs(swt.get_p_value() - 0.816855) < 0.000001));
}

C++ DataFrame