Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameStatsVisitors.h>

// Single-action visitor that computes the z-score of every value in a column,
// using that same column as the population.
//
// T: Column data type
// I: Index type
// A: Memory alignment boundary for vectors (0 by default)
//
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct ZScoreVisitor;

// -------------------------------------

// Short alias for ZScoreVisitor; takes the same template parameters
//
template<typename T, typename I = unsigned long,
         std::size_t A = 0>
using zs_v = ZScoreVisitor<T, I, A>;
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This functor class calculates the z-score of each value in a given column, using the same column as the population. Its result is a vector of z-scores.

This works with both scalar and multidimensional (i.e. vectors or arrays) datasets. For multidimensional datasets, you must use the single_act_visit() interface.

get_result(): Returns the vector of z-scores. In the case of a scalar dataset, this is a vector of scalar z-scores. In the case of a multidimensional dataset, this is a vector of vectors of component-wise z-scores.
T: Column data type.
I: Index type.
A: Memory alignment boundary for vectors. Default is the system default alignment.
static void test_z_score_visitor()  {

    std::cout << "\nTesting Z-Score visitors ..." << std::endl;

    StlVecType<unsigned long>  idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466, 123467, 123468, 123469, 123470, 123471, 123472, 123473 };
    StlVecType<double>         d1 = { 99.00011, 99.00012, 99.00013, 99.00014, 99.00015, 99.00016, 99.000113, 99.000112, 99.000111, 99.00019, 99.00018, 99.00017, 99.000114, 99.000115, 99.000116, 99.000117, 99.000118, 99.000119, 99.0001114, 99.0001113, 99.0001112 };
    StlVecType<double>         d2 = { 10.1, 20.1, 30.1, 40.1, 50.1, 60.1, 70.1, 120.1, 110.1, 28.1, 18.1, 100.1, 90.1, 80.1, 130.1, 140.1, 150.1, 160.1, 170.1, 180.1, 190.1 };
    MyDataFrame                df;

    df.load_data(std::move(idx),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2));

    ZScoreVisitor<double>   z_score;
    ZScoreVisitor<double>   z_score2;
    const auto              result = df.single_act_visit<double>("col_1", z_score).get_result();
    const auto              result2 = df.single_act_visit<double>("col_2", z_score2).get_result();

    assert(result.size() == 21);
    assert(fabs(result[0] - -0.774806) < 0.000001);
    assert(fabs(result[4] - 0.816872) < 0.000001);
    assert(fabs(result[10] - 2.01063) < 0.000001);
    assert(fabs(result[19] - -0.723076) < 0.000001);
    assert(fabs(result[20] - -0.727055) < 0.000001);

    assert(result2.size() == 21);
    assert(fabs(result2[0] - -1.42003) < 0.00001);
    assert(fabs(result2[4] - -0.732921) < 0.00001);
    assert(fabs(result2[10] - -1.28261) < 0.00001);
    assert(fabs(result2[19] - 1.5002) < 0.00001);
    assert(fabs(result2[20] - 1.67198) < 0.00001);

    const MyDataFrame           const_df = df;
    SampleZScoreVisitor<double> z_score3;
    auto                        fut = const_df.single_act_visit_async<double, double>("col_1", "col_2", z_score3);
    auto                        result3 = fut.get().get_result();

    assert(fabs(result3 - -1136669.1600501483772) < 0.000001);
    result3 = df.single_act_visit<double, double>("col_2", "col_2", z_score3).get_result();
    assert(result3 == 0.0);

    // Now multidimensional data
    //
    RandGenParams<double>   p;

    p.seed = 123;
    p.min_value = 0;
    p.max_value = 10.0;

    constexpr std::size_t   dim { 3 };

    using ary_col_t = std::array<double, dim>;
    using vec_col_t = std::vector<double>;

    // Generate and load 3 random columns
    //
    auto    rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * dim, p);

    StlVecType<ary_col_t>   array_col(df.get_index().size());
    StlVecType<vec_col_t>   vector_col(df.get_index().size());

    for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i)  {
        vector_col[i].resize(dim);
        for (std::size_t d { 0 }; d < dim; ++d)
            array_col[i][d] = vector_col[i][d] = rand_vec[j++];
    }
    df.load_column<ary_col_t>("array_col", std::move(array_col));
    df.load_column<vec_col_t>("vector_col", std::move(vector_col));

    ZScoreVisitor<ary_col_t>    ary_md_zs;
    ZScoreVisitor<vec_col_t>    vec_md_zs;

    const auto  &ary_res = df.single_act_visit<ary_col_t>("array_col", ary_md_zs).get_result();
    const auto  &vec_res = df.single_act_visit<vec_col_t>("vector_col", vec_md_zs).get_result();

    assert(ary_res.size() == 21);
    assert(vec_res.size() == 21);
    for (const auto &vec : ary_res)
        assert(vec.size() == dim);
    for (const auto &vec : vec_res)
        assert(vec.size() == dim);
    assert(std::fabs(ary_res[0][0] - -0.956961) < 0.000001);
    assert(std::fabs(ary_res[0][2] - 1.342) < 0.001);
    assert(std::fabs(vec_res[0][0] - -0.956961) < 0.000001);
    assert(std::fabs(vec_res[0][2] - 1.342) < 0.001);
    assert(std::fabs(ary_res[20][1] - 1.06051) < 0.00001);
    assert(std::fabs(ary_res[20][2] - 0.219589) < 0.000001);
    assert(std::fabs(vec_res[20][1] - 1.06051) < 0.00001);
    assert(std::fabs(vec_res[20][2] - 0.219589) < 0.000001);
}

C++ DataFrame