| Signature | Description |
|---|---|
enum class box_cox_type : unsigned char { // yλ - 1 // y(λ) = ---------- if λ != 0 // λ // // y(λ) = log(y) if λ == 0 // original = 1, // yλ - 1 // y(λ) = --------------- if λ != 0 // λ * GM(λ - 1) // // y(λ) = GM * log(y) if λ == 0 // geometric_mean = 2, // (|y| + 1)λ - 1 // y(λ) = sign(y) * ----------------- if λ != 0 // λ // // y(λ) = sign(y) * log(|y| + 1) if λ == 0 // modulus = 3, // eλy - 1 // y(λ) = ----------- if λ != 0 // λ // // y(λ) = y if λ == 0 // exponential = 4, }; |
Different Box-Cox transformation formulas to be used with BoxCoxVisitor. |
| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameStatsVisitors.h> template<typename T, typename I = unsigned long, std::size_t A = 0> struct BoxCoxVisitor; // ------------------------------------- template<typename T, typename I = unsigned long, std::size_t A = 0> using bcox_v = BoxCoxVisitor<T, I, A>; |
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This visitor implements the Box-Cox transformation. This is a power transformation to a normal distribution. It is not guaranteed to always work. The most important factor in this transformation is the power lambda factor. Lambda is usually between -5 and 5. In case of original and geometric_mean, all series values must be positive. If there are negative values, you must set the is_all_positive flag to false. In this case the visitor will shift the series. The shift value is the absolute value of the min of the series + 0.0000001. In other types, the series could have both +/- values. This works with both scalar and multidimensional (i.e. vectors or arrays) datasets. get_result() returns the vector of transformed data in case of a scalar column. In case of a multidimensional column, it returns a vector of vectors. Each inner vector is the length of data dimension. In case of a multidimensional column, the transformation is done per dimension
BoxCoxVisitor(box_cox_type bc_type,
T lambda,
bool is_all_positive);
|
T: Column data type. I: Index type. A: Memory alignment boundary for vectors. Default is system default alignment |
static void test_BoxCoxVisitor() { std::cout << "\nTesting BoxCoxVisitor{ } ..." << std::endl; using MyDataFrame = StdDataFrame<unsigned long>; const size_t item_cnt = 16; MyDataFrame df; RandGenParams<double> p; p.mean = 5.6; p.std = 0.5; p.seed = 123; p.min_value = -15; p.max_value = 30; df.load_data(MyDataFrame::gen_sequence_index(0, item_cnt, 1), std::make_pair("lognormal", gen_lognormal_dist<double>(item_cnt, p)), std::make_pair("normal", gen_normal_dist<double>(item_cnt, p)), std::make_pair("uniform_real", gen_uniform_real_dist<double>(item_cnt, p))); BoxCoxVisitor<double> bc_v1(box_cox_type::original, 1.5, true); const auto &result1 = df.single_act_visit<double>("lognormal", bc_v1).get_result(); BoxCoxVisitor<double> bc_v2(box_cox_type::original, 1.5, false); const auto &result2 = df.single_act_visit<double>("uniform_real", bc_v2).get_result(); BoxCoxVisitor<double> bc_v3(box_cox_type::modulus, -0.5, false); const auto &result3 = df.single_act_visit<double>("uniform_real", bc_v3).get_result(); BoxCoxVisitor<double> bc_v4(box_cox_type::exponential, -0.5, false); const auto &result4 = df.single_act_visit<double>("uniform_real", bc_v4).get_result(); assert(result1.size() == item_cnt); assert(result2.size() == item_cnt); assert(result3.size() == item_cnt); assert(result4.size() == item_cnt); assert(std::fabs(result1[0] - 0.870871) < 0.000001); assert(std::fabs(result1[8] - -0.047667) < 0.000001); assert(std::fabs(result1[15] - 0.059915) < 0.000001); assert(std::fabs(result2[0] - 25.9053) < 0.0001); assert(std::fabs(result2[8] - 134.035) < 0.001); assert(std::fabs(result2[15] - 177.17) < 0.01); assert(std::fabs(result3[0] - -0.55133) < 0.00001); assert(std::fabs(result3[8] - 1.58168) < 0.00001); assert(std::fabs(result3[15] - 1.63402) < 0.00001); assert(std::fabs(result4[0] - -1.14604) < 0.00001); assert(std::fabs(result4[8] - 1.99996) < 0.00001); assert(std::fabs(result4[15] - 2.0) < 0.01); // Now multidimensional data // RandGenParams<double> p2; p2.seed = 123; p2.min_value = -1.0; p2.max_value = 5.0; constexpr std::size_t dim { 3 }; using ary_col_t = std::array<double, dim>; using vec_col_t = std::vector<double>; // Generate and load 3 random columns // auto rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * dim, p2); std::vector<ary_col_t> array_col(df.get_index().size()); std::vector<vec_col_t> vector_col(df.get_index().size()); for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i) { vector_col[i].resize(dim); for (std::size_t d { 0 }; d < dim; ++d) array_col[i][d] = vector_col[i][d] = rand_vec[j++]; } df.load_column<ary_col_t>("array_col", std::move(array_col)); df.load_column<vec_col_t>("vector_col", std::move(vector_col)); BoxCoxVisitor<ary_col_t> ary_bc_m(box_cox_type::modulus, -0.5, false); const auto &ary_res_m = df.single_act_visit<ary_col_t>("array_col", ary_bc_m).get_result(); BoxCoxVisitor<vec_col_t> vec_bc_m(box_cox_type::modulus, -0.5, false); const auto &vec_res_m = df.single_act_visit<vec_col_t>("vector_col", vec_bc_m).get_result(); assert(ary_res_m.size() == item_cnt); assert(vec_res_m.size() == item_cnt); assert(std::fabs(ary_res_m[0][1] - 0.904972) < 0.000001); assert(std::fabs(ary_res_m[8][0] - 0.921917) < 0.000001); assert(std::fabs(ary_res_m[15][2] - -0.248427) < 0.000001); assert(std::fabs(vec_res_m[0][1] - 0.904972) < 0.000001); assert(std::fabs(vec_res_m[8][0] - 0.921917) < 0.000001); assert(std::fabs(vec_res_m[15][2] - -0.248427) < 0.000001); BoxCoxVisitor<ary_col_t> ary_bc_o(box_cox_type::original, -0.5, false); const auto &ary_res_o = df.single_act_visit<ary_col_t>("array_col", ary_bc_o).get_result(); BoxCoxVisitor<vec_col_t> vec_bc_o(box_cox_type::original, -0.5, false); const auto &vec_res_o = df.single_act_visit<vec_col_t>("vector_col", vec_bc_o).get_result(); assert(ary_res_o.size() == item_cnt); assert(vec_res_o.size() == item_cnt); assert(std::fabs(ary_res_o[0][1] - 0.903587) < 0.000001); assert(std::fabs(ary_res_o[8][0] - 0.920595) < 0.000001); assert(std::fabs(ary_res_o[15][2] - -0.411572) < 0.000001); assert(std::fabs(vec_res_o[0][1] - 0.903587) < 0.000001); assert(std::fabs(vec_res_o[8][0] - 0.920595) < 0.000001); assert(std::fabs(vec_res_o[15][2] - -0.411572) < 0.000001); BoxCoxVisitor<ary_col_t> ary_bc_gm( box_cox_type::geometric_mean, -0.5, false); const auto &ary_res_gm = df.single_act_visit<ary_col_t>("array_col", ary_bc_gm).get_result(); BoxCoxVisitor<vec_col_t> vec_bc_gm( box_cox_type::geometric_mean, -0.5, false); const auto &vec_res_gm = df.single_act_visit<vec_col_t>("vector_col", vec_bc_gm).get_result(); assert(ary_res_gm.size() == item_cnt); assert(vec_res_gm.size() == item_cnt); assert(std::fabs(ary_res_gm[0][1] - 0.56114) < 0.00001); assert(std::fabs(ary_res_gm[8][0] - 4.00197) < 0.00001); assert(std::fabs(ary_res_gm[15][2] - -0.844804) < 0.000001); assert(std::fabs(vec_res_gm[0][1] - 0.56114) < 0.00001); assert(std::fabs(vec_res_gm[8][0] - 4.00197) < 0.00001); assert(std::fabs(vec_res_gm[15][2] - -0.844804) < 0.000001); }