| Signature | Description |
|---|---|
enum class prob_dist_type : unsigned char { // No negative values // yi = xi / sum(X) // arithmetic = 1, // All values must be >= 1 // yi = loge(xi) / sum(loge(X)) // log = 2, // yi = e^xi / sum(e^X) // softmax = 3, // yi = 2^xi / sum(2^X) // pow2 = 4, // yi = 10^xi / sum(10^X) // pow10 = 5, }; |
This specifies how to convert a vector of values to a probability distribution. Values in a probability distribution are between 0 and 1 and they all sum up to 1. Note that some methods do not work with negative values or zero. |
| Signature | Description |
|---|---|
enum class normalization_type : unsigned char { none = 0, // These only apply to scalar data // simple = 1, // V / sum(xi) euclidean = 2, // V / sqrt(sum(xi^2)) maxi = 3, // V / max(xi) decimal_scaling = 4, // V / 10^max(xi) order log_transform = 5, // ln(xi) root_transform = 6, // sqrt(xi) // These apply to both scalar and multidimensional data // z_score = 7, // (V - μ(V)) / σ(V) min_max = 8, // (V - min(V)) / (max(V) - min(V)) // These only apply to multidimensional data // unit_length = 9, // Divide the points by their norms flat_vector = 10, // Normalize entire collection as one flattened vector }; |
These are different kinds of normalization supported in NormalizeVisitor. maxi and decimal_scaling produce similar results. |
| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameStatsVisitors.h> template<typename T, typename I = unsigned long, std::size_t A = 0> struct ProbabilityDistVisitor; // ------------------------------------- template<typename T, typename I = unsigned long, std::size_t A = 0> using pd_v = ProbabilityDistVisitor<T, I, A>; |
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This class converts the given column of values into a probability distribution based on one of the above methods. Values in a probability distribution are between 0 and 1 and they all add up to 1. Please note that some of these methods require that all values be non-negative, or that all values be greater than or equal to 1. If you don't adhere to the requirement, you will get garbage results.
explicit
ProbabilityDistVisitor(prob_dist_type pdtype);
|
T: Column data type I: Index type A: Memory alignment boundary for vectors. Default is system default alignment |
#include <DataFrame/DataFrameStatsVisitors.h> template<typename T, typename I = unsigned long, std::size_t A = 0> struct NormalizeVisitor; // ------------------------------------- template<typename T, typename I = unsigned long, std::size_t A = 0> using norm_v = NormalizeVisitor<T, I, A>; |
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This functor class normalizes column data using the method specified in the constructor. This works with both scalar and multidimensional (vector of vectors or vector of arrays) datasets. NOTE: If your normalization type does not match with your dataset type, you get an exception. See normalization types above.
explicit
NormalizeVisitor(normalization_type t = normalization_type::min_max);
|
T: Column data type I: Index type A: Memory alignment boundary for vectors. Default is system default alignment |
#include <DataFrame/DataFrameStatsVisitors.h> template<typename T, typename I = unsigned long, std::size_t A = 0> struct StandardizeVisitor; // ------------------------------------- template<typename T, typename I = unsigned long, std::size_t A = 0> using stand_v = StandardizeVisitor<T, I, A>; |
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This functor class standardizes column data by doing y = (x - mean) / std. |
T: Column data type I: Index type A: Memory alignment boundary for vectors. Default is system default alignment |
static void test_ProbabilityDistVisitor() { std::cout << "\nTesting ProbabilityDistVisitor{ } ..." << std::endl; MyDataFrame df; StlVecType<unsigned long> idxvec = { 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 }; StlVecType<double> dblvec = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; StlVecType<double> dblvec2 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; StlVecType<double> dblvec3 = { 0, 1, -2, 3, 4, 5, 6, 7, -8, 9, 10, -11, 12, -13, 14}; df.load_data(std::move(idxvec), std::make_pair("dbl_col", dblvec), std::make_pair("dbl_col_2", dblvec2), std::make_pair("dbl_col_3", dblvec3)); pd_v<double, unsigned long, 256> pd { prob_dist_type::arithmetic }; double sum { 0 }; df.single_act_visit<double>("dbl_col", pd); assert(pd.get_result().size() == 15); for (const auto val : pd.get_result()) { assert(val >= 0 && val <= 1.0); sum += val; } assert(std::abs(sum - 1.0) < 0.0001); pd_v<double, unsigned long, 256> pd2 { prob_dist_type::log }; df.single_act_visit<double>("dbl_col_2", pd2); assert(pd2.get_result().size() == 15); sum = 0; for (const auto val : pd2.get_result()) { assert(val >= 0 && val <= 1.0); sum += val; } assert(std::abs(sum - 1.0) < 0.0001); pd_v<double, unsigned long, 256> pd3 { prob_dist_type::softmax }; df.single_act_visit<double>("dbl_col_3", pd3); assert(pd3.get_result().size() == 15); sum = 0; for (const auto val : pd3.get_result()) { assert(val >= 0 && val <= 1.0); sum += val; } assert(std::abs(sum - 1.0) < 0.0001); pd_v<double, unsigned long, 256> pd4 { prob_dist_type::pow2 }; df.single_act_visit<double>("dbl_col_3", pd4); assert(pd4.get_result().size() == 15); sum = 0; for (const auto val : pd4.get_result()) { assert(val >= 0 && val <= 1.0); sum += val; } assert(std::abs(sum - 1.0) < 0.0001); pd_v<double, unsigned long, 256> pd5 { prob_dist_type::pow10 }; df.single_act_visit<double>("dbl_col_3", pd5); assert(pd5.get_result().size() == 15); sum = 0; for (const auto val : pd5.get_result()) { assert(val >= 0 && val <= 1.0); sum 
+= val; } assert(std::abs(sum - 1.0) < 0.0001); } // ----------------------------------------------------------------------------- static void test_NormalizeVisitor() { std::cout << "\nTesting NormalizeVisitor{ } ..." << std::endl; StlVecType<unsigned long> ulgvec2 = { 123450, 123451, 123452, 123450, 123455, 123450, 123449, 123450, 123451, 123450, 123452, 123450, 123455, 123450, 123454, 123450, 123450, 123457, 123458, 123459, 123450, 123441, 123442, 123432, 123450, 123450, 123435, 123450 }; StlVecType<double> dblvec = { 1.2345, 2.2345, 3.2345, 4.2345, 5.2345, 3.0, 0.9999, 10.0, 4.25, 0.009, 8.0, 2.2222, 3.3333, 15.6, 11.0, 5.25, 1.009, 2.111, 9.0, 3.2222, 4.3333, 12.0, 6.25, 2.009, 3.111, 10.0, 4.2222, 5.3333 }; MyDataFrame df; df.load_data(std::move(ulgvec2), std::make_pair("dbl_col", dblvec)); // Do various Normalize testing // NormalizeVisitor<double, unsigned long, 64> norm_v; // min_max method auto result = df.single_act_visit<double>("dbl_col", norm_v).get_result(); StlVecType<double> norm_result = { 0.078603, 0.142743, 0.206882, 0.271022, 0.335161, 0.191841, 0.0635559, 0.640818, 0.272016, 0, 0.512539, 0.141954, 0.213219, 1, 0.704958, 0.336155, 0.0641396, 0.134821, 0.576679, 0.206093, 0.277359, 0.769098, 0.400295, 0.128279, 0.198961, 0.640818, 0.270233, 0.341498, }; double sum { 0 }; for (size_t i = 0; i < result.size(); ++i) assert(fabs(result[i] - norm_result[i]) < 0.00001); for (const auto &r : result) sum += r; assert(fabs(sum - 9.11974) < 0.00001); // NormalizeVisitor<double, unsigned long, 64> norm_simple { normalization_type::simple }; StlVecType<double> result_simple = { 0.00866693, 0.0156875, 0.0227081, 0.0297287, 0.0367493, 0.0210618, 0.0070199, 0.070206, 0.0298376, 6.31854e-05, 0.0561648, 0.0156012, 0.0234018, 0.109521, 0.0772266, 0.0368582, 0.00708379, 0.0148205, 0.0631854, 0.0226218, 0.0304224, 0.0842472, 0.0438788, 0.0141044, 0.0218411, 0.070206, 0.0296424, 0.037443 }; result = df.single_act_visit<double>("dbl_col", norm_simple).get_result(); 
for (size_t i = 0; i < result.size(); ++i) assert(fabs(result[i] - result_simple[i]) < 0.00001); sum = 0; for (const auto &r : result) sum += r; assert(sum == 1.0); // NormalizeVisitor<double, unsigned long, 64> norm_euclidean { normalization_type::euclidean }; StlVecType<double> result_euclidean = { 0.0368709, 0.0667379, 0.0966049, 0.126472, 0.156339, 0.0896011, 0.029864, 0.29867, 0.126935, 0.000268803, 0.238936, 0.0663705, 0.0995558, 0.465926, 0.328537, 0.156802, 0.0301358, 0.0630493, 0.268803, 0.0962376, 0.129423, 0.358404, 0.186669, 0.0600029, 0.0929163, 0.29867, 0.126105, 0.15929 }; result = df.single_act_visit<double>("dbl_col", norm_euclidean).get_result(); for (size_t i = 0; i < result.size(); ++i) assert(fabs(result[i] - result_euclidean[i]) < 0.00001); sum = 0; for (const auto &r : result) sum += r; assert(fabs(sum - 4.2542) < 0.0001); // NormalizeVisitor<double, unsigned long, 64> norm_maxi { normalization_type::maxi }; StlVecType<double> result_maxi = { 0.0791346, 0.143237, 0.20734, 0.271442, 0.335545, 0.192308, 0.0640962, 0.641026, 0.272436, 0.000576923, 0.512821, 0.142449, 0.213673, 1, 0.705128, 0.336538, 0.0646795, 0.135321, 0.576923, 0.206551, 0.277776, 0.769231, 0.400641, 0.128782, 0.199423, 0.641026, 0.270654, 0.341878 }; result = df.single_act_visit<double>("dbl_col", norm_maxi).get_result(); for (size_t i = 0; i < result.size(); ++i) assert(fabs(result[i] - result_maxi[i]) < 0.00001); sum = 0; for (const auto &r : result) sum += r; assert(fabs(sum - 9.13063) < 0.00001); // NormalizeVisitor<double, unsigned long, 64> norm_z_score { normalization_type::z_score }; StlVecType<double> result_z_score = { -1.00542, -0.744444, -0.48347, -0.222497, 0.0384758, -0.544669, -1.06664, 1.28214, -0.218452, -1.32524, 0.760197, -0.747654, -0.457686, 2.74359, 1.54312, 0.0425209, -1.06427, -0.776674, 1.02117, -0.48668, -0.196713, 1.80409, 0.303494, -0.803293, -0.515701, 1.28214, -0.225707, 0.06426 }; result = df.single_act_visit<double>("dbl_col", 
norm_z_score).get_result(); for (size_t i = 0; i < result.size(); ++i) assert(fabs(result[i] - result_z_score[i]) < 0.00001); sum = 0; for (const auto &r : result) sum += r; assert(fabs(sum - 0.0) < 0.000000000001); // NormalizeVisitor<double, unsigned long, 64> norm_decimal_scaling { normalization_type::decimal_scaling }; StlVecType<double> result_decimal_scaling = { 0.00791346, 0.0143237, 0.020734, 0.0271442, 0.0335545, 0.0192308, 0.00640962, 0.0641026, 0.0272436, 5.76923e-05, 0.0512821, 0.0142449, 0.0213673, 0.1, 0.0705128, 0.0336538, 0.00646795, 0.0135321, 0.0576923, 0.0206551, 0.0277776, 0.0769231, 0.0400641, 0.0128782, 0.0199423, 0.0641026, 0.0270654, 0.0341878 }; result = df.single_act_visit<double>("dbl_col", norm_decimal_scaling).get_result(); for (size_t i = 0; i < result.size(); ++i) assert(fabs(result[i] - result_decimal_scaling[i]) < 0.00001); sum = 0; for (const auto &r : result) sum += r; assert(fabs(sum - 0.91306) < 0.00001); // NormalizeVisitor<double, unsigned long, 64> norm_log_transform { normalization_type::log_transform }; StlVecType<double> result_log_transform = { 0.210666, 0.804017, 1.17387, 1.44327, 1.65527, 1.09861, -0.000100005, 2.30259, 1.44692, -4.71053, 2.07944, 0.798498, 1.20396, 2.74727, 2.3979, 1.65823, 0.00895974, 0.747162, 2.19722, 1.17006, 1.46633, 2.48491, 1.83258, 0.697637, 1.13494, 2.30259, 1.44036, 1.67397 }; result = df.single_act_visit<double>("dbl_col", norm_log_transform).get_result(); for (size_t i = 0; i < result.size(); ++i) assert(fabs(result[i] - result_log_transform[i]) < 0.00001); sum = 0; for (const auto &r : result) sum += r; assert(fabs(sum - 33.4666) < 0.0001); // NormalizeVisitor<double, unsigned long, 64> norm_root_transform { normalization_type::root_transform }; StlVecType<double> result_root_transform = { 1.11108, 1.49482, 1.79847, 2.05779, 2.2879, 1.73205, 0.99995, 3.16228, 2.06155, 0.0948683, 2.82843, 1.4907, 1.82573, 3.94968, 3.31662, 2.29129, 1.00449, 1.45293, 3, 1.79505, 2.08166, 3.4641, 2.5, 
1.41739, 1.7638, 3.16228, 2.0548, 2.30939 }; result = df.single_act_visit<double>("dbl_col", norm_root_transform).get_result(); for (size_t i = 0; i < result.size(); ++i) assert(fabs(result[i] - result_root_transform[i]) < 0.00001); sum = 0; for (const auto &r : result) sum += r; assert(fabs(sum - 58.5091) < 0.0001); // Now do Standardize testing // StandardizeVisitor<double, unsigned long, 64> stand_v; StlVecType<double> stand_result = { -1.00542, -0.744444, -0.48347, -0.222497, 0.0384758, -0.544669, -1.06664, 1.28214, -0.218452, -1.32524, 0.760197, -0.747654, -0.457686, 2.74359, 1.54312, 0.0425209, -1.06427, -0.776674, 1.02117, -0.48668, -0.196713, 1.80409, 0.303494, -0.803293, -0.515701, 1.28214, -0.225707, 0.06426 }; result = df.single_act_visit<double>("dbl_col", stand_v).get_result(); for (size_t i = 0; i < result.size(); ++i) assert(fabs(result[i] - stand_result[i]) < 0.00001); // Now multidimensional data // RandGenParams<double> p; p.seed = 123; p.min_value = -20.0; p.max_value = 20.0; using col_t = std::array<double, 3>; const auto rand_vec = gen_uniform_real_dist<double, 64>(df.get_index().size() * 3, p); StlVecType<col_t> multi_dimen_col(df.get_index().size()); for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i) { multi_dimen_col[i][0] = rand_vec[j++]; multi_dimen_col[i][1] = rand_vec[j++]; multi_dimen_col[i][2] = rand_vec[j++]; } df.load_column<col_t>("multi_dimen_col", std::move(multi_dimen_col)); NormalizeVisitor<col_t, unsigned long, 64> md_norm1 { normalization_type::min_max }; df.single_act_visit<col_t>("multi_dimen_col", md_norm1); assert(md_norm1.get_result().size() == 28); assert(std::fabs(md_norm1.get_result()[0][1] - 0.582342) < 0.000001); assert(std::fabs(md_norm1.get_result()[9][2] - 1.0) < 0.00001); assert(std::fabs(md_norm1.get_result()[9][1] - 0.0) < 0.00001); assert(std::fabs(md_norm1.get_result()[22][2] - 0.363817) < 0.00001); assert(std::fabs(md_norm1.get_result()[27][2] - 0.650224) < 0.000001); NormalizeVisitor<col_t, 
unsigned long, 64> md_norm2 { normalization_type::z_score }; df.single_act_visit<col_t>("multi_dimen_col", md_norm2); assert(md_norm2.get_result().size() == 28); assert(std::fabs(md_norm2.get_result()[0][1] - 0.15879) < 0.00001); assert(std::fabs(md_norm2.get_result()[9][2] - 1.53805) < 0.00001); assert(std::fabs(md_norm2.get_result()[9][1] - -1.74383) < 0.00001); assert(std::fabs(md_norm2.get_result()[22][2] - -0.440033) < 0.000001); assert(std::fabs(md_norm2.get_result()[27][2] - 0.450492) < 0.000001); NormalizeVisitor<col_t, unsigned long, 64> md_norm3 { normalization_type::unit_length }; df.single_act_visit<col_t>("multi_dimen_col", md_norm3); assert(md_norm3.get_result().size() == 28); assert(std::fabs(md_norm3.get_result()[0][1] - 0.116694) < 0.000001); assert(std::fabs(md_norm3.get_result()[9][2] - 0.630255) < 0.000001); assert(std::fabs(md_norm3.get_result()[9][1] - -0.713796) < 0.000001); assert(std::fabs(md_norm3.get_result()[22][2] - -0.253627) < 0.000001); assert(std::fabs(md_norm3.get_result()[27][2] - 0.374604) < 0.000001); NormalizeVisitor<col_t, unsigned long, 64> md_norm4 { normalization_type::flat_vector }; df.single_act_visit<col_t>("multi_dimen_col", md_norm4); assert(md_norm4.get_result().size() == 28); assert(std::fabs(md_norm4.get_result()[0][1] - 0.021617) < 0.000001); assert(std::fabs(md_norm4.get_result()[9][2] - 0.170005) < 0.000001); assert(std::fabs(md_norm4.get_result()[9][1] - -0.19254) < 0.00001); assert(std::fabs(md_norm4.get_result()[22][2] - -0.059508) < 0.000001); assert(std::fabs(md_norm4.get_result()[27][2] - 0.043818) < 0.000001); }