| Signature | Description |
|---|---|
// Baseline/normalization mode used to reduce variance in the policy
// gradient loss.  `ap` is the action probability, `r` the reward,
// `b` a user-supplied constant baseline.
//
enum class policy_loss_baseline : unsigned char  {

    none = 0,        // No baseline: loss = -log(ap) * r
    constant = 1,    // Constant baseline: loss = -log(ap) * (r - b)
    mean = 2,        // Mean baseline: loss = -log(ap) * (r - mean(r))
    standardize = 3, // Standardized: loss = -log(ap) * (r - mean(r)) / std(r)
};
Enum defining the baseline/normalization mode. It is used to reduce variance in the policy gradient loss: loss_i = -log(π(s_i|a_i)) * (r_i - baseline) |
| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameMLVisitors.h>

// Visitor computing the policy-learning loss from an action-probability
// column and a reward column.
//
// T: Column data type.  I: Index type.  A: Memory alignment boundary.
//
template<typename T, typename I = unsigned long, std::size_t A = 0>
struct PolicyLearningLossVisitor;

// -------------------------------------

// Convenience shorthand for the visitor above.
//
template<typename T, typename I = unsigned long, std::size_t A = 0>
using plloss_v = PolicyLearningLossVisitor<T, I, A>;
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface. This functor calculates the loss function of policy learning. It requires two columns: action probability and reward. The formula is: L = -log(P(state|action)) * reward. This works with both scalar and multidimensional (i.e. vectors or arrays) datasets. get_result() returns the vector of values. In case of a multidimensional column, it returns a vector of vectors, where each inner vector has the length of the data dimension and the values are computed per dimension.
explicit
PolicyLearningLossVisitor(policy_loss_baseline baseline = policy_loss_baseline::none,
std::optional
|
T: Column data type. I: Index type. A: Memory alignment boundary for vectors. Default is system default alignment. |
static void test_PolicyLearningLossVisitor() { std::cout << "\nTesting PolicyLearningLossVisitor{ } ..." << std::endl; MyDataFrame df; StlVecType<unsigned long> idxvec = { 1, 2, 3, 10, 5, 7, 8, 12, 9, 12, 10, 13, 10, 15, 14 }; StlVecType<double> dblvec = { 0.01, 0.5, 0.35, 0.1, 0.11, 0.05, 0.06, 0.03, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.08}; StlVecType<double> dblvec2 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; StlVecType<double> dblvec3 = { 0, 1, -2, 3, 4, 5, 6, 7, -8, 9, 10, -11, 12, -13, 14}; df.load_data(std::move(idxvec), std::make_pair("action_prob", dblvec), std::make_pair("reward", dblvec2), std::make_pair("dbl_col_3", dblvec3)); plloss_v<double, unsigned long, 256> pll; df.single_act_visit<double, double>("action_prob", "reward", pll); assert(std::abs(pll.get_result()[0] - 4.6052) < 0.0001); assert(std::abs(pll.get_result()[6] - 19.6939) < 0.0001); assert(std::abs(pll.get_result()[14] - 37.8859) < 0.0001); // Now multidimensional data // constexpr std::size_t dim { 3 }; using ary_col_t = std::array<double, dim>; using vec_col_t = std::vector<double>; StlVecType<ary_col_t> ary_action { { 0.9, 0.8, 0.95 }, // high confidence, all dims { 0.1, 0.05, 0.2 }, // low confidence, exercises log near 0 { 0.5, 0.5, 0.5 }, // uniform uncertainty { 0.75, 0.3, 0.6 }, // mixed confidence { 0.99, 0.99, 0.99 }, // near-certainty (log close to 0) { 0.01, 0.01, 0.01 }, // near-zero probs (exercises epsilon clamp) { 0.4, 0.7, 0.55 }, // moderate confidence { 0.6, 0.45, 0.8 }, // mixed }; StlVecType<vec_col_t> vec_action { { 0.9, 0.8, 0.95 }, // high confidence, all dims { 0.1, 0.05, 0.2 }, // low confidence, exercises log near 0 { 0.5, 0.5, 0.5 }, // uniform uncertainty { 0.75, 0.3, 0.6 }, // mixed confidence { 0.99, 0.99, 0.99 }, // near-certainty (log close to 0) { 0.01, 0.01, 0.01 }, // near-zero probs (exercises epsilon clamp) { 0.4, 0.7, 0.55 }, // moderate confidence { 0.6, 0.45, 0.8 }, // mixed }; StlVecType<ary_col_t> ary_reward { { 1.0, 2.0, 0.5 }, // 
positive reward { -1.0, -0.5, -2.0 }, // negative reward (bad action) { 0.0, 0.0, 0.0 }, // zero reward (loss should be 0) { 3.0, -1.0, 1.5 }, // mixed signs across dims { 0.1, 0.2, 0.05 }, // small positive { -3.0, -2.5, -1.0 }, // large negative (penalises near-zero probs) { 2.0, 1.0, -0.5 }, // mixed signs { -0.1, 0.8, 1.2 }, // mixed signs }; StlVecType<vec_col_t> vec_reward { { 1.0, 2.0, 0.5 }, // positive reward { -1.0, -0.5, -2.0 }, // negative reward (bad action) { 0.0, 0.0, 0.0 }, // zero reward (loss should be 0) { 3.0, -1.0, 1.5 }, // mixed signs across dims { 0.1, 0.2, 0.05 }, // small positive { -3.0, -2.5, -1.0 }, // large negative (penalises near-zero probs) { 2.0, 1.0, -0.5 }, // mixed signs { -0.1, 0.8, 1.2 }, // mixed signs }; df.load_column<ary_col_t>("ARY ACTION", std::move(ary_action), nan_policy::dont_pad_with_nans); df.load_column<vec_col_t>("VEC ACTION", std::move(vec_action), nan_policy::dont_pad_with_nans); df.load_column<ary_col_t>("ARY REWARD", std::move(ary_reward), nan_policy::dont_pad_with_nans); df.load_column<vec_col_t>("VEC REWARD", std::move(vec_reward), nan_policy::dont_pad_with_nans); plloss_v<ary_col_t, unsigned long, 256> ary_pll { policy_loss_baseline::mean }; plloss_v<vec_col_t, unsigned long, 256> vec_pll { policy_loss_baseline::mean }; df.single_act_visit<ary_col_t, ary_col_t>("ARY ACTION", "ARY REWARD", ary_pll); df.single_act_visit<vec_col_t, vec_col_t>("VEC ACTION", "VEC REWARD", vec_pll); assert(ary_pll.get_result().size() == 8); for (const auto &ary : ary_pll.get_result()) assert(ary.size() == dim); assert(std::abs(ary_pll.get_result()[0][0] - 0.079020) < 0.000001); assert(ary_pll.get_result()[2][1] > -1e-16); assert(std::abs(ary_pll.get_result()[5][1] - -11.5129) < 0.0001); assert(std::abs(ary_pll.get_result()[7][2] - 0.274745) < 0.000001); assert(vec_pll.get_result().size() == 8); for (const auto &vec : vec_pll.get_result()) assert(vec.size() == dim); assert(std::abs(vec_pll.get_result()[0][0] - 0.079020) < 
0.000001); assert(vec_pll.get_result()[2][1] > -1e-16); assert(std::abs(vec_pll.get_result()[5][1] - -11.5129) < 0.0001); assert(std::abs(vec_pll.get_result()[7][2] - 0.274745) < 0.000001); }