Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

// Declaration of the mutual-information visitor.
// T is the column data type, constrained here by the `arithmetic` concept;
// I is the index type and defaults to unsigned long.
template<arithmetic T, typename I = unsigned long>
struct  MutualInfoVisitor;

// ----------------------------------

// Short alias for MutualInfoVisitor<T, I>.
// NOTE(review): the alias takes an unconstrained `typename T` while the
// struct above requires `arithmetic T` -- confirm which form is intended.
template<typename T, typename I = unsigned long>
using mut_i_v = MutualInfoVisitor<T, I>;

This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This visitor calculates Mutual Information. In probability theory and information theory, the mutual information (MI) of two random variables is a measure of the mutual dependence between the two variables. More specifically, it quantifies the "amount of information" (in units such as shannons (bits), nats or hartleys) obtained about one random variable by observing the other random variable. The concept of mutual information is intimately linked to that of entropy of a random variable, a fundamental notion in information theory that quantifies the expected "amount of information" held in a random variable.
Not limited to real-valued random variables and linear dependence like the correlation coefficient, MI is more general and determines how different the joint distribution of the pair (X, Y) is from the product of the marginal distributions of X and Y. MI is the expected value of the pointwise mutual information (PMI).

This works with both scalar and multidimensional (i.e. vectors and arrays) datasets.

get_result() returns the numeric value of Mutual Information.
// Constructor. log_base is the base of the logarithm used in the
// computation; it defaults to 2.
explicit
MutualInfoVisitor(double log_base = 2);
        
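log_base: The base of the logarithm used when computing mutual information (or any entropy-related quantity). The base determines the unit of the result: base 2 gives shannons (bits), base e gives nats, and base 10 gives hartleys. The default is 2.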
T: Column data type. T must be an arithmetic-enabled type
I: Index type.
// Exercises MutualInfoVisitor through DataFrame::single_act_visit() on
// (1) integer columns and (2) multidimensional columns whose cells are
// std::vector<double> or std::array<double, 2>.
// Each visit is followed by an assert comparing get_result() against a
// hard-coded expected value within a small tolerance.
//
static void test_MutualInfoVisitor()  {

    std::cout << "\nTesting MutualInfoVisitor{ } ..." << std::endl;

    using MyDataFrame = StdDataFrame<unsigned long>;

    MyDataFrame                 df;
    std::vector<unsigned long>  idxvec = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 };
    std::vector<std::string>    strvec = { "zz", "bb", "cc", "ww", "ee", "ff", "gg", "hh", "ii", "jj", "kk", "ll", "mm", "nn", "oo" };
    RandGenParams<int>          p;

    // idxvec is handed to load_data() through std::move below, so its size
    // is recorded up front and never read from the vector after that point.
    //
    const std::size_t           row_count = idxvec.size();

    p.seed = 123;
    p.max_value = 4;
    p.min_value = -4;
    df.load_data(std::move(idxvec),
                 std::make_pair("int_col_1", gen_uniform_int_dist<int>(row_count, p)),
                 std::make_pair("str_col", strvec));

    // Second integer column: same min/max settings, different seed
    //
    p.seed = 675;
    df.load_column("int_col_2", gen_uniform_int_dist<int>(row_count, p));

    // Third integer column is derived from the first one:
    // 1 where int_col_1 is non-negative, -1 otherwise.
    // The source column is only read here, so bind it by reference.
    //
    std::vector<int>    intcol3(df.get_index().size());
    const auto          &int_col_1 = df.get_column<int>("int_col_1");

    for (std::size_t i { 0 }; i < intcol3.size(); ++i)  {
        if (int_col_1[i] >= 0)  intcol3[i] = 1;
        else  intcol3[i] = -1;
    }
    df.load_column("int_col_3", intcol3);

    MutualInfoVisitor<int>  minfo;

    // A column against itself
    //
    df.single_act_visit<int, int>("int_col_1", "int_col_1", minfo);
    assert((std::fabs(minfo.get_result() - 12.4866) < 0.0001));

    // Two columns generated with different seeds
    //
    df.single_act_visit<int, int>("int_col_1", "int_col_2", minfo);
    assert((std::fabs(minfo.get_result() - 1.81499) < 0.00001));

    // A column against its own sign column
    //
    df.single_act_visit<int, int>("int_col_1", "int_col_3", minfo);
    assert((std::fabs(minfo.get_result() - 4.24521) < 0.00001));

    // Now multidimensional data
    //
    constexpr std::size_t   dim { 2 };

    using ary_col_t = std::array<double, dim>;
    using vec_col_t = std::vector<double>;

    const std::vector<double>   a { 1.0, 0.0 };
    const std::vector<double>   b { 0.0, 1.0 };

    std::vector<vec_col_t>  vec_col_x1  { a, b, { 1.0, 1.0 } };
    std::vector<ary_col_t>  ary_col_y1  { { 1.0, 0.0 }, { 0.0, 1.0 }, { 1.0, 1.0 }, };
    std::vector<vec_col_t>  vec_col_x2 { a, a, a, b, b, b, b, a };
    std::vector<vec_col_t>  vec_col_x3 { a, a, b, b, b, b, a, a };
    std::vector<vec_col_t>  vec_col_x4 { { 4.5, 5.6 }, { 3.25, 50.6 }, { 7.6, 66.66 }, {80.1, 80.2 }, { 90.1, 90.2 }, { 33.56, 45.4 }, { 22.2, 23.2 }, { 11.1, 45.2 } };

    // These columns are shorter than the index; dont_pad_with_nans is passed
    // for each of them.
    // NOTE(review): "COL VEC3" is loaded but never visited below -- confirm
    // whether a check against it was intended.
    //
    df.load_column<vec_col_t>("COL VEC1", std::move(vec_col_x1), nan_policy::dont_pad_with_nans);
    df.load_column<ary_col_t>("COL ARY1", std::move(ary_col_y1), nan_policy::dont_pad_with_nans);
    df.load_column<vec_col_t>("COL VEC2", std::move(vec_col_x2), nan_policy::dont_pad_with_nans);
    df.load_column<vec_col_t>("COL VEC3", std::move(vec_col_x3), nan_policy::dont_pad_with_nans);
    df.load_column<vec_col_t>("COL VEC4", std::move(vec_col_x4), nan_policy::dont_pad_with_nans);

    MutualInfoVisitor<vec_col_t>    mi_vec;
    MutualInfoVisitor<ary_col_t>    mi_ary;

    // The vector-based and array-based columns hold the same three rows, so
    // both visitors are checked against the same expected value.
    //
    df.single_act_visit<vec_col_t, vec_col_t>("COL VEC1", "COL VEC1", mi_vec);
    df.single_act_visit<ary_col_t, ary_col_t>("COL ARY1", "COL ARY1", mi_ary);
    assert((std::fabs(mi_vec.get_result() - 1.58496) < 0.00001));
    assert((std::fabs(mi_ary.get_result() - 1.58496) < 0.00001));

    df.single_act_visit<vec_col_t, vec_col_t>("COL VEC2", "COL VEC2", mi_vec);
    assert((std::fabs(mi_vec.get_result() - 4.0) < 0.000000001));

    // COL VEC2 (only the rows a and b) against COL VEC4 (all rows distinct)
    //
    df.single_act_visit<vec_col_t, vec_col_t>("COL VEC2", "COL VEC4", mi_vec);
    assert((std::fabs(mi_vec.get_result() - 1.0) < 0.000000001));
}

C++ DataFrame