Back to Documentations

Signature Description
// Selects which similarity / distance measure is computed between
// two vectors.
//
enum class vector_sim_type : unsigned char  {

    // Euclidean distance
    //
    euclidean_dist = 1,

    // Manhattan distance
    //
    manhattan_dist = 2,

    // Dot product
    //
    dot_product = 3,

    // This only applies to scalar columns (not multidimensional).
    // Two binary vectors must be of equal length
    //
    simple_similarity = 4,

    // This only applies to scalar columns (not multidimensional).
    //
    cosine_similarity = 5,

    // Relatively expensive to calculate
    //
    jaccard_similarity = 6,

    // Hamming distance is number of unequal members
    // Two vectors must be of equal length
    //
    hamming_dist = 7,
};
Different types of vector similarity measurements

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

// Visitor that calculates a similarity / distance measure between two
// columns.
//
// TYP: Which measure to calculate (one of the vector_sim_type values)
// T:   Column data type
// I:   Index type
//
template<vector_sim_type TYP, typename T, typename I = unsigned long>
struct VectorSimilarityVisitor;

// -------------------------------------

// Shorthand alias for VectorSimilarityVisitor with the same template
// parameters.
//
template<vector_sim_type TYP, typename T, typename I = unsigned long>
using vs_v = VectorSimilarityVisitor<TYP, T, I>;
Similarity measures play a crucial role in machine learning. These measures quantify the similarity between objects, data points, or vectors in a mathematical manner. Understanding the concept of similarity in the vector space and employing appropriate measures is fundamental in solving a wide range of real-world problems. There are several similarity measures that can be used to calculate how close two vectors are in the embedding space (see vector_sim_type above).

This works with both scalar and multidimensional (i.e. vectors or arrays) datasets.
TYP: Type of similarity measure, one of the vector_sim_type values specified above. It is a template parameter so that the implementation can take advantage of if constexpr clauses for incompatible types.
T: Column data type.
I: Index type.
static void test_VectorSimilarityVisitor()  {

    std::cout << "\nTesting VectorSimilarityVisitor {  } ..." << std::endl;

    MyDataFrame df;

    StlVecType<unsigned long>  idxvec = { 1UL, 2UL, 3UL, 4UL, 5UL, 6UL, 7UL, 8UL, 9UL, 10UL };
    StlVecType<double>         dblvec1 = { 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, -1.2 };
    StlVecType<double>         dblvec2 = { 1.15, 2.18, 3.31, 4.39, 5.48, 6.5, 7.8, 8.81, 9.88, -1.4 };
    StlVecType<double>         dblvec3 = { 0.0, 1.1, 9.8, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, -1.5 };
    StlVecType<double>         dblvec4 = { 5.9, 4.4, 1.0, 9.8, 5.3, 5.5, 3.8, 4.1, -3.3, -1.5 };
    StlVecType<double>         dblvec5 = { 0, 1, 1, 0, 0, 1, 1, 1, 0, 1 };
    StlVecType<double>         dblvec6 = { 1, 0, 1, 0, 1, 1, 0, 1, 0, 1 };
    StlVecType<std::string>    strvec1 = { "Today", "I", "need", "to", "learn", "about", "Jaccard", "similarity", ".", "/" };
    StlVecType<std::string>    strvec2 = { "Later", "I", "will", "need", "other", "things", "to", "do", "", "" };

    df.load_data(std::move(idxvec),
                 std::make_pair("dbl_col1", dblvec1),
                 std::make_pair("dbl_col2", dblvec2),
                 std::make_pair("dbl_col3", dblvec3),
                 std::make_pair("dbl_col4", dblvec4),
                 std::make_pair("dbl_col5", dblvec5),
                 std::make_pair("dbl_col6", dblvec6),
                 std::make_pair("str_col1", strvec1),
                 std::make_pair("str_col2", strvec2));

    VectorSimilarityVisitor<vector_sim_type::euclidean_dist, double>    vs_1;

    df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_1);
    assert(std::abs(vs_1.get_result() - 0.253) < 0.0001);

    VectorSimilarityVisitor<vector_sim_type::manhattan_dist, double>    vs_2;

    df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_2);
    assert(std::abs(vs_2.get_result() - 0.54) < 0.0001);

    VectorSimilarityVisitor<vector_sim_type::dot_product, double>   vs_3;

    df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_3);
    assert(std::abs(vs_3.get_result() - 346.42) < 0.0001);

    vs_v<vector_sim_type::cosine_similarity, double>    vs_4;

    df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_4);
    assert(std::abs(vs_4.get_result() - 0.9999) < 0.0001);

    vs_v<vector_sim_type::simple_similarity, double>    vs_5;

    df.single_act_visit<double, double>("dbl_col5", "dbl_col6", vs_5);
    assert(std::abs(vs_5.get_result() - 0.05556) < 0.00001);

    vs_v<vector_sim_type::jaccard_similarity, double>   vs_6;

    df.single_act_visit<double, double>("dbl_col3", "dbl_col4", vs_6);
    assert(std::abs(vs_6.get_result() - 0.25) < 0.0001);

    VectorSimilarityVisitor<vector_sim_type::jaccard_similarity, std::string>  vs_7;

    df.single_act_visit<std::string, std::string>("str_col1", "str_col2", vs_7);
    assert(std::abs(vs_7.get_result() - 0.1875) < 0.0001);

    vs_v<vector_sim_type::hamming_dist, double> vs_8;

    df.single_act_visit<double, double>("dbl_col3", "dbl_col4", vs_8);
    assert(std::abs(vs_8.get_result() - 8.0) < 0.0001);

    vs_v<vector_sim_type::hamming_dist, std::string>    vs_9;

    df.single_act_visit<std::string, std::string>("str_col1", "str_col2", vs_9);
    assert(std::abs(vs_9.get_result() - 9.0) < 0.0001);

    // Now multidimensional data
    //
    constexpr std::size_t   dim { 3 };

    using ary_col_t = std::array<double, dim>;
    using vec_col_t = std::vector<double>;

    StlVecType<ary_col_t>   ary_1  { {1, 0, 0}, {1, 2, 3}, {1, 0, 0} };
    StlVecType<vec_col_t>   vec_1  { {1, 0, 0}, {1, 2, 3}, {1, 0, 0} };
    StlVecType<ary_col_t>   ary_2  { {1, 0, 0}, {4, 5, 6}, {0, 1, 0} };
    StlVecType<vec_col_t>   vec_2  { {1, 0, 0}, {4, 5, 6}, {0, 1, 0} };

    df.load_column<ary_col_t>("ARY 1", std::move(ary_1), nan_policy::dont_pad_with_nans);
    df.load_column<vec_col_t>("VEC 1", std::move(vec_1), nan_policy::dont_pad_with_nans);
    df.load_column<ary_col_t>("ARY 2", std::move(ary_2), nan_policy::dont_pad_with_nans);
    df.load_column<vec_col_t>("VEC 2", std::move(vec_2), nan_policy::dont_pad_with_nans);

    vs_v<vector_sim_type::euclidean_dist, ary_col_t>    ary_md_ed;
    vs_v<vector_sim_type::euclidean_dist, vec_col_t>    vec_md_ed;

    df.single_act_visit<ary_col_t, ary_col_t>("ARY 1", "ARY 2", ary_md_ed);
    df.single_act_visit<vec_col_t, vec_col_t>("VEC 1", "VEC 2", vec_md_ed);
    assert(std::abs(ary_md_ed.get_result() - 6.61037) < 0.00001);
    assert(std::abs(vec_md_ed.get_result() - 6.61037) < 0.00001);

    vs_v<vector_sim_type::manhattan_dist, ary_col_t>    ary_md_md;
    vs_v<vector_sim_type::manhattan_dist, vec_col_t>    vec_md_md;

    df.single_act_visit<ary_col_t, ary_col_t>("ARY 1", "ARY 2", ary_md_md);
    df.single_act_visit<vec_col_t, vec_col_t>("VEC 1", "VEC 2", vec_md_md);
    assert(std::abs(ary_md_md.get_result() - 11.0) < 0.0001);
    assert(std::abs(vec_md_md.get_result() - 11.0) < 0.0001);

    vs_v<vector_sim_type::dot_product, ary_col_t>   ary_md_dp;
    vs_v<vector_sim_type::dot_product, vec_col_t>   vec_md_dp;

    df.single_act_visit<ary_col_t, ary_col_t>("ARY 1", "ARY 2", ary_md_dp);
    df.single_act_visit<vec_col_t, vec_col_t>("VEC 1", "VEC 2", vec_md_dp);
    assert(std::abs(ary_md_dp.get_result() - 33.0) < 0.0001);
    assert(std::abs(vec_md_dp.get_result() - 33.0) < 0.0001);

    vs_v<vector_sim_type::jaccard_similarity, ary_col_t>    ary_md_js;
    vs_v<vector_sim_type::jaccard_similarity, vec_col_t>    vec_md_js;

    df.single_act_visit<ary_col_t, ary_col_t>("ARY 1", "ARY 2", ary_md_js);
    df.single_act_visit<vec_col_t, vec_col_t>("VEC 1", "VEC 2", vec_md_js);
    assert(std::abs(ary_md_js.get_result() - 0.25) < 0.0001);
    assert(std::abs(vec_md_js.get_result() - 0.25) < 0.0001);
}

C++ DataFrame