Back to Documentations

Signature Description
// Selects which similarity / distance measure is calculated between
// two vectors. Stored as an unsigned char; enumerators are numbered 1 to 7.
//
enum class vector_sim_type : unsigned char  {

    // Euclidean distance
    //
    euclidean_dist = 1,

    // Manhattan distance
    //
    manhattan_dist = 2,

    // Dot product
    //
    dot_product = 3,

    // Two binary vectors must be of equal length
    //
    simple_similarity = 4,

    // Cosine similarity
    //
    cosine_similarity = 5,

    // Relatively expensive to calculate
    //
    jaccard_similarity = 6,

    // Hamming distance is the number of unequal members.
    // Two vectors must be of equal length
    //
    hamming_dist = 7,
};
Different types of vector similarity measurements

Signature Description Parameters
#include <DataFrame/DataFrameMLVisitors.h>

// Visitor for the vector similarity measure selected by TYP.
//   TYP: Type of similarity (a vector_sim_type value)
//   T:   Column data type
//   I:   Index type; defaults to unsigned long
//
template<vector_sim_type TYP, typename T, typename I = unsigned long>
struct VectorSimilarityVisitor;

// -------------------------------------

// Shorthand alias for VectorSimilarityVisitor; takes the same template
// parameters with the same default for I
//
template<vector_sim_type TYP, typename T, typename I = unsigned long>
using vs_v = VectorSimilarityVisitor<TYP, T, I>;
Similarity measures play a crucial role in machine learning. These measures quantify the similarity between objects, data points, or vectors in a mathematical manner. Understanding the concept of similarity in the vector space and employing appropriate measures is fundamental in solving a wide range of real-world problems. There are several similarity measures that can be used to calculate how close two vectors are in the embedding space (see vector_sim_type above).
TYP: Type of similarity, one of the vector_sim_type values specified above. It is a template parameter so that the implementation can take advantage of constexpr if clauses for incompatible types.
T: Column data type.
I: Index type.
static void test_VectorSimilarityVisitor()  {

    std::cout << "\nTesting VectorSimilarityVisitor {  } ..." << std::endl;

    MyDataFrame df;

    StlVecType<unsigned long>  idxvec = { 1UL, 2UL, 3UL, 4UL, 5UL, 6UL, 7UL, 8UL, 9UL, 10UL };
    StlVecType<double>         dblvec1 = { 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, -1.2 };
    StlVecType<double>         dblvec2 = { 1.15, 2.18, 3.31, 4.39, 5.48, 6.5, 7.8, 8.81, 9.88, -1.4 };
    StlVecType<double>         dblvec3 = { 0.0, 1.1, 9.8, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, -1.5 };
    StlVecType<double>         dblvec4 = { 5.9, 4.4, 1.0, 9.8, 5.3, 5.5, 3.8, 4.1, -3.3, -1.5 };
    StlVecType<double>         dblvec5 = { 0, 1, 1, 0, 0, 1, 1, 1, 0, 1 };
    StlVecType<double>         dblvec6 = { 1, 0, 1, 0, 1, 1, 0, 1, 0, 1 };
    StlVecType<std::string>    strvec1 = { "Today", "I", "need", "to", "learn", "about", "Jaccard", "similarity", ".", "/" };
    StlVecType<std::string>    strvec2 = { "Later", "I", "will", "need", "other", "things", "to", "do", "", "" };

    df.load_data(std::move(idxvec),
                 std::make_pair("dbl_col1", dblvec1),
                 std::make_pair("dbl_col2", dblvec2),
                 std::make_pair("dbl_col3", dblvec3),
                 std::make_pair("dbl_col4", dblvec4),
                 std::make_pair("dbl_col5", dblvec5),
                 std::make_pair("dbl_col6", dblvec6),
                 std::make_pair("str_col1", strvec1),
                 std::make_pair("str_col2", strvec2));

    VectorSimilarityVisitor<vector_sim_type::euclidean_dist, double>    vs_1;

    df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_1);
    assert(std::abs(vs_1.get_result() - 0.253) < 0.0001);

    VectorSimilarityVisitor<vector_sim_type::manhattan_dist, double>    vs_2;

    df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_2);
    assert(std::abs(vs_2.get_result() - 0.54) < 0.0001);

    VectorSimilarityVisitor<vector_sim_type::dot_product, double>   vs_3;

    df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_3);
    assert(std::abs(vs_3.get_result() - 346.42) < 0.0001);

    vs_v<vector_sim_type::cosine_similarity, double>    vs_4;

    df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_4);
    assert(std::abs(vs_4.get_result() - 0.9999) < 0.0001);

    vs_v<vector_sim_type::simple_similarity, double>    vs_5;

    df.single_act_visit<double, double>("dbl_col5", "dbl_col6", vs_5);
    assert(std::abs(vs_5.get_result() - -1.5) < 0.0001);

    vs_v<vector_sim_type::jaccard_similarity, double>   vs_6;

    df.single_act_visit<double, double>("dbl_col3", "dbl_col4", vs_6);
    assert(std::abs(vs_6.get_result() - 0.25) < 0.0001);

    VectorSimilarityVisitor<vector_sim_type::jaccard_similarity, std::string>  vs_7;

    df.single_act_visit<std::string, std::string>("str_col1", "str_col2", vs_7);
    assert(std::abs(vs_7.get_result() - 0.1765) < 0.0001);

    vs_v<vector_sim_type::hamming_dist, double> vs_8;

    df.single_act_visit<double, double>("dbl_col3", "dbl_col4", vs_8);
    assert(std::abs(vs_8.get_result() - 8.0) < 0.0001);

    vs_v<vector_sim_type::hamming_dist, std::string>    vs_9;

    df.single_act_visit<std::string, std::string>("str_col1", "str_col2", vs_9);
    assert(std::abs(vs_9.get_result() - 9.0) < 0.0001);
}

C++ DataFrame