| Signature | Description |
|---|---|
// Different types of vector similarity measurement.
// The underlying type is unsigned char.
//
enum class vector_sim_type : unsigned char {

    euclidean_dist = 1,      // Euclidean distance
    manhattan_dist = 2,      // Manhattan distance
    dot_product = 3,
    simple_similarity = 4,   // Two binary vectors must be of equal length
    cosine_similarity = 5,
    jaccard_similarity = 6,  // Relatively expensive to calculate

    // Hamming distance is number of unequal members
    // Two vectors must be of equal length
    //
    hamming_dist = 7,
};
Different types of vector similarity measurements |
| Signature | Description | Parameters |
|---|---|---|
#include <DataFrame/DataFrameMLVisitors.h>

// Visitor that evaluates one vector similarity/distance measure.
//   TYP: Type of similarity (a vector_sim_type value). It is a template
//        parameter so that constexpr if clauses can be used for
//        incompatible types.
//   T:   Column data type.
//   I:   Index type.
//
template<vector_sim_type TYP, typename T, typename I = unsigned long>
struct VectorSimilarityVisitor;

// -------------------------------------

// Shorthand alias for VectorSimilarityVisitor with the same parameters.
//
template<vector_sim_type TYP, typename T, typename I = unsigned long>
using vs_v = VectorSimilarityVisitor<TYP, T, I>;
Similarity measures play a crucial role in machine learning. These measures quantify the similarity between objects, data points, or vectors in a mathematical manner. Understanding the concept of similarity in the vector space and employing appropriate measures is fundamental in solving a wide range of real-world problems. There are several similarity measures that can be used to calculate how close two vectors are in the embedding space (see vector_sim_type above). |
TYP: Type of similarity specified above. The reason for this being a template param is to take advantage of constexpr if clauses for incompatible types. T: Column data type. I: Index type. |
static void test_VectorSimilarityVisitor() { std::cout << "\nTesting VectorSimilarityVisitor { } ..." << std::endl; MyDataFrame df; StlVecType<unsigned long> idxvec = { 1UL, 2UL, 3UL, 4UL, 5UL, 6UL, 7UL, 8UL, 9UL, 10UL }; StlVecType<double> dblvec1 = { 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, 9.9, -1.2 }; StlVecType<double> dblvec2 = { 1.15, 2.18, 3.31, 4.39, 5.48, 6.5, 7.8, 8.81, 9.88, -1.4 }; StlVecType<double> dblvec3 = { 0.0, 1.1, 9.8, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8, -1.5 }; StlVecType<double> dblvec4 = { 5.9, 4.4, 1.0, 9.8, 5.3, 5.5, 3.8, 4.1, -3.3, -1.5 }; StlVecType<double> dblvec5 = { 0, 1, 1, 0, 0, 1, 1, 1, 0, 1 }; StlVecType<double> dblvec6 = { 1, 0, 1, 0, 1, 1, 0, 1, 0, 1 }; StlVecType<std::string> strvec1 = { "Today", "I", "need", "to", "learn", "about", "Jaccard", "similarity", ".", "/" }; StlVecType<std::string> strvec2 = { "Later", "I", "will", "need", "other", "things", "to", "do", "", "" }; df.load_data(std::move(idxvec), std::make_pair("dbl_col1", dblvec1), std::make_pair("dbl_col2", dblvec2), std::make_pair("dbl_col3", dblvec3), std::make_pair("dbl_col4", dblvec4), std::make_pair("dbl_col5", dblvec5), std::make_pair("dbl_col6", dblvec6), std::make_pair("str_col1", strvec1), std::make_pair("str_col2", strvec2)); VectorSimilarityVisitor<vector_sim_type::euclidean_dist, double> vs_1; df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_1); assert(std::abs(vs_1.get_result() - 0.253) < 0.0001); VectorSimilarityVisitor<vector_sim_type::manhattan_dist, double> vs_2; df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_2); assert(std::abs(vs_2.get_result() - 0.54) < 0.0001); VectorSimilarityVisitor<vector_sim_type::dot_product, double> vs_3; df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_3); assert(std::abs(vs_3.get_result() - 346.42) < 0.0001); vs_v<vector_sim_type::cosine_similarity, double> vs_4; df.single_act_visit<double, double>("dbl_col1", "dbl_col2", vs_4); assert(std::abs(vs_4.get_result() - 0.9999) < 0.0001); 
vs_v<vector_sim_type::simple_similarity, double> vs_5; df.single_act_visit<double, double>("dbl_col5", "dbl_col6", vs_5); assert(std::abs(vs_5.get_result() - -1.5) < 0.0001); vs_v<vector_sim_type::jaccard_similarity, double> vs_6; df.single_act_visit<double, double>("dbl_col3", "dbl_col4", vs_6); assert(std::abs(vs_6.get_result() - 0.25) < 0.0001); VectorSimilarityVisitor<vector_sim_type::jaccard_similarity, std::string> vs_7; df.single_act_visit<std::string, std::string>("str_col1", "str_col2", vs_7); assert(std::abs(vs_7.get_result() - 0.1765) < 0.0001); vs_v<vector_sim_type::hamming_dist, double> vs_8; df.single_act_visit<double, double>("dbl_col3", "dbl_col4", vs_8); assert(std::abs(vs_8.get_result() - 8.0) < 0.0001); vs_v<vector_sim_type::hamming_dist, std::string> vs_9; df.single_act_visit<std::string, std::string>("str_col1", "str_col2", vs_9); assert(std::abs(vs_9.get_result() - 9.0) < 0.0001); }