Back to Documentations

Signature Description
enum class  gen_join_type : unsigned char  {

    no_match = 1,      // Don't include any of the join participants 
    include_left = 2,  // Include the LHS
    include_right = 3, // Include the RHS
    include_both = 4,  // Include both join participants
};
Enumerated type to specify result of a predicate for join.


Signature Description Parameters
template<typename RHS_T,
         comparable LHS_COL_T, comparable RHS_COL_T,
         typename ... Ts>
DataFrame<unsigned long, H>
gen_join(const RHS_T &rhs,
         const char *lhs_col_name,
         const char *rhs_col_name,
         std::function<gen_join_type(
             const IndexType &,
             const typename RHS_T::IndexType &,
             const LHS_COL_T &,
             const RHS_COL_T &)> &&predicate) const;
This is the most general method to join two DataFrames. It requires the name of two columns, one from self (lhs) and one from rhs. The columns may or may not have the same type. It also takes a function called predicate. Datapoints from both self and rhs indices and the two columns are passed to predicate one by one.

NOTE: The datapoints are passed to predicate in the same order that they are. So DataFrames' order (sorting) and predicate logic must match.
NOTE: The columns are processed until the minimum length of the two columns. If you have columns of different length, you may consider calling make_consistent() before joining.
NOTE: All same name columns in lhs and rhs will have lhs. and rhs. prefixes in their names in the returned DataFrame.
NOTE: The result DataFrame will at least have two column names lhs.INDEX and rhs.INDEX containing the lhs and rhs indices datapoints

The predicate has the following parameters:
  1. A const reference to the index datapoint of self
  2. A const reference to the index datapoint of rhs
  3. A const reference to the lhs (self) column datapoint
  4. A const reference to the rhs column datapoint

NOTE: This join is done by what is called in the industry a table-scan
RHS_T: Type of the rhs DataFrame
LHS_COL_T: Type of the lhs column
RHS_COL_T: Type of the rhs column
Ts: List all the types of all data columns. A type should be specified in the list only once.
rhs: rhs DataFrame
lhs_col_name: lhs (self) column name. It can be any of the data column names or DF_INDEX_COL_NAME
rhs_col_name: rhs column name. It can be any of the data column names or DF_INDEX_COL_NAME
predicate: A function described above that determines the result
static void test_gen_join()  {

    std::cout << "\nTesting gen_join( ) ..." << std::endl;

    std::vector<unsigned long>  idx = { 123450, 123451, 123452, 123453, 123454, 123455, 123456, 123457, 123458, 123459, 123460, 123461, 123462, 123466 };
    std::vector<double>         d1 = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 };
    std::vector<double>         d2 = { 8, 9, 10, 11, 12, 13, 14, 20, 22, 23, 30, 31, 32, 1.89 };
    std::vector<double>         d3 = { 15, 16, 15, 18, 19, 16, 21, 0.34, 1.56, 0.34, 2.3, 0.34, 19.0 };
    std::vector<int>            i1 = { 22, 23, 24, 25, 99 };
    MyDataFrame                 df;

    df.load_data(std::move(idx),
                 std::make_pair("col_1", d1),
                 std::make_pair("col_2", d2),
                 std::make_pair("col_3", d3),
                 std::make_pair("col_4", i1));

    auto    vw = df.get_view<double, int>( { "col_1", "col_2", "col_3", "col_4" });

    std::vector<unsigned long>  idx2 = { 123452, 123453, 123455, 123458, 123466, 223450, 223451, 223454, 223456, 223457, 223459, 223460, 223461, 223462 };
    std::vector<double>         d12 = { 11, 12, 13, 14, 15, 16, 17, 18, 19, 110, 111, 112, 113, 114 };
    std::vector<double>         d22 = { 8, 19, 110, 111, 9, 113, 114, 99, 122, 123, 130, 131, 20, 11.89 };
    std::vector<double>         d32 = { 115, 116, 115, 118, 119, 116, 121, 10.34, 11.56, 10.34, 12.3, 10.34, 119.0 };
    std::vector<int>            i12 = { 122, 123, 124, 125, 199 };
    MyDataFrame                 df2;

    df2.load_data(std::move(idx2),
                  std::make_pair("xcol_1", d12),
                  std::make_pair("col_2", d22),
                  std::make_pair("xcol_3", d32),
                  std::make_pair("col_4", i12));

    auto    vw2 = df2.get_view<double, int>( { "xcol_1", "col_2", "xcol_3", "col_4" });

    auto    predicate =
        [](const unsigned long &, const unsigned long &,
           const double &lhs_val, const double &rhs_val) -> gen_join_type  {
            if (lhs_val == rhs_val)
                return (gen_join_type::include_both);
            return (gen_join_type::no_match);
        };

    df.write<std::ostream, double, int>(std::cout, io_format::pretty_prt, { .precision = 2 });
    std::cout << "\n\n\n";

    df2.write<std::ostream, double, int>(std::cout, io_format::pretty_prt, { .precision = 2 });
    std::cout << "\n\n\n";

    auto    inner_result = df.gen_join<decltype(df2), double, double, double, int>(df2, "col_2", "col_2", predicate);
    auto    inner_result_vw = vw.gen_join<decltype(df2), double, double, double, int>(df2, "col_2", "col_2", predicate);

    inner_result.write<std::ostream, double, int, unsigned long>(std::cout, io_format::pretty_prt, { .precision = 2 });
    std::cout << "\n\n\n";

    assert(inner_result.get_index().size() == 1);
    assert(inner_result.get_column<double>("xcol_1")[0] == 11.0);
    assert(inner_result.get_column<double>("xcol_3")[0] == 115.0);
    assert(inner_result.get_column<int>("lhs.col_4")[0] == 22);
    assert(inner_result.get_column<unsigned long>("rhs.INDEX")[0] == 123452);
    assert(inner_result.get_column<unsigned long>("lhs.INDEX")[0] == 123450);

    assert(inner_result_vw.get_index().size() == 1);
    assert(inner_result_vw.get_column<double>("col_1")[0] == 1.0);
    assert(inner_result_vw.get_column<int>("lhs.col_4")[0] == 22);
    assert(inner_result_vw.get_column<unsigned long>("rhs.INDEX")[0] == 123452);

    auto    predicate2 =
        [](const unsigned long &, const unsigned long &,
           const double &lhs_val, const double &rhs_val) -> gen_join_type  {
            if (lhs_val == rhs_val)
                return (gen_join_type::include_both);
            return (gen_join_type::include_right);
        };

    auto    result_vw2 = vw.gen_join<decltype(df2), double, double, double, int>(df2, "col_2", "col_2", predicate2);

    result_vw2.write<std::ostream, double, int, unsigned long>(std::cout, io_format::pretty_prt, { .precision = 2 });
    std::cout << "\n\n\n";

    assert(result_vw2.get_index().size() == 14);
    assert(result_vw2.get_column<double>("xcol_1")[0] == 11.0);
    assert(result_vw2.get_column<double>("xcol_1")[7] == 18.0);
    assert(result_vw2.get_column<double>("xcol_1")[13] == 114.0);
    assert(result_vw2.get_column<double>("xcol_3")[0] == 115.0);
    assert(result_vw2.get_column<double>("xcol_3")[10] == 12.3);
    assert(result_vw2.get_column<int>("lhs.col_4")[0] == 22);
    assert(result_vw2.get_column<int>("lhs.col_4")[6] == 0);
    assert(result_vw2.get_column<int>("lhs.col_4")[12] == 0);
    assert(result_vw2.get_column<int>("rhs.col_4")[0] == 122);
    assert(result_vw2.get_column<int>("rhs.col_4")[6] == 0);
    assert(result_vw2.get_column<int>("rhs.col_4")[12] == 0);
    assert(result_vw2.get_column<unsigned long>("rhs.INDEX")[0] == 123452);
    assert(result_vw2.get_column<unsigned long>("lhs.INDEX")[0] == 123450);
    assert(result_vw2.get_column<unsigned long>("lhs.INDEX")[8] == 0 );

    auto    predicate3 =
        [](const unsigned long &, const unsigned long &,
           const int &col_4, const double &xcol_1) -> gen_join_type  {
            if ((col_4 < 23 && col_4 != 0) || xcol_1 > 112.0)
                return (gen_join_type::include_both);
            return (gen_join_type::no_match);
        };

    auto    result_vw3 = vw.gen_join<MyDataFrame, int, double, double, int>(df2, "col_4", "xcol_1", predicate3);

    result_vw3.write<std::ostream, double, int, unsigned long>(std::cout, io_format::pretty_prt, { .precision = 2 });
    std::cout << "\n\n\n";

    assert(result_vw3.get_index().size() == 3);
    assert(result_vw3.get_column<double>("xcol_1")[0] == 11.0);
    assert(result_vw3.get_column<double>("xcol_1")[1] == 113.0);
    assert(result_vw3.get_column<double>("xcol_1")[2] == 114.0);
    assert(result_vw3.get_column<unsigned long>("lhs.INDEX")[0] == 123450);
    assert(result_vw3.get_column<unsigned long>("lhs.INDEX")[1] == 123462);
    assert(result_vw3.get_column<unsigned long>("lhs.INDEX")[2] == 123466);
    assert(result_vw3.get_column<unsigned long>("rhs.INDEX")[0] == 123452);
    assert(result_vw3.get_column<unsigned long>("rhs.INDEX")[1] == 223461);
    assert(result_vw3.get_column<unsigned long>("rhs.INDEX")[2] == 223462);
    assert(result_vw3.get_column<int>("lhs.col_4")[0] == 22);
    assert(result_vw3.get_column<int>("lhs.col_4")[1] == 0);
    assert(result_vw3.get_column<int>("lhs.col_4")[2] == 0);
    assert(result_vw3.get_column<int>("rhs.col_4")[0] == 122);
    assert(result_vw3.get_column<int>("rhs.col_4")[1] == 0);
    assert(result_vw3.get_column<int>("rhs.col_4")[2] == 0);
    assert(result_vw3.get_column<double>("lhs.col_2")[0] == 8.0);
    assert(result_vw3.get_column<double>("lhs.col_2")[1] == 32.0);
    assert(result_vw3.get_column<double>("lhs.col_2")[2] == 1.89);
    assert(result_vw3.get_column<double>("rhs.col_2")[0] == 8.0);
    assert(result_vw3.get_column<double>("rhs.col_2")[1] == 20.0);
    assert(result_vw3.get_column<double>("rhs.col_2")[2] == 11.89);
    assert(result_vw3.get_column<double>("col_3")[0] == 15.0);
    assert(result_vw3.get_column<double>("col_3")[1] == 19.0);
    assert(std::isnan(result_vw3.get_column<double>("col_3")[2]));
    assert(result_vw3.get_column<double>("xcol_3")[0] == 115.0);
    assert(result_vw3.get_column<double>("xcol_3")[1] == 119.0);
    assert(std::isnan(result_vw3.get_column<double>("xcol_3")[2]));
}

C++ DataFrame