Back to Documentations

Signature Description Parameters
#include <DataFrame/DataFrameStatsVisitors.h>

template<std::floating_point T, typename I = unsigned long,
         typename L = std::string, std::size_t A = 0>
struct  DivideToBinsVisitor;

// -------------------------------------

template<std::floating_point T, typename I = unsigned long,
         typename L = std::string, std::size_t A = 0>
using cut_v = DivideToBinsVisitor<T, I, L, A>;
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This visitor categorizes the given column, akin to the Pandas cut() function. It is used to divide or group numerical data into different categories (called bins). This is helpful when we have a list of numbers and want to separate them into meaningful groups.

There are two ways you can configure this visitor through the two constructors:
explicit
DivideToBinsVisitor(size_type bins,
                    label_vec_t &&labels = {  },
                    bool right = true,
                    bool include_lowest = false);
         
In this constructor, the user provides the number of bins and, optionally, a set of labels. If labels are provided, their number must equal the number of bins. The bins are equally spaced in this case.

explicit
DivideToBinsVisitor(std::vector<value_type> &&edges,
                    label_vec_t &&labels = {  },
                    bool right = true,
                    bool include_lowest = false);
         
In this constructor, the user provides n edges, which divide the given column into n – 1 bins. The user can also optionally provide a set of labels; the number of labels must be equal to n – 1. In this case, the bins can have varying widths.

right: Indicates whether bins include the rightmost edge or not.
include_lowest: Whether the first interval should be left-inclusive or not.

This visitor also defines the following types:
using label_type = L;
using pair_t = std::pair<T, T>;
using result_type =
    std::vector<pair_t, typename allocator_declare<pair_t, A>::type>;
using label_vec_t =
    std::vector<label_type, typename allocator_declare<label_type, A>::type>;

This visitor defines the following functions to return the results:
get_result(): Returns a vector of std::pairs. It contains the same number of datapoints as the input column. It contains the edges of bins for each datapoint in the input column.
get_labels(): Returns the optional, user-provided labels assigned to each datapoint of the input column. It contains the same number of datapoints as the input column.
T: Column data type
I: Index type
L: Type of user's optionally provided labels
A: Memory alignment boundary for vectors. Default is system default alignment
#include <DataFrame/DataFrameStatsVisitors.h>

template<std::floating_point T, typename I = unsigned long,
         typename L = std::string, std::size_t A = 0>
struct  DivideToQuantilesVisitor;

// -------------------------------------

template<std::floating_point T, typename I = unsigned long,
         typename L = std::string, std::size_t A = 0>
using qcut_v = DivideToQuantilesVisitor<T, I, L, A>;
This is a "single action visitor", meaning it is passed the whole data vector in one call and you must use the single_act_visit() interface.

This is similar to DivideToBinsVisitor, but it uses quantiles to discretize and label the data. This is akin to the Pandas qcut() function.

There are two ways you can configure this visitor through the two constructors:
explicit
DivideToQuantilesVisitor(size_type quantiles,
                         label_vec_t &&labels = {  });
         
In this constructor, the user provides the number of quantiles and, optionally, a set of labels. If labels are provided, their number must equal the number of quantiles.

explicit
DivideToQuantilesVisitor(std::vector<value_type> &&quantiles,
                         label_vec_t &&labels = {  });
         
In this constructor, the user provides n quantiles, which divide the given column into n – 1 bins. The user can also optionally provide a set of labels; the number of labels must be equal to n – 1. The list of quantiles must start with 0 and end with 1.

This visitor also defines the following types:
using label_type = L;
using pair_t = std::pair<T, T>;
using result_type =
    std::vector<pair_t, typename allocator_declare<pair_t, A>::type>;
using label_vec_t =
    std::vector<label_type, typename allocator_declare<label_type, A>::type>;

This visitor defines the following functions to return the results:
get_result(): Returns a vector of std::pairs. It contains the same number of datapoints as the input column. It contains the edges of bins for each datapoint in the input column.
get_labels(): Returns the optional, user-provided labels assigned to each datapoint of the input column. It contains the same number of datapoints as the input column.
T: Column data type
I: Index type
L: Type of user's optionally provided labels
A: Memory alignment boundary for vectors. Default is system default alignment
static void test_DivideToBinsVisitor()  {

    std::cout << "\nTesting DivideToBinsVisitor{ } ..." << std::endl;

    StrDataFrame    ibm;

    try  {
        ibm.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    cut_v<double, std::string>  cut { 4, { "Low Bin", "Mid Low Bin", "Mid High Bin", "High Bin" } };

    ibm.single_act_visit<double>("IBM_Close", cut);
    ibm.load_column("String Labels", std::move(cut.get_labels()));
    ibm.load_column("Edges", std::move(cut.get_result()));

    // ibm.write<std::ostream, double, long, std::string, std::pair<double, double>>(
    //      std::cout, io_format::pretty_prt, { .precision = 4 });

    const auto  &close_col = ibm.get_column<double>("IBM_Close");
    const auto  &lables_col = ibm.get_column<std::string>("String Labels");
    const auto  &edges_col = ibm.get_column<std::pair<double, double>>("Edges");

    assert((std::fabs(close_col[0] - 98.5625) < 0.0001));
    assert((lables_col[0] == "Mid Low Bin"));
    assert((edges_col[0] == std::pair<double, double>{ 95, 135 }));

    assert((std::fabs(close_col[7] - 93.0) < 0.01));
    assert((lables_col[7] == "Low Bin"));
    assert((edges_col[7] == std::pair<double, double>{ 55, 95 }));

    assert((std::fabs(close_col[463] - 74.2) < 0.1));
    assert((lables_col[463] == "Low Bin"));
    assert((edges_col[463] == std::pair<double, double>{ 55, 95 }));

    assert((std::fabs(close_col[1570] - 100.38) < 0.01));
    assert((lables_col[1570] == "Mid Low Bin"));
    assert((edges_col[1570] == std::pair<double, double>{ 95, 135 }));

    assert((std::fabs(close_col[2110] - 94.15) < 0.01));
    assert((lables_col[2110] == "Low Bin"));
    assert((edges_col[2110] == std::pair<double, double>{ 55, 95 }));

    assert((std::fabs(close_col[2574] - 159.21) < 0.01));
    assert((lables_col[2574] == "Mid High Bin"));
    assert((edges_col[2574] == std::pair<double, double>{ 135, 175 }));

    assert((std::fabs(close_col[4086] - 174.29) < 0.01));
    assert((lables_col[4086] == "Mid High Bin"));
    assert((edges_col[4086] == std::pair<double, double>{ 135, 175 }));

    assert((std::fabs(close_col[4107] - 180.05) < 0.01));
    assert((lables_col[4107] == "High Bin"));
    assert((edges_col[4107] == std::pair<double, double>{ 175, 215 }));

    assert((std::fabs(close_col[5030] - 111.66) < 0.01));
    assert((lables_col[5030] == "Mid Low Bin"));
    assert((edges_col[5030] == std::pair<double, double>{ 95, 135 }));

    cut_v<double, std::string>  cut2 { { 50, 120, 175, 300 }, { "Low Bin", "Mid Bin", "High Bin" } };

    ibm.single_act_visit<double>("IBM_Close", cut2);
    ibm.load_column("String Labels 2", std::move(cut2.get_labels()));
    ibm.load_column("Edges 2", std::move(cut2.get_result()));

    const auto  &lables_col2 = ibm.get_column<std::string>("String Labels 2");
    const auto  &edges_col2 = ibm.get_column<std::pair<double, double>>("Edges 2");

    assert((std::fabs(close_col[0] - 98.5625) < 0.0001));
    assert((lables_col2[0] == "Low Bin"));
    assert((edges_col2[0] == std::pair<double, double>{ 50, 120 }));

    assert((std::fabs(close_col[7] - 93.0) < 0.01));
    assert((lables_col2[7] == "Low Bin"));
    assert((edges_col2[7] == std::pair<double, double>{ 50, 120 }));

    assert((std::fabs(close_col[463] - 74.2) < 0.1));
    assert((lables_col2[463] == "Low Bin"));
    assert((edges_col2[463] == std::pair<double, double>{ 50, 120 }));

    assert((std::fabs(close_col[1570] - 100.38) < 0.01));
    assert((lables_col2[1570] == "Low Bin"));
    assert((edges_col2[1570] == std::pair<double, double>{ 50, 120 }));

    assert((std::fabs(close_col[2110] - 94.15) < 0.01));
    assert((lables_col2[2110] == "Low Bin"));
    assert((edges_col2[2110] == std::pair<double, double>{ 50, 120 }));

    assert((std::fabs(close_col[2574] - 159.21) < 0.01));
    assert((lables_col2[2574] == "Mid Bin"));
    assert((edges_col2[2574] == std::pair<double, double>{ 120, 175 }));

    assert((std::fabs(close_col[4086] - 174.29) < 0.01));
    assert((lables_col2[4086] == "Mid Bin"));
    assert((edges_col2[4086] == std::pair<double, double>{ 120, 175 }));

    assert((std::fabs(close_col[4107] - 180.05) < 0.01));
    assert((lables_col2[4107] == "High Bin"));
    assert((edges_col2[4107] == std::pair<double, double>{ 175, 300 }));

    assert((std::fabs(close_col[5030] - 111.66) < 0.01));
    assert((lables_col2[5030] == "Low Bin"));
    assert((edges_col2[5030] == std::pair<double, double>{ 50, 120 }));
}
// ----------------------------------------------------------------------------

static void test_DivideToQuantilesVisitor()  {

    std::cout << "\nTesting DivideToQuantilesVisitor{ } ..." << std::endl;

    StrDataFrame    ibm;

    try  {
        ibm.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
        ::exit(-1);
    }

    qcut_v<double, std::string> qcut { 4, { "Low Bin", "Mid Low Bin", "Mid High Bin", "High Bin" } };

    ibm.single_act_visit<double>("IBM_Close", qcut);
    ibm.load_column("String Labels", std::move(qcut.get_labels()));
    ibm.load_column("Edges", std::move(qcut.get_result()));

    // ibm.write<std::ostream, double, long, std::string, std::pair<double, double>>(
    //      std::cout, io_format::pretty_prt, { .precision = 4 });

    const auto  &close_col = ibm.get_column<double>("IBM_Close");
    const auto  &lables_col = ibm.get_column<std::string>("String Labels");
    const auto  &edges_col = ibm.get_column<std::pair<double, double>>("Edges");

    assert((std::fabs(close_col[0] - 98.5625) < 0.0001));
    assert((lables_col[0] == "Mid Low Bin"));
    assert((edges_col[0] == std::pair<double, double>{ 94.1899985, 126.470001 }));

    assert((std::fabs(close_col[7] - 93.0) < 0.01));
    assert((lables_col[7] == "Low Bin"));
    assert((edges_col[7] == std::pair<double, double>{ 55.07, 94.1899985 }));

    assert((std::fabs(close_col[463] - 74.2) < 0.1));
    assert((lables_col[463] == "Low Bin"));
    assert((edges_col[463] == std::pair<double, double>{ 55.07, 94.1899985 }));

    assert((std::fabs(close_col[1570] - 100.38) < 0.01));
    assert((lables_col[1570] == "Mid Low Bin"));
    assert((edges_col[1570] == std::pair<double, double>{ 94.1899985, 126.470001 }));

    assert((std::fabs(close_col[2110] - 94.15) < 0.01));
    assert((lables_col[2110] == "Low Bin"));
    assert((edges_col[2110] == std::pair<double, double>{ 55.07, 94.1899985 }));

    assert((std::fabs(close_col[2574] - 159.21) < 0.01));
    assert((lables_col[2574] == "Mid High Bin"));
    assert((edges_col[2574] == std::pair<double, double>{ 126.470001, 159.529999 }));

    assert((std::fabs(close_col[4086] - 174.29) < 0.01));
    assert((lables_col[4086] == "High Bin"));
    assert((edges_col[4086] == std::pair<double, double>{ 159.529999, 215.800003 }));

    assert((std::fabs(close_col[4107] - 180.05) < 0.01));
    assert((lables_col[4107] == "High Bin"));
    assert((edges_col[4107] == std::pair<double, double>{ 159.529999, 215.800003 }));

    assert((std::fabs(close_col[5030] - 111.66) < 0.01));
    assert((lables_col[5030] == "Mid Low Bin"));
    assert((edges_col[5030] == std::pair<double, double>{ 94.1899985, 126.470001 }));

    qcut_v<double, std::string> qcut2 { { 0, 0.66, 1 }, { "Low Bin", "High Bin" } };

    ibm.single_act_visit<double>("IBM_Close", qcut2);
    ibm.load_column("String Labels 2", std::move(qcut2.get_labels()));
    ibm.load_column("Edges 2", std::move(qcut2.get_result()));

    const auto  &lables_col2 = ibm.get_column<std::string>("String Labels 2");
    const auto  &edges_col2 = ibm.get_column<std::pair<double, double>>("Edges 2");

    assert((std::fabs(close_col[0] - 98.5625) < 0.0001));
    assert((lables_col2[0] == "Low Bin"));
    assert((edges_col2[0] == std::pair<double, double>{ 55.07, 146.589996 }));

    assert((std::fabs(close_col[7] - 93.0) < 0.01));
    assert((lables_col2[7] == "Low Bin"));
    assert((edges_col2[7] == std::pair<double, double>{ 55.07, 146.589996 }));

    assert((std::fabs(close_col[463] - 74.2) < 0.1));
    assert((lables_col2[463] == "Low Bin"));
    assert((edges_col2[463] == std::pair<double, double>{ 55.07, 146.589996 }));

    assert((std::fabs(close_col[1570] - 100.38) < 0.01));
    assert((lables_col2[1570] == "Low Bin"));
    assert((edges_col2[1570] == std::pair<double, double>{ 55.07, 146.589996 }));

    assert((std::fabs(close_col[2110] - 94.15) < 0.01));
    assert((lables_col2[2110] == "Low Bin"));
    assert((edges_col2[2110] == std::pair<double, double>{ 55.07, 146.589996 }));

    assert((std::fabs(close_col[2574] - 159.21) < 0.01));
    assert((lables_col2[2574] == "High Bin"));
    assert((edges_col2[2574] == std::pair<double, double>{ 146.589996, 215.800003 }));

    assert((std::fabs(close_col[4086] - 174.29) < 0.01));
    assert((lables_col2[4086] == "High Bin"));
    assert((edges_col2[4086] == std::pair<double, double>{ 146.589996, 215.800003 }));

    assert((std::fabs(close_col[4107] - 180.05) < 0.01));
    assert((lables_col2[4107] == "High Bin"));
    assert((edges_col2[4107] == std::pair<double, double>{ 146.589996, 215.800003 }));

    assert((std::fabs(close_col[5030] - 111.66) < 0.01));
    assert((lables_col2[5030] == "Low Bin"));
    assert((edges_col2[5030] == std::pair<double, double>{ 55.07, 146.589996 }));
}

C++ DataFrame