Signature	Description
enum class correlation_type : unsigned char { pearson = 1, spearman = 2, kendall_tau = 3 };	This specifies different correlation types. Pearson: $\rho = \frac{\operatorname{COV}(X, Y)}{\sigma(X)\,\sigma(Y)}$. Spearman and Kendall tau are non-parametric (rank-based) correlations. Spearman: $\rho = 1 - \frac{6 \sum d_i^2}{n\,(n^2 - 1)}$, where $d_i$ is the difference in rank of the X member vs. the Y member. Kendall tau: $\rho = \frac{C - D}{n\,(n - 1) / 2}$, where $C$ is the number of concordant pairs and $D$ is the number of discordant pairs.

Signature

Description

// Selects which correlation coefficient a correlation visitor calculates.
// Each formula is written out above the enumerator it belongs to.
//
enum class  correlation_type : unsigned char  {

    // Pearson correlation
    //
    //       COV(X, Y)
    // Ρ = -------------
    //      σ(X) * σ(Y)
    //
    pearson = 1,

    // ----- Non-parametric (rank-based) correlations -----

    // Spearman correlation
    //
    // d_i = Difference in rank of X member vs. Y member
    //
    //           6 * ∑d_i²
    // Ρ = 1 - --------------
    //          n * (n² - 1)
    //
    spearman = 2,

    // Kendall tau correlation
    //
    // C = Concordant pairs,  D = Discordant pairs
    //
    //           C - D
    // Ρ = --------------------
    //      (n * (n - 1)) / 2
    //
    kendall_tau = 3,
};

This specifies different correlation types.

Signature	Description	Parameters
#include <DataFrame/DataFrameStatsVisitors.h> template<typename T, typename I = unsigned long> struct CorrVisitor;	This functor class calculates the correlation of two given columns. This works with both scalar and multidimensional (i.e. vectors or arrays) datasets. For multidimensional datasets, you must use the single_act_visit() interface. NOTE: Multidimensional datasets only work with the Pearson correlation type. explicit CorrVisitor(correlation_type t = correlation_type::pearson, bool bias = false, bool skipnan = false, bool stable_algo = false); correlation_type: Correlation type from above. bias: If true it divides by n - 1, otherwise by n. skipnan: If true it skips over nan numbers as if they didn't exist. stable_algo: If true, it uses a version of Kahan summation that is numerically stable for data with very large values. Kahan summation is slower than regular summation, so only use it if your data contains very large values. There are also the following member functions: get_result(): Returns the correlation. In case of a scalar dataset, the correlation is a single number. In case of a multidimensional dataset, the result is a vector of component-wise (dimension-wise) correlations. get_data_mean1(): Returns the mean of the first time-series. In case of a scalar dataset, the mean is a single number. In case of a multidimensional dataset, the mean is a vector whose size is the data dimension. get_data_mean2(): Returns the mean of the second time-series. In case of a scalar dataset, the mean is a single number. In case of a multidimensional dataset, the mean is a vector whose size is the data dimension.	T: Column data type. T must be an arithmetic-enabled type. I: Index type.
#include <DataFrame/DataFrameStatsVisitors.h> template<typename T, typename I = unsigned long, std::size_t A = 0> struct CrossCorrVisitor;	This functor calculates a series of the above correlations between lagged vectors of the first and second time-series. Lags run from min_lag (included) to max_lag (excluded), incremented by 1. The result is a vector of correlations containing "max_lag – min_lag" values. If the lag value is positive, the second time-series is lagged forward. Otherwise, the first time-series is lagged forward. CrossCorrVisitor (long min_lag, long max_lag, correlation_type t = correlation_type::pearson, bool biased = false, bool skip_nan = false, bool stable_algo = false); min_lag: Minimum lag period. max_lag: Maximum lag period. correlation_type: Correlation type from above. biased: If true it divides by n - 1, otherwise by n. skip_nan: If true it skips over nan numbers as if they didn't exist. stable_algo: If true, it uses a version of Kahan summation that is numerically stable for data with very large values. Kahan summation is slower than regular summation, so only use it if your data contains very large values.	T: Column data type. T must be an arithmetic-enabled type. I: Index type. A: Memory alignment boundary for vectors. Default is system default alignment.

    // Example: calculate the Pearson, Spearman and Kendall tau correlations
    // between the columns "dbl_col" and "dbl_col_2".
    // NOTE(review): this is an excerpt; the enclosing function header and
    // its closing brace are not part of this listing.
    //
    MyDataFrame df;

    df.create_column<int>(static_cast<const char *>("col_name"));

    // Raw data for the columns loaded below
    //
    StlVecType<int>            intvec = { 1, 2, 3, 4, 5 };
    StlVecType<double>         dblvec = { 1.2345, 2.2345, 3.2345, 4.2345, 5.2345 };
    StlVecType<double>         dblvec2 = { 0.998, 0.3456, 0.056, 0.15678, 0.00345, 0.923, 0.06743, 0.1 };
    StlVecType<std::string>    strvec = { "Col_name", "Col_name", "Col_name", "Col_name", "Col_name" };
    StlVecType<unsigned long>  ulgvec = { 1UL, 2UL, 3UL, 4UL, 5UL, 8UL, 7UL, 6UL };
    // Copy taken here because ulgvec itself is moved into load_data() below
    //
    StlVecType<unsigned long>  xulgvec = ulgvec;

    // ulgvec is moved in as the first argument (presumably the index --
    // confirm against load_data()); every other argument is a
    // (column name, data) pair. rc is not used again in this excerpt.
    //
    MyDataFrame::size_type  rc = df.load_data(std::move(ulgvec),
                                              std::make_pair("int_col", intvec),
                                              std::make_pair("dbl_col", dblvec),
                                              std::make_pair("dbl_col_2", dblvec2),
                                              std::make_pair("str_col", strvec),
                                              std::make_pair("ul_col", xulgvec));
    df.append_column<std::string>("str_col", "Additional column");
    df.append_column("dbl_col", 10.56);

    // One visitor per correlation type. The default-constructed visitor
    // relies on the constructor's default type argument
    // (correlation_type::pearson per the documented signature).
    //
    CorrVisitor<double> p_corr_visitor;
    CorrVisitor<double> s_corr_visitor(correlation_type::spearman);
    CorrVisitor<double> k_corr_visitor(correlation_type::kendall_tau);

    // The Spearman and Kendall tau visitors are run directly ...
    //
    df.single_act_visit<double, double>("dbl_col", "dbl_col_2", s_corr_visitor);
    df.single_act_visit<double, double>("dbl_col", "dbl_col_2", k_corr_visitor);

    // ... while the Pearson visitor goes through visit_async(); the
    // correlation is read from the visitor obtained via fut.get()
    //
    auto            fut = df.visit_async<double, double>("dbl_col", "dbl_col_2", p_corr_visitor);
    const double    p_corr = fut.get().get_result();

    // Expected coefficients for this data, compared with a 1e-6 tolerance
    //
    assert(fabs(p_corr - -0.358381) < 0.000001);
    assert(fabs(s_corr_visitor.get_result() - -0.380952) < 0.000001);
    assert(fabs(k_corr_visitor.get_result() - -0.285714) < 0.000001);

// ----------------------------------------------------------------------------

static void test_CrossCorrVisitor()  {

    std::cout << "\nTesting CrossCorrVisitor{ } ..." << std::endl;

    typedef StdDataFrame64<std::string> StrDataFrame;

    StrDataFrame    df;

    try  {
        df.read("SHORT_IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
    }

    CrossCorrVisitor<double, std::string>   cc(-16, 16);

    df.single_act_visit<double, double>("IBM_Close", "IBM_Open", cc);

    assert(cc.get_result().size() == 32);
    assert(std::fabs(cc.get_result()[0] - 0.906) < 0.0001);
    assert(std::fabs(cc.get_result()[1] - 0.9117) < 0.0001);
    assert(std::fabs(cc.get_result()[15] - 0.9919) < 0.0001);
    assert(std::fabs(cc.get_result()[16] - 0.9971) < 0.0001);
    assert(std::fabs(cc.get_result()[30] - 0.9239) < 0.0001);
    assert(std::fabs(cc.get_result()[31] - 0.9179) < 0.0001);
}

// ----------------------------------------------------------------------------

void test_md_stats()  {

    std::cout << "\nTesting get_md_stats( ) ..." << std::endl;

    constexpr std::size_t   item_cnt = 1024;
    ULDataFrame             df;

    df.load_index(ULDataFrame::gen_sequence_index(0, item_cnt, 1));

    RandGenParams<double>   p;

    p.seed = 123;
    p.min_value = 0.5;
    p.max_value = 2.0;

    constexpr std::size_t   dim { 3 };

    using ary_col_t = std::array<double, dim>;
    using vec_col_t = std::vector<double>;

    // Generate and load 3 random columns
    //
    auto    rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * dim, p);

    std::vector<ary_col_t>  array_col(df.get_index().size());
    std::vector<ary_col_t>  array_col2(df.get_index().size());
    std::vector<vec_col_t>  vector_col(df.get_index().size());

    for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i)  {
        vector_col[i].resize(dim);
        for (std::size_t d { 0 }; d < dim; ++d)
            array_col[i][d] = vector_col[i][d] = rand_vec[j++];
    }
    df.load_column<ary_col_t>("array_col", std::move(array_col));
    df.load_column<vec_col_t>("vector_col", std::move(vector_col));

    p.seed = 1024;
    rand_vec = gen_uniform_real_dist<double>(df.get_index().size() * dim, p);
    for (std::size_t i { 0 }, j { 0 }; j < rand_vec.size(); ++i)  {
        for (std::size_t d { 0 }; d < dim; ++d)
            array_col2[i][d] = rand_vec[j++];
    }
    df.load_column<ary_col_t>("array_col2", std::move(array_col2));

    // Pearson Correlation
    //
    CorrVisitor<ary_col_t>  md_corr;

    df.single_act_visit<ary_col_t, ary_col_t>("array_col", "array_col2", md_corr);
    assert(md_corr.get_result().size() == dim);
    assert(md_corr.get_data_mean1().size() == dim);
    assert(md_corr.get_data_mean2().size() == dim);

    assert(std::fabs(md_corr.get_result()[0] - 0.00982) < 0.00001);
    assert(std::fabs(md_corr.get_result()[2] - 0.03198) < 0.00001);

    assert(std::fabs(md_corr.get_data_mean1()[0] - 1.25062) < 0.00001);
    assert(std::fabs(md_corr.get_data_mean1()[2] - 1.24829) < 0.00001);

    assert(std::fabs(md_corr.get_data_mean2()[0] - 1.24407) < 0.00001);
    assert(std::fabs(md_corr.get_data_mean2()[2] - 1.25122) < 0.00001);

    df.single_act_visit<ary_col_t, ary_col_t>("array_col", "array_col", md_corr);
    assert(std::fabs(md_corr.get_result()[0] - 1.0) < 0.00001);
    assert(std::fabs(md_corr.get_result()[2] - 1.0) < 0.00001);

    assert(std::fabs(md_corr.get_data_mean1()[0] - 1.25062) < 0.00001);
    assert(std::fabs(md_corr.get_data_mean1()[2] - 1.24829) < 0.00001);

    assert(std::fabs(md_corr.get_data_mean2()[0] - 1.25062) < 0.00001);
    assert(std::fabs(md_corr.get_data_mean2()[2] - 1.24829) < 0.00001);
}