Back to Documentations

Signature Description

// Selects which correlation coefficient a correlation visitor computes.
// The underlying type is unsigned char and the enumerator values start at 1.
enum class  correlation_type : unsigned char  {

    // Pearson correlation:
    //   Ρ = COV(X, Y) / (σ(X) * σ(Y))
    pearson = 1,

    // Non-parametric (rank-based) correlations
    //
    // Spearman correlation:
    //   Ρ = 1 - (6 * ∑(di^2) / (n * (n^2 - 1)))
    //   di = Difference in rank of X member vs. Y member
    spearman = 2,

    // Kendall tau correlation:
    //   Ρ = (C - D) / ((n * (n - 1)) / 2)
    //   C = Concordant pairs, D = Discordant pairs
    kendall_tau = 3,
};
This specifies different correlation types.

Signature Description Parameters
#include <DataFrame/DataFrameStatsVisitors.h>

template<typename T, typename I = unsigned long>
struct CorrVisitor;
        
This functor class calculates the correlation of two given columns.
  explicit
  CorrVisitor(correlation_type t = correlation_type::pearson,
              bool bias = false,
              bool skipnan = false,
              bool stable_algo = false);
        
correlation_type: Correlation type from above.
bias: If true it divides by n - 1, otherwise by n.
skipnan: If true, it skips over NaN values as if they didn't exist.
stable_algo: If true, it uses a version of Kahan summation that is numerically stable for data with very large values. Kahan summation is slower than regular summation, so use it only if your data contains very large values.

There are also the following member functions:
  get_result(): Returns the correlation
  get_mean1(): Returns the mean of the first time-series
  get_mean2(): Returns the mean of the second time-series
        
T: Column data type. T must be an arithmetic-enabled type
I: Index type.
#include <DataFrame/DataFrameStatsVisitors.h>

template<typename T, typename I = unsigned long,
         std::size_t A = 0>
struct CrossCorrVisitor;
        
This functor calculates a series of the above correlations between lagged vectors of the first and second time-series. Lags run from min_lag (included) to max_lag (excluded) in increments of 1. The result is a vector of correlations containing "max_lag – min_lag" values.
If lag value is positive, the second time-series is lagged forward. Otherwise, the first time-series is lagged forward.
  CrossCorrVisitor (long min_lag,
                    long max_lag,
                    correlation_type t = correlation_type::pearson,
                    bool biased = false,
                    bool skip_nan = false,
                    bool stable_algo = false);
        
min_lag: Minimum lag period
max_lag: Maximum lag period
correlation_type: Correlation type from above.
biased: If true it divides by n - 1, otherwise by n.
skip_nan: If true it skips over nan numbers as if they didn't exist.
stable_algo: If true, it uses a version of Kahan summation that is numerically stable for data with very large values. Kahan summation is slower than regular summation, so use it only if your data contains very large values.
T: Column data type. T must be an arithmetic-enabled type
I: Index type.
A: Memory alignment boundary for vectors. Default is system default alignment
    // Example: compute Pearson, Spearman and Kendall-tau correlations between
    // the "dbl_col" and "dbl_col_2" columns of a small hand-built frame.
    MyDataFrame df;

    df.create_column<int>(static_cast<const char *>("col_name"));

    // Source vectors for the columns. Note the lengths differ: intvec, dblvec
    // and strvec hold 5 entries, dblvec2 and ulgvec hold 8.
    StlVecType<int>            intvec = { 1, 2, 3, 4, 5 };
    StlVecType<double>         dblvec = { 1.2345, 2.2345, 3.2345, 4.2345, 5.2345 };
    StlVecType<double>         dblvec2 = { 0.998, 0.3456, 0.056, 0.15678, 0.00345, 0.923, 0.06743, 0.1 };
    StlVecType<std::string>    strvec = { "Col_name", "Col_name", "Col_name", "Col_name", "Col_name" };
    StlVecType<unsigned long>  ulgvec = { 1UL, 2UL, 3UL, 4UL, 5UL, 8UL, 7UL, 6UL };
    // Copy taken before ulgvec is moved from below, so "ul_col" still gets the
    // original values.
    StlVecType<unsigned long>  xulgvec = ulgvec;

    // ulgvec is moved in as the first load_data argument (presumably the
    // index -- confirm against load_data); the remaining arguments are
    // (column name, data) pairs. rc is not used in the lines shown here.
    MyDataFrame::size_type  rc = df.load_data(std::move(ulgvec),
                                              std::make_pair("int_col", intvec),
                                              std::make_pair("dbl_col", dblvec),
                                              std::make_pair("dbl_col_2", dblvec2),
                                              std::make_pair("str_col", strvec),
                                              std::make_pair("ul_col", xulgvec));
    // Presumably adds one more value to each of "str_col" and "dbl_col";
    // 10.56 therefore takes part in the correlations below -- TODO confirm.
    df.append_column<std::string>("str_col", "Additional column");
    df.append_column("dbl_col", 10.56);

    // One visitor per correlation type; the default-constructed visitor uses
    // the constructor's default, correlation_type::pearson.
    CorrVisitor<double> p_corr_visitor;
    CorrVisitor<double> s_corr_visitor(correlation_type::spearman);
    CorrVisitor<double> k_corr_visitor(correlation_type::kendall_tau);

    // Spearman and Kendall-tau are run through single_act_visit and their
    // results are read back from the visitor objects themselves.
    df.single_act_visit<double, double>("dbl_col", "dbl_col_2", s_corr_visitor);
    df.single_act_visit<double, double>("dbl_col", "dbl_col_2", k_corr_visitor);

    // Pearson goes through visit_async instead; the result is read from the
    // object returned by fut.get() rather than from p_corr_visitor directly.
    auto            fut = df.visit_async<double, double>("dbl_col", "dbl_col_2", p_corr_visitor);
    const double    p_corr = fut.get().get_result();

    // Each coefficient is checked against its expected (negative) value with
    // an absolute tolerance of 1e-6.
    assert(fabs(p_corr - -0.358381) < 0.000001);
    assert(fabs(s_corr_visitor.get_result() - -0.380952) < 0.000001);
    assert(fabs(k_corr_visitor.get_result() - -0.285714) < 0.000001);
// ----------------------------------------------------------------------------

static void test_CrossCorrVisitor()  {

    std::cout << "\nTesting CrossCorrVisitor{ } ..." << std::endl;

    typedef StdDataFrame64<std::string> StrDataFrame;

    StrDataFrame    df;

    try  {
        df.read("SHORT_IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
    }

    CrossCorrVisitor<double, std::string>   cc(-16, 16);

    df.single_act_visit<double, double>("IBM_Close", "IBM_Open", cc);

    assert(cc.get_result().size() == 32);
    assert(std::fabs(cc.get_result()[0] - 0.906) < 0.0001);
    assert(std::fabs(cc.get_result()[1] - 0.9117) < 0.0001);
    assert(std::fabs(cc.get_result()[15] - 0.9919) < 0.0001);
    assert(std::fabs(cc.get_result()[16] - 0.9971) < 0.0001);
    assert(std::fabs(cc.get_result()[30] - 0.9239) < 0.0001);
    assert(std::fabs(cc.get_result()[31] - 0.9179) < 0.0001);
}

C++ DataFrame