Back to Documentations

Signature Description
// Parameters controlling pca_by_eigen(). Dimensionality is reduced either by
// an explicit component count (num_comp_to_keep) or by an information
// percentage (pct_comp_to_keep) when the count is left at 0.
//
struct  PCAParams  {

    // Normalization applied to the raw data before the eigen decomposition.
    //
    normalization_type  norm_type { normalization_type::z_score };

    // If set above zero, the number of top eigenvalues (components) to keep.
    // Takes precedence over pct_comp_to_keep.
    //
    long                num_comp_to_keep { 0 };

    // If num_comp_to_keep is 0, the fraction of total eigenvalue "energy"
    // to keep. 0.9 means 90%.
    //
    double              pct_comp_to_keep { 0.9 };
};
A structure containing the parameters for the pca_by_eigen() call

Signature Description Parameters
template<typename T>
Matrix<T, matrix_orient::column_major>
pca_by_eigen(std::vector<const char *> &&col_names,
             const PCAParams params = { }) const;
This uses Eigenspace evaluation to calculate Principal Component Analysis (PCA). It returns a matrix whose columns are the reduced dimensions with most significant information.

PCA is a dimensionality reduction method that is often used to reduce the dimensionality of large data sets, by transforming a large set of variables into a smaller one that still contains most of the information in the large set.
Reducing the number of variables of a data set naturally comes at the expense of accuracy, but the trick in dimensionality reduction is to trade a little accuracy for simplicity. Because smaller data sets are easier to explore and visualize, and thus make analyzing data points much easier and faster for machine learning algorithms without extraneous variables to process.
T: Type of the named columns
col_names: Vector of column names
params: Parameters necessary for this operation
template<typename T>
std::tuple<Matrix<T, matrix_orient::column_major>, // U
           Matrix<T, matrix_orient::column_major>, // Σ
           Matrix<T, matrix_orient::column_major>> // V
compact_svd(std::vector<const char *> &&col_names,
            normalization_type norm_type =
                normalization_type::z_score) const;
This calculates Singular Value Decomposition (SVD). Optionally it may normalize the original matrix first.
In linear algebra, SVD is a factorization of a real or complex matrix into a rotation, followed by a rescaling followed by another rotation. It generalizes the eigen-decomposition of a square normal matrix with an orthonormal eigenbasis to any m×n matrix.
It returns the 3 matrices U, Σ, and V inside a std::tuple.

U contains the left singular vectors of the original matrix, meaning its columns are orthonormal vectors that span the column space of the matrix.
Σ is a diagonal matrix that contains the singular values of the original matrix (the square roots of the eigenvalues of AᵀA), arranged in descending order.
V contains the right singular vectors of the original matrix, represented as its columns.
Original matrix (A) = U * Σ * Vᵀ
T: Type of the named columns
col_names: Vector of column names
norm_type: Type of normalization applied to raw data first
// Exercises pca_by_eigen() on four highly correlated IBM price columns,
// first with the default params (keep 90% of the information) and then
// with an explicit request for the 3 most significant components.
//
static void test_pca_by_eigen()  {

    std::cout << "\nTesting pca_by_eigen( ) ..." << std::endl;

    StrDataFrame    df;

    try  {
        df.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
    }

    // Approximate-equality helper for the expected reduced-matrix entries.
    //
    const auto  near =
        [](double val, double expected, double eps) -> bool {
            return (std::fabs(val - expected) < eps);
        };

    const auto  reduced =
        df.pca_by_eigen<double>({ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" });

    // Dimensions were reduced to 1 containing at least 90% of the information.
    // This makes sense, since these 4 columns are highly correlated.
    //
    assert(reduced.cols() == 1);
    assert(reduced.rows() == 5031);
    assert(near(reduced(0, 0), 197.063, 0.001));
    assert(near(reduced(1, 0), 200.875, 0.001));
    assert(near(reduced(491, 0), 149.02, 0.01));
    assert(near(reduced(1348, 0), 166.44, 0.01));
    assert(near(reduced(2677, 0), 333.405, 0.001));
    assert(near(reduced(5029, 0), 216.175, 0.001));
    assert(near(reduced(5030, 0), 219.555, 0.001));

    const auto  reduced3 =
        df.pca_by_eigen<double>({ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" },
                                { .num_comp_to_keep = 3 });

    // 3 most significant dimensions are kept.
    // As you can see the first column is unchanged and clearly contains
    // almost all of the information.
    //
    assert(reduced3.cols() == 3);
    assert(reduced3.rows() == 5031);

    assert(near(reduced3(0, 0), 197.063, 0.001));
    assert(near(reduced3(0, 1), -0.0951913, 0.001));
    assert(near(reduced3(0, 2), 1.85473, 0.001));

    assert(near(reduced3(1, 0), 200.875, 0.001));
    assert(near(reduced3(1, 1), -2.08604, 0.001));
    assert(near(reduced3(1, 2), 2.68895, 0.001));

    assert(near(reduced3(491, 0), 149.02, 0.01));
    assert(near(reduced3(491, 1), -1.34957, 0.01));
    assert(near(reduced3(491, 2), 2.09026, 0.01));

    assert(near(reduced3(1348, 0), 166.44, 0.01));
    assert(near(reduced3(1348, 1), 0.0354559, 0.01));
    assert(near(reduced3(1348, 2), 0.41972, 0.01));

    assert(near(reduced3(2677, 0), 333.405, 0.001));
    assert(near(reduced3(2677, 1), -1.33686, 0.001));
    assert(near(reduced3(2677, 2), 2.13684, 0.001));

    assert(near(reduced3(5029, 0), 216.175, 0.001));
    assert(near(reduced3(5029, 1), -1.18141, 0.001));
    assert(near(reduced3(5029, 2), 2.18029, 0.001));

    assert(near(reduced3(5030, 0), 219.555, 0.001));
    assert(near(reduced3(5030, 1), -2.66858, 0.001));
    assert(near(reduced3(5030, 2), 2.85412, 0.001));
}
// ----------------------------------------------------------------------------

// Exercises compact_svd() on four IBM price columns and spot-checks
// the U, Σ, and V factors against known values.
//
static void test_compact_svd()  {

    std::cout << "\nTesting compact_svd( ) ..." << std::endl;

    StrDataFrame    df;

    try  {
        df.read("IBM.csv", io_format::csv2);
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
    }

    // Approximate-equality helper for the expected factor entries.
    //
    const auto  near =
        [](double val, double expected, double eps) -> bool {
            return (std::fabs(val - expected) < eps);
        };

    const auto  [left, sigma, right] =
        df.compact_svd<double>({ "IBM_Close", "IBM_Open", "IBM_High", "IBM_Low" });

    assert(left.rows() == 5031);
    assert(left.cols() == 4);  // Compact version has the same column # as the original matrix
    assert(near(left(0, 0), -0.0115747, 0.000001));
    assert(near(left(2, 3), -0.0110622, 0.000001));
    assert(near(left(4040, 2), -0.0147074, 0.000001));
    assert(near(left(4994, 1), 0.0194639, 0.000001));
    assert(near(left(5030, 3), -0.000878688, 0.000001));

    assert(sigma.rows() == 4);  // In compact version zero rows at the end are omitted
    assert(sigma.cols() == 4);
    assert(near(sigma(0, 0), 141.821, 0.001));
    assert(near(sigma(1, 1), 1.91734, 0.00001));
    assert(near(sigma(2, 2), 1.62214, 0.00001));
    assert(near(sigma(3, 3), 0.73194, 0.00001));
    assert(sigma(0, 2) == 0.0);  // Off-diagonal entries are exactly zero
    assert(sigma(1, 2) == 0.0);
    assert(sigma(3, 0) == 0.0);

    assert(right.rows() == 4);
    assert(right.cols() == 4);
    assert(near(right(0, 0), 0.499988, 0.000001));
    assert(near(right(0, 2), 0.003710, 0.000001));
    assert(near(right(2, 2), 0.700869, 0.000001));
    assert(near(right(3, 1), -0.00079, 0.000001));
    assert(near(right(3, 3), 0.491216, 0.000001));
}

C++ DataFrame