Back to Documentations

Signature Description
// A set of summary statistics describing the strings stored in one column.
// Every member defaults to 0.
//
// NOTE(review): the usage example for get_str_col_stats() asserts
// avg_digits == 0.8 for 10-character strings that contain 8 digits, which
// suggests the per-character-class averages are fractions of all characters
// rather than raw per-string counts -- confirm against the implementation.
//
struct  StringStats  {

    double  avg_size { 0 };        // Average of sizes
    double  std_size { 0 };        // Standard deviation of sizes
    double  avg_alphabets { 0 };   // Average number of a - z, A - Z
    double  avg_caps { 0 };        // Average number of A - Z
    double  avg_digits { 0 };      // Average number of 0 - 9
    double  avg_spaces { 0 };      // Average number of ' '
    double  avg_arithmetic { 0 };  // Average number of + - / *
    double  avg_line_feed { 0 };   // Average number of \n

    // Punctuations. Anything that's not an alphabet, digit, space, line feed,
    // or arithmetic operator.
    //
    double  avg_puncts { 0 };
};
A set of statistics about a string column

Signature Description Parameters
// Returns a StringStats summary for the strings in the named column.
//
// T: Type of the input column. It is constrained by the StringOnly concept.
// col_name: Name of the input column.
//
// The member is declared const.
//
template<StringOnly T>
StringStats
get_str_col_stats(const char *col_name) const;
This call takes a string column as input and returns a set of statistics about the strings in that column. You could use these statistics to figure out the characteristics of the strings in the column, to determine whether a new string is acceptable for this column, and so on.
T: Type of the input column. Based on the concept, it can only be one of these types: std::string, VirtualString, const char *, or char *.
col_name: Name of the input column.
static void test_get_str_col_stats()  {

    std::cout << "\nTesting get_str_col_stats(  ) ..." << std::endl;

    typedef StdDataFrame64<std::string> StrDataFrame;

    StrDataFrame    df;

    try  {
        df.read("data/SHORT_IBM.csv", io_format::csv2);

        auto    str_col = df.get_index();

        // IBM data has no string columns except for its index. Load the
        // index column as a regular column, so we have something to test
        // A sample string in this column is "2020-10-21"
        //
        df.load_column("ISO Dates", std::move(str_col));

        const StringStats   result = df.get_str_col_stats<std::string>("ISO Dates");

        assert(result.avg_size == 10.0);  // Average size of each string is 10 chars
        assert(result.std_size == 0);  // Every string is the same size
        assert(result.avg_alphabets == 0);  // No alphabets
        assert(result.avg_caps == 0);  // No Capital alphabets

        // 80% of all chars in the entire column are digits
        //
        assert(result.avg_digits == 0.8);
        assert(result.avg_spaces == 0);  // No spaces

        // 20% of all chars in the entire column are arithmetic operators
        //
        assert(result.avg_arithmetic == 0.2);
        assert(result.avg_line_feed == 0);  // No new lines
        assert(result.avg_puncts == 0);  // No punctuations
    }
    catch (const DataFrameError &ex)  {
        std::cout << ex.what() << std::endl;
    }
}

C++ DataFrame