DataFrameGroupBy#

class pandas::DataFrameGroupBy#

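Groups the rows of a DataFrame by one or more key columns and exposes per-group selection, statistics, aggregation, and transform operations (the split-apply-combine pattern).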

Example#

#include <pandas/pandas.h>
using namespace pandas;

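// Group an existing DataFrame `df` by its "category" column
DataFrameGroupBy grouped(df, "category");
// Per-group aggregations return a DataFrame
DataFrame totals = grouped.sum();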

Constructors#

Signature

Location

Example

DataFrameGroupBy(const DataFrame& df, const std::vector<std::string>& by, bool as_index = true, bool sort = true, bool dropna = true, bool observed = true, bool group_keys = true)

pd_groupby.h:100

DataFrameGroupBy(const DataFrame& df, const std::string& by, bool as_index = true, bool sort = true, bool dropna = true, bool observed = true, bool group_keys = true)

pd_groupby.h:111

Indexing / Selection#

Signature

Return Type

Location

Example

DataFrame first() const

DataFrame

pd_groupby.h:301

View

std::optional<std::string> first_by_index_name_() const

std::optional<std::string>

pd_groupby.h:90

DataFrame get_group(const std::string& key) const

DataFrame

pd_groupby.h:323

View

DataFrame get_group(const std::string& key, const std::set<std::string>& exclude_cols) const

DataFrame

pd_groupby.h:331

View

std::vector<std::string> get_numeric_value_columns() const

std::vector<std::string>

pd_groupby.h:447

View

std::vector<std::string> get_value_columns(const std::string& agg_name = "") const

std::vector<std::string>

pd_groupby.h:453

DataFrame head(int n = 5) const

DataFrame

pd_groupby.h:313

View

DataFrame idxmax(bool numeric_only = false) const

DataFrame

pd_groupby.h:465

View

DataFrame idxmin(bool numeric_only = false) const

DataFrame

pd_groupby.h:466

View

DataFrame idxmin_with_dtype(bool numeric_only = false) const

DataFrame

pd_groupby.h:263

View

DataFrame last() const

DataFrame

pd_groupby.h:304

View

DataFrame tail(int n = 5) const

DataFrame

pd_groupby.h:316

View

Data Manipulation#

Signature

Return Type

Location

Example

bool dropna() const

bool

pd_groupby.h:407

View

Statistics#

Signature

Return Type

Location

Example

DataFrame count() const

DataFrame

pd_groupby.h:166

View

DataFrame describe() const

DataFrame

pd_groupby.h:171

View

DataFrame max(bool numeric_only = false) const

DataFrame

pd_groupby.h:163

View

DataFrame mean(bool numeric_only = false) const

DataFrame

pd_groupby.h:161

View

DataFrame median(bool numeric_only = false) const

DataFrame

pd_groupby.h:167

View

DataFrame min(bool numeric_only = false) const

DataFrame

pd_groupby.h:162

View

DataFrame nunique(bool dropna = true) const

DataFrame

pd_groupby.h:170

View

DataFrame prod(bool numeric_only = false) const

DataFrame

pd_groupby.h:168

View

DataFrame sem(int ddof = 1, bool numeric_only = false) const

DataFrame

pd_groupby.h:169

View

DataFrame std_(int ddof = 1, bool numeric_only = false) const

DataFrame

pd_groupby.h:164

View

DataFrame sum(bool numeric_only = false) const

DataFrame

pd_groupby.h:160

View

DataFrame var(int ddof = 1, bool numeric_only = false) const

DataFrame

pd_groupby.h:165

View

Aggregation#

Signature

Return Type

Location

Example

DataFrame agg(const std::string& func_name) const

DataFrame

pd_groupby.h:177

View

DataFrame agg(const std::vector<std::string>& funcs) const

DataFrame

pd_groupby.h:183

View

DataFrame agg(const std::vector<std::pair<std::string, std::vector<std::string>>>& col_funcs) const

DataFrame

pd_groupby.h:193

View

DataFrame agg(const std::map<std::string, std::string>& col_func_map) const

DataFrame

pd_groupby.h:204

View

DataFrame agg(std::initializer_list<std::pair<std::string, std::vector<std::string>>> col_funcs_init) const

DataFrame

pd_groupby.h:234

View

PANDASCORE_API Result agg(const FuncArg& func) const

Result

pd_groupby.h:352

View

DataFrame agg_callable_with_dtype( const std::function<pandas::ApplyCellResult( const pandas::Series<numpy::float64>&)>& cb) const

DataFrame

pd_groupby.h:257

View

DataFrame agg_impl( const std::vector<std::pair<std::string, std::vector<std::string>>>& col_funcs, bool list_form) const

DataFrame

pd_groupby.h:500

DataFrame agg_named(const std::vector<NamedAggSpec>& specs) const

DataFrame

pd_groupby.h:339

View

DataFrame agg_with_dtype(const std::string& how) const

DataFrame

pd_groupby.h:248

View

DataFrame agg_with_dtype_list(const std::vector<std::string>& funcs) const

DataFrame

pd_groupby.h:252

View

std::vector<double> aggregate_column(size_t col_idx, const std::string& func) const

std::vector<double>

pd_groupby.h:621

DataFrame apply(std::function<DataFrame(const DataFrame&)> fn, bool include_groups = true) const

DataFrame

pd_groupby.h:282

View

Series<numpy::float64> apply_collect_scalar_results( const std::vector<std::string>& keys, const std::vector<double>& values) const

Series<numpy::float64>

pd_groupby.h:526

View

Series<std::string> apply_collect_scalar_string_results( const std::vector<std::string>& keys, const std::vector<std::string>& values) const

Series<std::string>

pd_groupby.h:536

DataFrame apply_collect_series_results( const std::vector<std::string>& keys, const std::vector<std::string>& col_names, const std::map<std::string, std::vector<double>>& num_cols, const std::map<std::string, std::vector<std::string>>& str_cols, const std::string& columns_axis_name = "") const

DataFrame

pd_groupby.h:549

View

DataFrame apply_concat_dataframe_results( const std::vector<std::string>& keys, const std::vector<DataFrame>& dfs, bool use_group_keys) const

DataFrame

pd_groupby.h:563

View

void apply_int_dtype_if_needed(DataFrame& result, const std::string& result_col, const std::string& source_col, const std::string& func) const

void

pd_groupby.h:636

DataFrameGroupByResampler resample(const std::string& rule, const std::string& closed = "left", const std::string& label = "left") const

DataFrameGroupByResampler

pd_groupby.h:512

View

DataFrame transform_apply_numeric( std::function<std::vector<double>(const std::string&, const Series<numpy::float64>&)> fn) const

DataFrame

pd_groupby.h:473

DataFrame transform_concat_results( const std::map<std::string, std::vector<double>>& col_data, const std::vector<std::string>& value_cols) const

DataFrame

pd_groupby.h:584

DataFrame transform_named(const std::string& func_name) const

DataFrame

pd_groupby.h:593

View

Reshaping#

Signature

Return Type

Location

Example

squeeze_result(DataFrame& result) const

pd_groupby.h:441

View

Other Methods#

Signature

Return Type

Location

Example

bool as_index() const

bool

pd_groupby.h:398

void build_groups()

void

pd_groupby.h:617

std::vector<std::string> by_column_dtypes() const

std::vector<std::string>

pd_groupby.h:388

const std::vector<std::string>& by_columns() const

const std::vector<std::string>&

pd_groupby.h:385

std::vector<std::pair<std::string, std::vector<std::string>>> col_funcs( col_funcs_init.begin(), col_funcs_init.end())

std::vector<std::pair<std::string, std::vector<std::string>>>

pd_groupby.h:235

DataFrameGroupByColumn<T> column(const std::string& col_name) const

DataFrameGroupByColumn<T>

pd_groupby.h:292

View

static double compute_agg(const std::vector<double>& values, const std::string& func, int ddof = 1)

double

pd_groupby.h:624

View

const DataFrame& dataframe() const

const DataFrame&

pd_groupby.h:382

View

DataFrame filter(std::function<bool(const DataFrame&)> predicate) const

DataFrame

pd_groupby.h:274

View

DataFrame filter_by_group_mask( const std::map<std::string, bool>& group_mask, bool use_dropna = true) const

DataFrame

pd_groupby.h:574

View

bool group_keys() const

bool

pd_groupby.h:404

const std::vector<std::string>& group_keys_order() const

const std::vector<std::string>&

pd_groupby.h:377

View

const std::unordered_map<std::string, std::vector<size_t>>& groups() const

const std::unordered_map<std::string, std::vector<size_t>>&

pd_groupby.h:372

View

DataFrame idx_extreme_impl_(int which, bool numeric_only) const

DataFrame

pd_groupby.h:492

bool list_selected() const

bool

pd_groupby.h:413

View

std::string make_group_key(size_t row_idx) const

std::string

pd_groupby.h:618

Series<int64_t> ngroup(bool ascending = true) const

Series<int64_t>

pd_groupby.h:359

size_t ngroups() const { return group_keys_order_.size(); }

size_t

pd_groupby.h:369

View

DataFrame nth(int n) const

DataFrame

pd_groupby.h:310

View

DataFrame nth(const std::vector<int>& positions, const std::string& dropna_mode = "") const

DataFrame

pd_groupby.h:613

View

DataFrame nth_by_resolved_slices( const std::vector<std::vector<ResolvedSlice>>& per_group_slices) const

DataFrame

pd_groupby.h:488

void rebuild_groups_with_empty_seeds(std::vector<std::string> keys)

void

pd_groupby.h:151

DataFrameGroupBy select(const std::vector<std::string>& columns) const

DataFrameGroupBy

pd_groupby.h:421

View

DataFrameGroupBy select_as_list(const std::vector<std::string>& columns) const

DataFrameGroupBy

pd_groupby.h:429

View

DataFrame select_rows_by_indices( const std::vector<size_t>& row_indices, const std::vector<std::string>& columns = {}, bool exclude_internal = false) const

DataFrame

pd_groupby.h:602

View

const std::vector<std::string>& selected_columns() const

const std::vector<std::string>&

pd_groupby.h:410

View

void set_extra_empty_keys(std::vector<std::string> keys)

void

pd_groupby.h:141

void set_owned_df(std::shared_ptr<DataFrame> df)

void

pd_groupby.h:123

void set_result_index(DataFrame& result) const

void

pd_groupby.h:627

void set_synthetic_freq_key(bool value)

void

pd_groupby.h:133

bool should_squeeze_to_series() const

bool

pd_groupby.h:416

View

Series<int64_t> size() const

Series<int64_t>

pd_groupby.h:366

View

bool sort_flag() const

bool

pd_groupby.h:401

Code Examples#

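The following examples are extracted from the test suite. Excerpts appear to be matched by method name only, so several of them exercise a same-named method on another class (for example `DataFrame::head`, `Series::agg`, or the rolling-window `std_`) rather than `DataFrameGroupBy` itself.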

first (pd_test_1_all.cpp:11616)
11607        void pd_test_groupby_first_last() {
11608            std::cout << "========= GroupBy first/last ====================";
11609
11610            std::map<std::string, std::vector<double>> data = {
11611                {"category", {1.0, 1.0, 2.0, 2.0}},
11612                {"value", {10.0, 20.0, 30.0, 40.0}}
11613            };
11614            pandas::DataFrame df(data);
11615
11616            auto first_result = df.groupby("category").first();
11617            auto last_result = df.groupby("category").last();
11618
11619            // First for group 1: 10, group 2: 30
11620            // Last for group 1: 20, group 2: 40
11621            double first1 = std::stod(first_result["value"].get_value_str(0));
11622            double first2 = std::stod(first_result["value"].get_value_str(1));
11623
11624            bool passed = ((std::abs(first1 - 10.0) < 0.001 && std::abs(first2 - 30.0) < 0.001) ||
11625                          (std::abs(first1 - 30.0) < 0.001 && std::abs(first2 - 10.0) < 0.001));
11626            if (!passed) {
get_group (pd_test_2_all.cpp:20487)
20477        ++g_fail;
20478    }
20479}
20480
20481static bool approx_eq(double a, double b, double tol = 1e-9) {
20482    if (std::isnan(a) && std::isnan(b)) return true;
20483    return std::abs(a - b) < tol;
20484}
20485
20486// =====================================================================
20487// Test: get_group() with exclude_cols removes groupby columns
20488// =====================================================================
20489
20490void pd_test_groupby_apply_get_group_exclude() {
20491    std::cout << "  -- pd_test_groupby_apply_get_group_exclude --" << std::endl;
20492
20493    pandas::DataFrame df;
20494    df.add_column("key", std::vector<std::string>{"a", "a", "b", "b"});
20495    df.add_column("val1", std::vector<double>{1.0, 2.0, 3.0, 4.0});
20496    df.add_column("val2", std::vector<double>{10.0, 20.0, 30.0, 40.0});
get_group (pd_test_2_all.cpp:20487)
20477        ++g_fail;
20478    }
20479}
20480
20481static bool approx_eq(double a, double b, double tol = 1e-9) {
20482    if (std::isnan(a) && std::isnan(b)) return true;
20483    return std::abs(a - b) < tol;
20484}
20485
20486// =====================================================================
20487// Test: get_group() with exclude_cols removes groupby columns
20488// =====================================================================
20489
20490void pd_test_groupby_apply_get_group_exclude() {
20491    std::cout << "  -- pd_test_groupby_apply_get_group_exclude --" << std::endl;
20492
20493    pandas::DataFrame df;
20494    df.add_column("key", std::vector<std::string>{"a", "a", "b", "b"});
20495    df.add_column("val1", std::vector<double>{1.0, 2.0, 3.0, 4.0});
20496    df.add_column("val2", std::vector<double>{10.0, 20.0, 30.0, 40.0});
get_numeric_value_columns (pd_test_5_all.cpp:36793)
36783}
36784
36785void case_1_groupby_numeric_columns_Int64() {
36786    const std::string tag = "[X1]";
36787    try {
36788        pandas::DataFrame df;
36789        df.add_column<std::string>("g", {"a","a","b","b"});
36790        df.add_column_nullable<int64_t>("v_Int64", {1, 2, 3, 4});
36791        df.add_column<double>("v_Float64", {1.0, 2.0, 3.0, 4.0});
36792        auto gb = df.groupby(std::vector<std::string>{"g"});
36793        auto cols = gb.get_numeric_value_columns();
36794        std::cout << tag << " numeric_cols.size=" << cols.size();
36795        for (auto& c : cols) std::cout << " [" << c << "]";
36796        std::cout << "\n";
36797        bool has_Int64 = std::find(cols.begin(), cols.end(), std::string("v_Int64")) != cols.end();
36798        std::cout << tag << " has_Int64=" << has_Int64 << "\n";
36799    } catch (const std::exception& e) {
36800        std::cout << tag << " exception: " << e.what() << "\n";
36801    }
36802}
head (pd_test_1_all.cpp:6301)
6291        void pd_test_dataframe_indexing() {
6292            std::cout << "========= indexing (loc/iloc) ==============";
6293
6294            std::map<std::string, std::vector<numpy::float64>> data;
6295            data["A"] = {10.0, 20.0, 30.0, 40.0, 50.0};
6296            data["B"] = {1.0, 2.0, 3.0, 4.0, 5.0};
6297
6298            pandas::DataFrame df(data);
6299
6300            // Test head
6301            auto head_df = df.head(3);
6302            if (head_df.nrows() != 3) {
6303                std::cout << "  [FAIL] : in pd_test_dataframe_indexing() : head(3) nrows != 3" << std::endl;
6304                throw std::runtime_error("pd_test_dataframe_indexing failed: head(3) nrows != 3");
6305            }
6306
6307            // Test tail
6308            auto tail_df = df.tail(2);
6309            if (tail_df.nrows() != 2) {
6310                std::cout << "  [FAIL] : in pd_test_dataframe_indexing() : tail(2) nrows != 2" << std::endl;
6311                throw std::runtime_error("pd_test_dataframe_indexing failed: tail(2) nrows != 2");
idxmax (pd_test_1_all.cpp:23956)
23946        std::cout << "====================================== [OK] pd_test_ffill_bfill test suite ========================== " << std::endl;
23947        return 0;
23948    }
23949
23950} // namespace dataframe_tests
23951// ------------------- pd_test_ffill_bfill.cpp (end) -----------------------------
23952
23953// ------------------- pd_test_idxmax_idxmin.cpp (start) -----------------------------
23954// dataframe_tests/pd_test_idxmax_idxmin.cpp
23955// Test for DataFrame.idxmax() and idxmin() methods
23956
23957#include <iostream>
23958#include <stdexcept>
23959#include <cmath>
23960#include <limits>
23961#include "../pandas/pd_dataframe.h"
23962
23963// CRITICAL: No using namespace directives
23964
23965namespace dataframe_tests {
idxmin (pd_test_1_all.cpp:23956)
23946        std::cout << "====================================== [OK] pd_test_ffill_bfill test suite ========================== " << std::endl;
23947        return 0;
23948    }
23949
23950} // namespace dataframe_tests
23951// ------------------- pd_test_ffill_bfill.cpp (end) -----------------------------
23952
23953// ------------------- pd_test_idxmax_idxmin.cpp (start) -----------------------------
23954// dataframe_tests/pd_test_idxmax_idxmin.cpp
23955// Test for DataFrame.idxmax() and idxmin() methods
23956
23957#include <iostream>
23958#include <stdexcept>
23959#include <cmath>
23960#include <limits>
23961#include "../pandas/pd_dataframe.h"
23962
23963// CRITICAL: No using namespace directives
23964
23965namespace dataframe_tests {
idxmin_with_dtype (pd_test_5_all.cpp:95397)
95388void case_701_dfgb_idxmin_rangeindex(int& local_fail) {
95389    std::cout << "-- case_701_dfgb_idxmin_rangeindex\n";
95390    // Default RangeIndex (int64). Result columns must keep int64 dtype.
95391    pandas::DataFrame df;
95392    df.add_column<double>("v", std::vector<double>{3.0, 1.0, 2.0, 0.5});
95393    df.add_column<int64_t>("key", std::vector<int64_t>{0, 0, 1, 1});
95394    auto gb = df.groupby("key");
95395    pandas::DataFrame out;
95396    std::string err;
95397    try { out = gb.idxmin_with_dtype(); }
95398    catch (const std::exception& e) { err = e.what(); }
95399    catch (...) { err = "<unknown>"; }
95400    pandas_tests::check(err.empty(),
95401        "C_26_case_701_dfgb_idxmin_rangeindex()_no_throw", local_fail);
95402    if (!err.empty()) { std::cout << "  err: " << err << "\n"; return; }
95403    std::string got = df_col_dtype(out, "v");
95404    bool ok = (got == "int64");
95405    pandas_tests::check(ok,
95406        "C_26_case_701_dfgb_idxmin_rangeindex()_dtype", local_fail);
95407    if (!ok) std::cout << "  got=[" << got << "] expected=[int64]\n";
last (pd_test_1_all.cpp:11617)
11607        void pd_test_groupby_first_last() {
11608            std::cout << "========= GroupBy first/last ====================";
11609
11610            std::map<std::string, std::vector<double>> data = {
11611                {"category", {1.0, 1.0, 2.0, 2.0}},
11612                {"value", {10.0, 20.0, 30.0, 40.0}}
11613            };
11614            pandas::DataFrame df(data);
11615
11616            auto first_result = df.groupby("category").first();
11617            auto last_result = df.groupby("category").last();
11618
11619            // First for group 1: 10, group 2: 30
11620            // Last for group 1: 20, group 2: 40
11621            double first1 = std::stod(first_result["value"].get_value_str(0));
11622            double first2 = std::stod(first_result["value"].get_value_str(1));
11623
11624            bool passed = ((std::abs(first1 - 10.0) < 0.001 && std::abs(first2 - 30.0) < 0.001) ||
11625                          (std::abs(first1 - 30.0) < 0.001 && std::abs(first2 - 10.0) < 0.001));
11626            if (!passed) {
11627                std::cout << "  [FAIL] : in pd_test_groupby_first_last() : first values incorrect" << std::endl;
tail (pd_test_1_all.cpp:6308)
6298            pandas::DataFrame df(data);
6299
6300            // Test head
6301            auto head_df = df.head(3);
6302            if (head_df.nrows() != 3) {
6303                std::cout << "  [FAIL] : in pd_test_dataframe_indexing() : head(3) nrows != 3" << std::endl;
6304                throw std::runtime_error("pd_test_dataframe_indexing failed: head(3) nrows != 3");
6305            }
6306
6307            // Test tail
6308            auto tail_df = df.tail(2);
6309            if (tail_df.nrows() != 2) {
6310                std::cout << "  [FAIL] : in pd_test_dataframe_indexing() : tail(2) nrows != 2" << std::endl;
6311                throw std::runtime_error("pd_test_dataframe_indexing failed: tail(2) nrows != 2");
6312            }
6313
6314            // Test iloc_rows range
6315            auto slice = df.iloc_rows(1, 4);
6316            if (slice.nrows() != 3) {
6317                std::cout << "  [FAIL] : in pd_test_dataframe_indexing() : iloc_rows(1,4) nrows != 3" << std::endl;
6318                throw std::runtime_error("pd_test_dataframe_indexing failed: iloc_rows(1,4) nrows != 3");
dropna (pd_test_1_all.cpp:531)
521        }
522
523        // Test isna array
524        numpy::NDArray<numpy::bool_> na_mask = arr.isna();
525        if (na_mask.getSize() != 4) {
526            std::cout << "  [FAIL] : in pd_test_categorical_array_na_handling() : isna size != 4" << std::endl;
527            throw std::runtime_error("pd_test_categorical_array_na_handling failed: isna size != 4");
528        }
529
530        // Test dropna
531        pandas::CategoricalArray dropped = arr.dropna();
532        if (dropped.size() != 2) {
533            std::cout << "  [FAIL] : in pd_test_categorical_array_na_handling() : dropna size != 2" << std::endl;
534            throw std::runtime_error("pd_test_categorical_array_na_handling failed: dropna size != 2");
535        }
536
537        // Test fillna (fill with existing category)
538        pandas::CategoricalArray filled = arr.fillna("a");  // 'a' is in categories
539        if (filled.has_na()) {
540            std::cout << "  [FAIL] : in pd_test_categorical_array_na_handling() : fillna should have no NA" << std::endl;
541            throw std::runtime_error("pd_test_categorical_array_na_handling failed: fillna should have no NA");
count (pd_test_1_all.cpp:66)
56        if (arr.is_na(0)) {
57            std::cout << "  [FAIL] : in pd_test_boolean_array_na_handling() : is_na(0) should be false" << std::endl;
58            throw std::runtime_error("pd_test_boolean_array_na_handling failed: is_na(0) should be false");
59        }
60
61        if (!arr.has_na()) {
62            std::cout << "  [FAIL] : in pd_test_boolean_array_na_handling() : has_na() should be true" << std::endl;
63            throw std::runtime_error("pd_test_boolean_array_na_handling failed: has_na() should be true");
64        }
65
66        if (arr.count() != 2) {
67            std::cout << "  [FAIL] : in pd_test_boolean_array_na_handling() : count() should be 2" << std::endl;
68            throw std::runtime_error("pd_test_boolean_array_na_handling failed: count() should be 2");
69        }
70
71        std::cout << " -> tests passed" << std::endl;
72    }
73
74    void pd_test_boolean_array_kleene_and() {
75        std::cout << "========= BooleanArray: Kleene AND ======================= ";
describe (pd_test_2_all.cpp:19793)
19783        ++g_fail;
19784    }
19785}
19786
19787static bool approx_eq(double a, double b, double tol = 1e-9) {
19788    if (std::isnan(a) && std::isnan(b)) return true;
19789    return std::abs(a - b) < tol;
19790}
19791
19792// =====================================================================
19793// Test: describe() default mode — numeric columns only
19794// =====================================================================
19795
19796void pd_test_describe_numeric_only() {
19797    std::cout << "  -- pd_test_describe_numeric_only --" << std::endl;
19798
19799    pandas::DataFrame df;
19800    df.add_column("A", std::vector<double>{1.0, 2.0, 3.0, 4.0, 5.0});
19801    df.add_column("B", std::vector<double>{10.0, 20.0, 30.0, 40.0, 50.0});
19802    df.add_column("Name", std::vector<std::string>{"a", "b", "c", "d", "e"});
max (pd_test_1_all.cpp:771)
761        pandas::CategoricalArray arr = pandas::CategoricalArray::from_codes(codes, cats, true);  // ordered
762
763        // Test min
764        std::optional<std::string> min_val = arr.min();
765        if (!min_val.has_value() || *min_val != "low") {
766            std::cout << "  [FAIL] : in pd_test_categorical_array_ordered_operations() : min != 'low'" << std::endl;
767            throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: min != 'low'");
768        }
769
770        // Test max
771        std::optional<std::string> max_val = arr.max();
772        if (!max_val.has_value() || *max_val != "high") {
773            std::cout << "  [FAIL] : in pd_test_categorical_array_ordered_operations() : max != 'high'" << std::endl;
774            throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: max != 'high'");
775        }
776
777        // Test unordered throws for min/max
778        pandas::CategoricalArray unordered = arr.as_unordered();
779        bool threw = false;
780        try {
781            unordered.min();
mean (pd_test_1_all.cpp:282)
272            std::optional<bool>(true),
273            std::optional<bool>(true)
274        });
275
276        auto s = arr.sum();
277        if (!s.has_value() || s.value() != 3) {
278            std::cout << "  [FAIL] : in pd_test_boolean_array_reductions() : sum should be 3" << std::endl;
279            throw std::runtime_error("pd_test_boolean_array_reductions failed: sum");
280        }
281
282        auto m = arr.mean();
283        if (!m.has_value() || std::abs(m.value() - 0.75) > 0.001) {
284            std::cout << "  [FAIL] : in pd_test_boolean_array_reductions() : mean should be 0.75" << std::endl;
285            throw std::runtime_error("pd_test_boolean_array_reductions failed: mean");
286        }
287
288        std::cout << " -> tests passed" << std::endl;
289    }
290
291    void pd_test_boolean_array_dtype() {
292        std::cout << "========= BooleanArray: dtype ======================= ";
median (pd_test_1_all.cpp:20910)
20900                throw std::runtime_error("pd_test_expanding_var failed: expanding var values incorrect");
20901            }
20902
20903            std::cout << " -> tests passed" << std::endl;
20904        }
20905
20906        void pd_test_expanding_median() {
20907            std::cout << "========= Expanding median ======================";
20908
20909            pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
20910            auto result = s.expanding().median();
20911
20912            // Expanding median: 1, 1.5, 2, 2.5, 3
20913            bool passed = std::abs(result[0] - 1.0) < 0.001 &&
20914                          std::abs(result[1] - 1.5) < 0.001 &&
20915                          std::abs(result[2] - 2.0) < 0.001 &&
20916                          std::abs(result[3] - 2.5) < 0.001 &&
20917                          std::abs(result[4] - 3.0) < 0.001;
20918            if (!passed) {
20919                std::cout << "  [FAIL] : in pd_test_expanding_median() : expanding median values incorrect" << std::endl;
20920                throw std::runtime_error("pd_test_expanding_median failed: expanding median values incorrect");
min (pd_test_1_all.cpp:764)
754    }
755
756    void pd_test_categorical_array_ordered_operations() {
757        std::cout << "========= CategoricalArray: ordered operations (min/max) ======================= ";
758
759        std::vector<std::string> cats = {"low", "medium", "high"};
760        std::vector<numpy::int32> codes = {0, 2, 1, 0, -1};  // low, high, medium, low, NA
761        pandas::CategoricalArray arr = pandas::CategoricalArray::from_codes(codes, cats, true);  // ordered
762
763        // Test min
764        std::optional<std::string> min_val = arr.min();
765        if (!min_val.has_value() || *min_val != "low") {
766            std::cout << "  [FAIL] : in pd_test_categorical_array_ordered_operations() : min != 'low'" << std::endl;
767            throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: min != 'low'");
768        }
769
770        // Test max
771        std::optional<std::string> max_val = arr.max();
772        if (!max_val.has_value() || *max_val != "high") {
773            std::cout << "  [FAIL] : in pd_test_categorical_array_ordered_operations() : max != 'high'" << std::endl;
774            throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: max != 'high'");
nunique (pd_test_1_all.cpp:10604)
10595    std::cout << " -> tests passed" << std::endl;
10596}
10597
10598void pd_test_extension_index_nunique() {
10599    std::cout << "========= nunique =========================";
10600
10601    pandas::CategoricalArray arr({"a", "b", "a", "c", "b", std::nullopt});
10602    pandas::CategoricalIndex idx(arr);
10603
10604    bool passed = (idx.nunique(true) == 3 && idx.nunique(false) == 4);
10605    if (!passed) {
10606        std::cout << "  [FAIL] : in pd_test_extension_index_nunique() : nunique check failed" << std::endl;
10607        throw std::runtime_error("pd_test_extension_index_nunique failed");
10608    }
10609
10610    std::cout << " -> tests passed" << std::endl;
10611}
10612
10613void pd_test_extension_index_factorize() {
10614    std::cout << "========= factorize =========================";
prod (pd_test_1_all.cpp:26082)
26072        std::cout << "====================================== [OK] pd_test_pivot_table test suite ========================== " << std::endl;
26073        return 0;
26074    }
26075
26076} // namespace dataframe_tests
26077// ------------------- pd_test_pivot_table.cpp (end) -----------------------------
26078
26079// ------------------- pd_test_prod.cpp (start) -----------------------------
26080// dataframe_tests/pd_test_prod.cpp
26081// Tests for DataFrame.prod() and DataFrame.prod_cols() methods
26082
26083#include <iostream>
26084#include <stdexcept>
26085#include <cmath>
26086#include <limits>
26087#include "../pandas/pd_dataframe.h"
26088
26089// CRITICAL: No using namespace directives
26090
26091namespace dataframe_tests {
sem (pd_test_1_all.cpp:4525)
4515#include "../pandas/pd_dataframe.h"
4516#include "../pandas/pd_series.h"
4517
4518namespace dataframe_tests {
4519    namespace dataframe_tests_aggregation {
4520
4521        void pd_test_aggregation_series_sem() {
4522            std::cout << "========= Series sem ============================";
4523
4524            pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
4525            auto sem_val = s.sem();
4526            // std(ddof=1) = sqrt(2.5), sem = sqrt(2.5)/sqrt(5) ≈ 0.707
4527            bool passed = sem_val.has_value() && std::abs(*sem_val - 0.707) < 0.01;
4528            if (!passed) {
4529                std::cout << "  [FAIL] : in pd_test_aggregation_series_sem() : sem value incorrect" << std::endl;
4530                throw std::runtime_error("pd_test_aggregation_series_sem failed: sem value incorrect");
4531            }
4532
4533            std::cout << " -> tests passed" << std::endl;
4534        }
std_ (pd_test_1_all.cpp:20752)
20742                throw std::runtime_error("pd_test_rolling_min_periods failed: with min_periods=1, idx 1 should be 3.0");
20743            }
20744
20745            std::cout << " -> tests passed" << std::endl;
20746        }
20747
20748        void pd_test_rolling_std() {
20749            std::cout << "========= Rolling std ===========================";
20750
20751            pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
20752            auto result = s.rolling(3).std_();
20753
20754            // std([1,2,3]) = 1.0 (ddof=1)
20755            // std([2,3,4]) = 1.0
20756            // std([3,4,5]) = 1.0
20757            bool passed = std::abs(result[2] - 1.0) < 0.001;
20758            if (!passed) {
20759                std::cout << "  [FAIL] : in pd_test_rolling_std() : rolling std should be 1.0" << std::endl;
20760                throw std::runtime_error("pd_test_rolling_std failed: rolling std should be 1.0");
20761            }
sum (pd_test_1_all.cpp:276)
266        }
267
268        // Test sum/mean
269        pandas::BooleanArray arr({
270            std::optional<bool>(true),
271            std::optional<bool>(false),
272            std::optional<bool>(true),
273            std::optional<bool>(true)
274        });
275
276        auto s = arr.sum();
277        if (!s.has_value() || s.value() != 3) {
278            std::cout << "  [FAIL] : in pd_test_boolean_array_reductions() : sum should be 3" << std::endl;
279            throw std::runtime_error("pd_test_boolean_array_reductions failed: sum");
280        }
281
282        auto m = arr.mean();
283        if (!m.has_value() || std::abs(m.value() - 0.75) > 0.001) {
284            std::cout << "  [FAIL] : in pd_test_boolean_array_reductions() : mean should be 0.75" << std::endl;
285            throw std::runtime_error("pd_test_boolean_array_reductions failed: mean");
286        }
var (pd_test_1_all.cpp:20890)
20880                throw std::runtime_error("pd_test_expanding_std failed: expanding std values incorrect");
20881            }
20882
20883            std::cout << " -> tests passed" << std::endl;
20884        }
20885
20886        void pd_test_expanding_var() {
20887            std::cout << "========= Expanding var =========================";
20888
20889            pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
20890            auto result = s.expanding().var();
20891
20892            // Expanding var (ddof=1): NaN, 0.5, 1.0, 1.6667, 2.5
20893            bool passed = std::isnan(result[0]) &&
20894                          std::abs(result[1] - 0.5) < 0.001 &&
20895                          std::abs(result[2] - 1.0) < 0.001 &&
20896                          std::abs(result[3] - 1.6667) < 0.001 &&
20897                          std::abs(result[4] - 2.5) < 0.001;
20898            if (!passed) {
20899                std::cout << "  [FAIL] : in pd_test_expanding_var() : expanding var values incorrect" << std::endl;
20900                throw std::runtime_error("pd_test_expanding_var failed: expanding var values incorrect");
agg (pd_test_1_all.cpp:11100)
11090        }
11091
11092        void pd_test_func_apply_series_agg() {
11093            std::cout << "========= Series agg ==================================";
11094
11095            pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097            bool passed = true;
11098
11099            // Test string-based aggregation
11100            auto sum_result = s.agg("sum");
11101            if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102                passed = false;
11103                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104                throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105            }
11106
11107            auto mean_result = s.agg("mean");
11108            if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109                passed = false;
11110                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090        }
11091
11092        void pd_test_func_apply_series_agg() {
11093            std::cout << "========= Series agg ==================================";
11094
11095            pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097            bool passed = true;
11098
11099            // Test string-based aggregation
11100            auto sum_result = s.agg("sum");
11101            if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102                passed = false;
11103                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104                throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105            }
11106
11107            auto mean_result = s.agg("mean");
11108            if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109                passed = false;
11110                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090        }
11091
11092        void pd_test_func_apply_series_agg() {
11093            std::cout << "========= Series agg ==================================";
11094
11095            pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097            bool passed = true;
11098
11099            // Test string-based aggregation
11100            auto sum_result = s.agg("sum");
11101            if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102                passed = false;
11103                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104                throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105            }
11106
11107            auto mean_result = s.agg("mean");
11108            if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109                passed = false;
11110                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090        }
11091
11092        void pd_test_func_apply_series_agg() {
11093            std::cout << "========= Series agg ==================================";
11094
11095            pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097            bool passed = true;
11098
11099            // Test string-based aggregation
11100            auto sum_result = s.agg("sum");
11101            if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102                passed = false;
11103                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104                throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105            }
11106
11107            auto mean_result = s.agg("mean");
11108            if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109                passed = false;
11110                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090        }
11091
11092        void pd_test_func_apply_series_agg() {
11093            std::cout << "========= Series agg ==================================";
11094
11095            pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097            bool passed = true;
11098
11099            // Test string-based aggregation
11100            auto sum_result = s.agg("sum");
11101            if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102                passed = false;
11103                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104                throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105            }
11106
11107            auto mean_result = s.agg("mean");
11108            if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109                passed = false;
11110                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090        }
11091
11092        void pd_test_func_apply_series_agg() {
11093            std::cout << "========= Series agg ==================================";
11094
11095            pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097            bool passed = true;
11098
11099            // Test string-based aggregation
11100            auto sum_result = s.agg("sum");
11101            if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102                passed = false;
11103                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104                throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105            }
11106
11107            auto mean_result = s.agg("mean");
11108            if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109                passed = false;
11110                std::cout << "  [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg_callable_with_dtype (pd_test_5_all.cpp:95045)
95035    run_sgb_case("count", "object:bool", "int64",
95036        "C_26_case_412_sgb_count_objbool()", lf); }
95037
95038void case_501_callable_int_returns_int64(int& local_fail) {
95039    std::cout << "-- case_501_callable_int_returns_int64\n";
95040    pandas::DataFrame df = make_mixed_df();
95041    auto gb = df.groupby("key");
95042    pandas::DataFrame out;
95043    std::string err;
95044    try {
95045        out = gb.agg_callable_with_dtype(make_int_callable(42));
95046    } catch (const std::exception& e) {
95047        err = e.what();
95048    } catch (...) {
95049        err = "<unknown>";
95050    }
95051    pandas_tests::check(err.empty(),
95052        "C_26_case_501_callable_int_returns_int64()_no_throw",
95053        local_fail);
95054    if (!err.empty()) {
95055        std::cout << "  err: " << err << "\n";
agg_named (pd_test_2_all.cpp:20534)
20524    check(approx_eq(sub_b["val1"].get_value_double(0), 3.0), "get_group_b_val1_r0");
20525    check(approx_eq(sub_b["val1"].get_value_double(1), 4.0), "get_group_b_val1_r1");
20526
20527    // Empty exclude_cols: same as no-exclude overload
20528    std::set<std::string> empty_exclude;
20529    auto sub_empty = gb.get_group("a", empty_exclude);
20530    check(sub_empty.ncols() == 3, "get_group_empty_excl_cols_3");
20531}
20532
20533// =====================================================================
20534// Test: agg_named() basic execution
20535// =====================================================================
20536
20537void pd_test_groupby_apply_named_agg_basic() {
20538    std::cout << "  -- pd_test_groupby_apply_named_agg_basic --" << std::endl;
20539
20540    pandas::DataFrame df;
20541    df.add_column("key", std::vector<std::string>{"a", "a", "b", "b"});
20542    df.add_column("val", std::vector<double>{1.0, 3.0, 5.0, 7.0});
20543
20544    auto gb = df.groupby("key");
agg_with_dtype (pd_test_5_all.cpp:94652)
94642static void run_dfgb_case(const std::string& fn,
94643                          const std::string& col,
94644                          const std::string& expected_dtype,
94645                          const std::string& label,
94646                          int& local_fail) {
94647    pandas::DataFrame df = make_mixed_df();
94648    auto gb = df.groupby("key");
94649    pandas::DataFrame out;
94650    std::string err;
94651    try {
94652        out = gb.agg_with_dtype(fn);
94653    } catch (const std::exception& e) {
94654        err = e.what();
94655    } catch (...) {
94656        err = "<unknown>";
94657    }
94658    pandas_tests::check(err.empty(),
94659        label + "_no_throw",
94660        local_fail);
94661    if (!err.empty()) {
94662        std::cout << "  err: " << err << "\n";
agg_with_dtype_list (pd_test_5_all.cpp:94682)
94672static void run_dfgb_list_case(const std::vector<std::string>& fns,
94673                               const std::string& src_col,
94674                               const std::vector<std::string>& expected,
94675                               const std::string& label,
94676                               int& local_fail) {
94677    pandas::DataFrame df = make_mixed_df();
94678    auto gb = df.groupby("key");
94679    pandas::DataFrame out;
94680    std::string err;
94681    try {
94682        out = gb.agg_with_dtype_list(fns);
94683    } catch (const std::exception& e) {
94684        err = e.what();
94685    } catch (...) {
94686        err = "<unknown>";
94687    }
94688    pandas_tests::check(err.empty(),
94689        label + "_no_throw",
94690        local_fail);
94691    if (!err.empty()) {
94692        std::cout << "  err: " << err << "\n";
apply (pd_test_1_all.cpp:11244)
11234        void pd_test_func_apply_dataframe_apply_axis0() {
11235            std::cout << "========= DataFrame apply axis=0 ======================";
11236
11237            std::map<std::string, std::vector<double>> data = {
11238                {"A", {1.0, 2.0, 3.0}},
11239                {"B", {4.0, 5.0, 6.0}}
11240            };
11241            pandas::DataFrame df(data);
11242
11243            // apply axis=0 applies function to each column
11244            auto result = df.apply([](const std::vector<double>& col) {
11245                return std::accumulate(col.begin(), col.end(), 0.0);
11246            }, 0);
11247
11248            bool passed = true;
11249
11250            // Plan F·dtype: axis=0 reduce now returns a single "result" column
11251            // with the original column names ("A", "B") as the row index.
11252            // Sum of A: 1+2+3=6, Sum of B: 4+5+6=15
11253            const auto& result_col = result["result"];
11254            double sum_a = std::stod(result_col.get_value_str(0));
apply_collect_scalar_results (pd_test_3_all.cpp:27341)
27331    std::vector<double> values;
27332    for (const auto& key : keys) {
27333        auto sub = gb.get_group(key);
27334        double sum = 0;
27335        for (size_t r = 0; r < sub.nrows(); ++r) {
27336            sum += sub["B"].get_value_double(r);
27337        }
27338        values.push_back(sum);
27339    }
27340
27341    auto result = gb.apply_collect_scalar_results(keys, values);
27342    check(result.size() == keys.size(), "scalar results size matches keys size");
27343
27344    bool found_bar = false, found_foo = false;
27345    for (size_t i = 0; i < result.size(); ++i) {
27346        std::string idx = result.index().get_value_str(i);
27347        if (idx == "bar") { check(result[i] == 6.0, "bar sum = 6"); found_bar = true; }
27348        if (idx == "foo") { check(result[i] == 9.0, "foo sum = 9"); found_foo = true; }
27349    }
27350    check(found_bar, "bar key found");
27351    check(found_foo, "foo key found");
apply_collect_series_results (pd_test_3_all.cpp:27376)
27366        auto sub = gb.get_group(key);
27367        double b_sum = 0, c_sum = 0;
27368        for (size_t r = 0; r < sub.nrows(); ++r) {
27369            b_sum += sub["B"].get_value_double(r);
27370            c_sum += sub["C"].get_value_double(r);
27371        }
27372        num_cols["B"].push_back(b_sum / sub.nrows());
27373        num_cols["C"].push_back(c_sum / sub.nrows());
27374    }
27375
27376    auto result = gb.apply_collect_series_results(keys, col_names, num_cols, str_cols);
27377    check(result.ncols() == 2, "series results has 2 columns");
27378    check(result.nrows() == keys.size(), "series results has correct rows");
27379    check(result.has_column("B"), "has column B");
27380    check(result.has_column("C"), "has column C");
27381}
27382
27383void pd_test_gb_apply_dataframe_results() {
27384    std::cout << "  -- pd_test_gb_apply_dataframe_results --" << std::endl;
27385
27386    auto df = make_test_df();
apply_concat_dataframe_results (pd_test_3_all.cpp:27397)
27388    std::vector<std::string> keys = gb.group_keys_order();
27389    std::vector<pandas::DataFrame> dfs;
27390    std::set<std::string> exclude;
27391    exclude.insert("A");
27392
27393    for (const auto& key : keys) {
27394        dfs.push_back(gb.get_group(key, exclude));
27395    }
27396
27397    auto result_gk = gb.apply_concat_dataframe_results(keys, dfs, true);
27398    check(result_gk.nrows() == df.nrows(), "concat with MI has all rows");
27399    check(result_gk.has_multiindex(), "concat with group_keys=true has MultiIndex");
27400
27401    auto result_no_gk = gb.apply_concat_dataframe_results(keys, dfs, false);
27402    check(result_no_gk.nrows() == df.nrows(), "concat without MI has all rows");
27403}
27404
27405void pd_test_gb_filter_basic() {
27406    std::cout << "  -- pd_test_gb_filter_basic --" << std::endl;
resample (pd_test_1_all.cpp:20321)
20311                "2020-01-01 00:00:00",
20312                "2020-01-01 12:00:00",
20313                "2020-01-02 00:00:00",
20314                "2020-01-02 12:00:00",
20315                "2020-01-03 00:00:00",
20316                "2020-01-03 12:00:00"
20317            };
20318            df.set_index(std::make_unique<pandas::Index<std::string>>(dates));
20319
20320            // Resample to daily
20321            auto resampler = df.resample("D");
20322            pandas::DataFrame result = resampler.sum();
20323
20324            // Check that we got aggregated results
20325            bool passed = (result.nrows() <= df.nrows());
20326
20327            if (!passed) {
20328                std::cout << "  [FAIL] : in pd_test_timeseries_resample_basic() : resample didn't reduce rows" << std::endl;
20329                throw std::runtime_error("pd_test_timeseries_resample_basic failed");
20330            }
transform_named (pd_test_3_all.cpp:27465)
27455    auto result_nodrop = gb.filter_by_group_mask(mask, false);
27456    check(result_nodrop.nrows() == 5, "dropna=false keeps all rows");
27457}
27458
27459void pd_test_gb_transform_same_shape() {
27460    std::cout << "  -- pd_test_gb_transform_same_shape --" << std::endl;
27461
27462    auto df = make_test_df();
27463    auto gb = df.groupby("A");
27464
27465    auto result = gb.transform_named("sum");
27466    check(result.nrows() == df.nrows(), "transform sum same nrows as input");
27467    check(result["B"].get_value_double(0) == 9.0, "row 0 (foo) B sum = 9");
27468    check(result["B"].get_value_double(1) == 6.0, "row 1 (bar) B sum = 6");
27469    check(result["B"].get_value_double(2) == 9.0, "row 2 (foo) B sum = 9");
27470
27471    auto result_mean = gb.transform_named("mean");
27472    check(result_mean.nrows() == df.nrows(), "transform mean same nrows");
27473    check(result_mean["B"].get_value_double(0) == 3.0, "row 0 (foo) B mean = 3");
27474    check(result_mean["B"].get_value_double(1) == 3.0, "row 1 (bar) B mean = 3");
squeeze_result (pd_test_2_all.cpp:20697)
20687    std::cout << "  -- test_groupby_squeeze_single_col --" << std::endl;
20688
20689    pandas::DataFrame df;
20690    df.add_column("key", std::vector<std::string>{"A", "A", "B", "B"});
20691    df.add_column("val", std::vector<numpy::float64>{1.0, 2.0, 3.0, 4.0});
20692
20693    auto gb = df.groupby("key");
20694    auto gb_sel = gb.select({"val"});  // single col, not list
20695    pandas::DataFrame result = gb_sel.sum();
20696
20697    auto squeezed = gb_sel.squeeze_result(result);
20698
20699    // Should be a Series<float64>
20700    check(std::holds_alternative<pandas::Series<numpy::float64>>(squeezed), "is_float64_series");
20701
20702    auto& s = std::get<pandas::Series<numpy::float64>>(squeezed);
20703    check(s.size() == 2, "size_2");
20704    check(s.name() == "val", "name_val");
20705    check(approx_eq(s[0], 3.0), "A_sum_3");
20706    check(approx_eq(s[1], 7.0), "B_sum_7");
20707}
column (pd_test_1_all.cpp:22039)
22029            std::string a1 = result.iat<double>(1, col_a_idx) == -1.0 ? "ok" : "fail";
22030            std::string a2 = result.iat<double>(2, col_a_idx) == 3.0 ? "ok" : "fail";
22031            std::string a3 = result.iat<double>(3, col_a_idx) == 4.0 ? "ok" : "fail";
22032
22033            if (a0 != "ok" || a1 != "ok" || a2 != "ok" || a3 != "ok") {
22034                passed = false;
22035                error_msg = "Column A values incorrect: A[0]=" + a0 + ", A[1]=" + a1 +
22036                            ", A[2]=" + a2 + ", A[3]=" + a3;
22037            }
22038
22039            // Check B column (all should be original)
22040            double b0 = result.iat<double>(0, col_b_idx);
22041            if (b0 != 5.0) {
22042                passed = false;
22043                error_msg = "B[0] should be 5, got " + std::to_string(b0);
22044            }
22045
22046            if (!passed) {
22047                std::cout << "  [FAIL] : in pd_test_where_basic() : " << error_msg << std::endl;
22048                throw std::runtime_error("pd_test_where_basic failed: " + error_msg);
22049            }
compute_agg (pd_test_5_all.cpp:112204)
112194    // Default signature is groupby(by, axis, level, as_index, sort, group_keys, observed, dropna).
112195    auto gb = df_in.groupby("k", 0, std::nullopt, /*as_index=*/true,
112196                            /*sort=*/true, /*group_keys=*/true,
112197                            /*observed=*/false, /*dropna=*/true);
112198    pandas::DataFrame df = gb.agg("sum");
112199    std::string actual = df.to_string();
112200
112201    // Pandas oracle (verified by analysis1 H3 logic + compute_agg empty=0.0):
112202    // - "a" observed, sum=10
112203    // - "b" observed, sum=20
112204    // - "c" unobserved -> compute_agg(empty, "sum") -> 0
112205    // Plan 12 (Logic-C int widening) has landed: aggregate_column now
112206    // preserves int64 for integer inputs, so the oracle is int64 with
112207    // integer literal display (no .0 suffix).
112208    std::string expected =
112209        "    v\n"
112210        "k    \n"
112211        "a  10\n"
112212        "b  20\n"
112213        "c   0";
112214    check_case("groupby_agg_dispatch_7c3a91_case_41",
dataframe (pd_test_2_all.cpp:11742)
11732                std::cout << "  [FAIL] : wrong dimensions" << std::endl;
11733                std::remove(temp_path.c_str());
11734                throw std::runtime_error("pd_test_to_hdf_mixed_types failed");
11735            }
11736
11737            std::remove(temp_path.c_str());
11738            std::cout << " -> tests passed" << std::endl;
11739        }
11740
11741        void pd_test_to_hdf_empty_dataframe() {
11742            std::cout << "========= to_hdf empty dataframe (real HDF5) ===================";
11743
11744            pandas::DataFrame df;
11745            std::string temp_path = "temp/test_hdf5_empty.h5";
11746            df.to_hdf(temp_path, "df", "w");
11747
11748            // Just verify file was created
11749            std::ifstream file(temp_path);
11750            if (!file.is_open()) {
11751                std::cout << "  [FAIL] : file not created" << std::endl;
11752                throw std::runtime_error("pd_test_to_hdf_empty_dataframe failed");
filter (pd_test_3_all.cpp:2805)
2795        threw = true;
2796    }
2797    if (!threw) {
2798        throw std::runtime_error("bool_() should throw for multi-element DataFrame");
2799    }
2800
2801    std::cout << " -> tests passed" << std::endl;
2802}
2803
2804void pd_test_3_all_df_filter() {
2805    std::cout << "========= DataFrame.filter() =============================";
2806
2807    std::map<std::string, std::vector<double>> data = {
2808        {"col_a", {1.0, 2.0, 3.0}},
2809        {"col_b", {4.0, 5.0, 6.0}},
2810        {"other", {7.0, 8.0, 9.0}}
2811    };
2812    pandas::DataFrame df(data);
2813
2814    // Test filter by items
2815    pandas::DataFrame filtered_items = df.filter({"col_a", "col_b"});
filter_by_group_mask (pd_test_3_all.cpp:27422)
27412    std::map<std::string, bool> mask;
27413    for (const auto& key : gb.group_keys_order()) {
27414        auto sub = gb.get_group(key);
27415        double sum = 0;
27416        for (size_t r = 0; r < sub.nrows(); ++r) {
27417            sum += sub["B"].get_value_double(r);
27418        }
27419        mask[key] = (sum > 5);
27420    }
27421
27422    auto result = gb.filter_by_group_mask(mask, true);
27423    check(result.nrows() == 5, "all rows pass filter (both groups sum > 5)");
27424
27425    std::map<std::string, bool> mask3;
27426    mask3["bar"] = false;
27427    mask3["foo"] = true;
27428    auto result3 = gb.filter_by_group_mask(mask3, true);
27429    check(result3.nrows() == 3, "only foo rows kept (3 rows)");
27430}
27431
27432void pd_test_gb_filter_preserves_order() {
group_keys_order (pd_test_3_all.cpp:23392)
23383    pandas::Series<numpy::float64> s({10.0, 20.0, 30.0, 40.0});
23384    std::vector<std::vector<std::string>> level_values = {
23385        {"a", "a", "b", "b"}, {"x", "y", "x", "y"}
23386    };
23387    std::vector<std::optional<std::string>> level_names = {"first", "second"};
23388    auto mi = pandas::MultiIndex::from_arrays<std::string>(level_values, level_names);
23389    s.set_multiindex(mi);
23390
23391    auto gb = s.groupby_by_level(static_cast<size_t>(0), true);
23392    if (gb.group_keys_order().size() != 2)
23393        throw std::runtime_error("expected 2 groups");
23394    auto sums = gb.sum();
23395    if (sums[0] != 30.0 || sums[1] != 70.0)
23396        throw std::runtime_error("sum mismatch");
23397    if (!gb.get_index_name().has_value() || *gb.get_index_name() != "first")
23398        throw std::runtime_error("index name mismatch");
23399
23400    std::cout << " -> tests passed" << std::endl;
23401}
groups (pd_test_2_all.cpp:20864)
20854// =====================================================================
20855// Per-group expanding tests
20856// =====================================================================
20857
20858void test_series_groupby_expanding_sum() {
20859    std::cout << "  -- test_series_groupby_expanding_sum --" << std::endl;
20860
20861    // Two groups: A=[1,2,3], B=[10,20]
20862    std::vector<numpy::float64> vals = {1.0, 10.0, 2.0, 20.0, 3.0};
20863    pandas::Series<numpy::float64> data(vals);
20864    pandas::Series<std::string> groups({"A", "B", "A", "B", "A"});
20865
20866    auto sgb = data.groupby(groups);
20867    pandas::SeriesGroupByExpandingWindow ew(sgb, 1);
20868    auto result = ew.sum();
20869
20870    check(result.size() == 5, "size_5");
20871    // A group: expanding sum = 1, 3, 6
20872    // B group: expanding sum = 10, 30
20873    // Original order: [A:1, B:10, A:3, B:30, A:6]
20874    check(approx_eq(result[0], 1.0), "A_exp_sum_0");
list_selected (pd_test_5_all.cpp:28524)
28514}
28515
28516void case_1_squeeze_flag_state_machine(int& local_fail) {
28517    std::cout << "-- H1 squeeze flag state machine\n";
28518    auto df = make_df_std();
28519    auto gb0 = df.groupby("key");
28520
28521    // (a) Base gb -> no selection -> squeeze false.
28522    pandas_tests::check(!gb0.should_squeeze_to_series(),
28523                        "H1.a.base_no_select_squeeze_false", local_fail);
28524    pandas_tests::check(!gb0.list_selected(),
28525                        "H1.a.base_list_selected_false", local_fail);
28526    check_eq("H1.a.base_selected_size_zero", 0,
28527             (long long)gb0.selected_columns().size(), local_fail);
28528
28529    // (b) select({c}) -> squeeze true.
28530    auto gb1 = gb0.select({"v_int"});
28531    pandas_tests::check(gb1.should_squeeze_to_series(),
28532                        "H1.b.select_single_squeeze_true", local_fail);
28533    pandas_tests::check(!gb1.list_selected(),
28534                        "H1.b.select_list_selected_false", local_fail);
ngroups (pd_test_1_all.cpp:11497)
11487            // Create DataFrame with category column
11488            std::map<std::string, std::vector<double>> data = {
11489                {"category", {1.0, 1.0, 2.0, 2.0, 2.0}},
11490                {"value", {10.0, 20.0, 30.0, 40.0, 50.0}}
11491            };
11492            pandas::DataFrame df(data);
11493
11494            // Test groupby
11495            auto grouped = df.groupby("category");
11496
11497            bool passed = grouped.ngroups() == 2;
11498            if (!passed) {
11499                std::cout << "  [FAIL] : in pd_test_groupby_basic() : ngroups should be 2" << std::endl;
11500                throw std::runtime_error("pd_test_groupby_basic failed: ngroups should be 2");
11501            }
11502
11503            std::cout << " -> tests passed" << std::endl;
11504        }
11505
11506        void pd_test_groupby_multiple_columns() {
11507            std::cout << "========= GroupBy multiple columns ==============";
nth (pd_test_3_all.cpp:27491)
27481    check(result_cumsum["B"].get_value_double(1) == 2.0, "row 1 (bar) cumsum B = 2");
27482    check(result_cumsum["B"].get_value_double(3) == 6.0, "row 3 (bar) cumsum B = 6");
27483}
27484
27485void pd_test_gb_nth_basic() {
27486    std::cout << "  -- pd_test_gb_nth_basic --" << std::endl;
27487
27488    auto df = make_test_df();
27489    auto gb = df.groupby("A");
27490
27491    auto result = gb.nth(0);
27492    check(result.nrows() == 2, "nth(0) returns 2 rows (one per group)");
27493
27494    auto result_last = gb.nth(-1);
27495    check(result_last.nrows() == 2, "nth(-1) returns 2 rows");
27496
27497    auto result_multi = gb.nth(std::vector<int>{0, -1});
27498    check(result_multi.nrows() == 4, "nth([0,-1]) returns 4 rows");
27499}
27500
27501void pd_test_gb_nth_slice() {
nth (pd_test_3_all.cpp:27491)
27481    check(result_cumsum["B"].get_value_double(1) == 2.0, "row 1 (bar) cumsum B = 2");
27482    check(result_cumsum["B"].get_value_double(3) == 6.0, "row 3 (bar) cumsum B = 6");
27483}
27484
27485void pd_test_gb_nth_basic() {
27486    std::cout << "  -- pd_test_gb_nth_basic --" << std::endl;
27487
27488    auto df = make_test_df();
27489    auto gb = df.groupby("A");
27490
27491    auto result = gb.nth(0);
27492    check(result.nrows() == 2, "nth(0) returns 2 rows (one per group)");
27493
27494    auto result_last = gb.nth(-1);
27495    check(result_last.nrows() == 2, "nth(-1) returns 2 rows");
27496
27497    auto result_multi = gb.nth(std::vector<int>{0, -1});
27498    check(result_multi.nrows() == 4, "nth([0,-1]) returns 4 rows");
27499}
27500
27501void pd_test_gb_nth_slice() {
select (pd_test_2_all.cpp:20694)
20684// =====================================================================
20685
20686void test_groupby_squeeze_single_col() {
20687    std::cout << "  -- test_groupby_squeeze_single_col --" << std::endl;
20688
20689    pandas::DataFrame df;
20690    df.add_column("key", std::vector<std::string>{"A", "A", "B", "B"});
20691    df.add_column("val", std::vector<numpy::float64>{1.0, 2.0, 3.0, 4.0});
20692
20693    auto gb = df.groupby("key");
20694    auto gb_sel = gb.select({"val"});  // single col, not list
20695    pandas::DataFrame result = gb_sel.sum();
20696
20697    auto squeezed = gb_sel.squeeze_result(result);
20698
20699    // Should be a Series<float64>
20700    check(std::holds_alternative<pandas::Series<numpy::float64>>(squeezed), "is_float64_series");
20701
20702    auto& s = std::get<pandas::Series<numpy::float64>>(squeezed);
20703    check(s.size() == 2, "size_2");
20704    check(s.name() == "val", "name_val");
select_as_list (pd_test_2_all.cpp:20751)
20741}
20742
20743void test_groupby_no_squeeze_list_key() {
20744    std::cout << "  -- test_groupby_no_squeeze_list_key --" << std::endl;
20745
20746    pandas::DataFrame df;
20747    df.add_column("key", std::vector<std::string>{"A", "A", "B", "B"});
20748    df.add_column("val", std::vector<numpy::float64>{1.0, 2.0, 3.0, 4.0});
20749
20750    auto gb = df.groupby("key");
20751    auto gb_sel = gb.select_as_list({"val"});  // list selection -> no squeeze
20752    pandas::DataFrame result = gb_sel.sum();
20753
20754    auto squeezed = gb_sel.squeeze_result(result);
20755    check(std::holds_alternative<std::monostate>(squeezed), "is_monostate_list_sel");
20756}
20757
20758// =====================================================================
20759// apply_result_index tests (MultiIndex reconstruction)
20760// =====================================================================
select_rows_by_indices (pd_test_3_all.cpp:27515)
27505    auto gb = df.groupby("A");
27506
27507    std::vector<size_t> selected;
27508    for (const auto& key : gb.group_keys_order()) {
27509        const auto& indices = gb.groups().at(key);
27510        for (size_t i = 0; i < std::min(size_t(2), indices.size()); ++i) {
27511            selected.push_back(indices[i]);
27512        }
27513    }
27514
27515    auto result = gb.select_rows_by_indices(selected);
27516    check(result.nrows() == 4, "slice [0:2] returns 4 rows");
27517}
27518
27519void pd_test_gb_nth_dropna() {
27520    std::cout << "  -- pd_test_gb_nth_dropna --" << std::endl;
27521
27522    std::map<std::string, std::vector<double>> data;
27523    data["B"] = {std::numeric_limits<double>::quiet_NaN(), 2.0, 3.0, 4.0, 5.0};
27524    data["C"] = {10.0, 20.0, 30.0, 40.0, 50.0};
27525    pandas::DataFrame df(data);
selected_columns (pd_test_5_all.cpp:28527)
28517    std::cout << "-- H1 squeeze flag state machine\n";
28518    auto df = make_df_std();
28519    auto gb0 = df.groupby("key");
28520
28521    // (a) Base gb -> no selection -> squeeze false.
28522    pandas_tests::check(!gb0.should_squeeze_to_series(),
28523                        "H1.a.base_no_select_squeeze_false", local_fail);
28524    pandas_tests::check(!gb0.list_selected(),
28525                        "H1.a.base_list_selected_false", local_fail);
28526    check_eq("H1.a.base_selected_size_zero", 0,
28527             (long long)gb0.selected_columns().size(), local_fail);
28528
28529    // (b) select({c}) -> squeeze true.
28530    auto gb1 = gb0.select({"v_int"});
28531    pandas_tests::check(gb1.should_squeeze_to_series(),
28532                        "H1.b.select_single_squeeze_true", local_fail);
28533    pandas_tests::check(!gb1.list_selected(),
28534                        "H1.b.select_list_selected_false", local_fail);
28535
28536    // (c) select_as_list({c}) 1-col -> squeeze false (DataFrame-style).
28537    auto gb2 = gb0.select_as_list({"v_int"});
should_squeeze_to_series (pd_test_5_all.cpp:28522)
28512        std::vector<std::string>{"level_0", "level_1"});
28513    return df;
28514 }
28515
28516 void case_1_squeeze_flag_state_machine(int& local_fail) {
28517    std::cout << "-- H1 squeeze flag state machine\n";
28518    auto df = make_df_std();
28519    auto gb0 = df.groupby("key");
28520
28521    // (a) Base gb -> no selection -> squeeze false.
28522    pandas_tests::check(!gb0.should_squeeze_to_series(),
28523                        "H1.a.base_no_select_squeeze_false", local_fail);
28524    pandas_tests::check(!gb0.list_selected(),
28525                        "H1.a.base_list_selected_false", local_fail);
28526    check_eq("H1.a.base_selected_size_zero", 0,
28527             (long long)gb0.selected_columns().size(), local_fail);
28528
28529    // (b) select({c}) -> squeeze true.
28530    auto gb1 = gb0.select({"v_int"});
28531    pandas_tests::check(gb1.should_squeeze_to_series(),
28532                        "H1.b.select_single_squeeze_true", local_fail);
size (pd_test_1_all.cpp:22)
12 #include "../pandas/pd_boolean_array.h"
13
14 namespace dataframe_tests {
15
16 namespace dataframe_tests_boolean_array {
17    void pd_test_boolean_array_constructors() {
18        std::cout << "========= BooleanArray: constructors ======================= ";
19
20        // Default constructor
21        pandas::BooleanArray arr1;
22        if (arr1.size() != 0) {
23            std::cout << "  [FAIL] : in pd_test_boolean_array_constructors() : default constructor size != 0" << std::endl;
24            throw std::runtime_error("pd_test_boolean_array_constructors failed: default constructor size != 0");
25        }
26
27        // Initializer list constructor
28        pandas::BooleanArray arr2({
29            std::optional<bool>(true),
30            std::optional<bool>(false),
31            std::nullopt,
32            std::optional<bool>(true)