DataFrameGroupBy#
-
class pandas::DataFrameGroupBy#
GroupBy class for split-apply-combine operations.
Example#
#include <pandas/pandas.h>
using namespace pandas;
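// Build a DataFrameGroupBy by grouping a DataFrame on a key column
DataFrame df;
df.add_column("key", std::vector<std::string>{"a", "a", "b", "b"});
df.add_column("val", std::vector<double>{1.0, 3.0, 5.0, 7.0});
auto gb = df.groupby("key");
// Reduce each group, e.g. take the first / last row per group
auto first_result = gb.first();
auto last_result = gb.last();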
Constructors#
| Signature | Location | Example |
|---|---|---|
| | pd_groupby.h:100 | |
| | pd_groupby.h:111 | |
Indexing / Selection#
| Signature | Return Type | Location | Example |
|---|---|---|---|
| | `DataFrame` | pd_groupby.h:301 | |
| | `std::optional<std::string>` | pd_groupby.h:90 | |
| | `DataFrame` | pd_groupby.h:323 | |
| | `DataFrame` | pd_groupby.h:331 | |
| | `std::vector<std::string>` | pd_groupby.h:447 | |
| | `std::vector<std::string>` | pd_groupby.h:453 | |
| | `DataFrame` | pd_groupby.h:313 | |
| | `DataFrame` | pd_groupby.h:465 | |
| | `DataFrame` | pd_groupby.h:466 | |
| | `DataFrame` | pd_groupby.h:263 | |
| | `DataFrame` | pd_groupby.h:304 | |
| | `DataFrame` | pd_groupby.h:316 | |
Data Manipulation#
| Signature | Return Type | Location | Example |
|---|---|---|---|
| | `bool` | pd_groupby.h:407 | |
Statistics#
| Signature | Return Type | Location | Example |
|---|---|---|---|
| | `DataFrame` | pd_groupby.h:166 | |
| | `DataFrame` | pd_groupby.h:171 | |
| | `DataFrame` | pd_groupby.h:163 | |
| | `DataFrame` | pd_groupby.h:161 | |
| | `DataFrame` | pd_groupby.h:167 | |
| | `DataFrame` | pd_groupby.h:162 | |
| | `DataFrame` | pd_groupby.h:170 | |
| | `DataFrame` | pd_groupby.h:168 | |
| | `DataFrame` | pd_groupby.h:169 | |
| | `DataFrame` | pd_groupby.h:164 | |
| | `DataFrame` | pd_groupby.h:160 | |
| | `DataFrame` | pd_groupby.h:165 | |
Aggregation#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
DataFrame |
pd_groupby.h:177 |
|
|
DataFrame |
pd_groupby.h:183 |
|
|
DataFrame |
pd_groupby.h:193 |
|
|
DataFrame |
pd_groupby.h:204 |
|
|
DataFrame |
pd_groupby.h:234 |
|
|
PANDASCORE_API Result |
pd_groupby.h:352 |
|
|
DataFrame |
pd_groupby.h:257 |
|
|
DataFrame |
pd_groupby.h:500 |
|
|
DataFrame |
pd_groupby.h:339 |
|
|
DataFrame |
pd_groupby.h:248 |
|
|
DataFrame |
pd_groupby.h:252 |
|
|
std::vector<double> |
pd_groupby.h:621 |
|
|
DataFrame |
pd_groupby.h:282 |
|
|
Series<numpy::float64> |
pd_groupby.h:526 |
|
|
Series<std::string> |
pd_groupby.h:536 |
|
|
DataFrame |
pd_groupby.h:549 |
|
|
DataFrame |
pd_groupby.h:563 |
|
|
void |
pd_groupby.h:636 |
|
|
DataFrameGroupByResampler |
pd_groupby.h:512 |
|
|
DataFrame |
pd_groupby.h:473 |
|
|
DataFrame |
pd_groupby.h:584 |
|
|
DataFrame |
pd_groupby.h:593 |
Reshaping#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
pd_groupby.h:441 |
Other Methods#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
bool |
pd_groupby.h:398 |
|
|
void |
pd_groupby.h:617 |
|
|
std::vector<std::string> |
pd_groupby.h:388 |
|
|
const std::vector<std::string>& |
pd_groupby.h:385 |
|
|
std::vector<std::pair<std::string, std::vector<std::string>>> |
pd_groupby.h:235 |
|
|
DataFrameGroupByColumn<T> |
pd_groupby.h:292 |
|
|
static double |
pd_groupby.h:624 |
|
|
const DataFrame& |
pd_groupby.h:382 |
|
|
DataFrame |
pd_groupby.h:274 |
|
|
DataFrame |
pd_groupby.h:574 |
|
|
bool |
pd_groupby.h:404 |
|
|
const std::vector<std::string>& |
pd_groupby.h:377 |
|
|
const std::unordered_map<std::string, std::vector<size_t>>& |
pd_groupby.h:372 |
|
|
DataFrame |
pd_groupby.h:492 |
|
|
bool |
pd_groupby.h:413 |
|
|
std::string |
pd_groupby.h:618 |
|
|
Series<int64_t> |
pd_groupby.h:359 |
|
|
size_t |
pd_groupby.h:369 |
|
|
DataFrame |
pd_groupby.h:310 |
|
|
DataFrame |
pd_groupby.h:613 |
|
|
DataFrame |
pd_groupby.h:488 |
|
|
void |
pd_groupby.h:151 |
|
|
DataFrameGroupBy |
pd_groupby.h:421 |
|
|
DataFrameGroupBy |
pd_groupby.h:429 |
|
|
DataFrame |
pd_groupby.h:602 |
|
|
const std::vector<std::string>& |
pd_groupby.h:410 |
|
|
void |
pd_groupby.h:141 |
|
|
void |
pd_groupby.h:123 |
|
|
void |
pd_groupby.h:627 |
|
|
void |
pd_groupby.h:133 |
|
|
bool |
pd_groupby.h:416 |
|
|
Series<int64_t> |
pd_groupby.h:366 |
|
|
bool |
pd_groupby.h:401 |
Code Examples#
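The following examples are extracted from the test suite. Each entry is labelled `method (file:line)` and shows an excerpt around that line, prefixed with the test file's own line numbers. Excerpts may begin or end mid-function, and some exercise a same-named method on another class (e.g. `df.head(3)` on a DataFrame, `s.agg("sum")` on a Series, `arr.dropna()` on a CategoricalArray) rather than on DataFrameGroupBy.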
first (pd_test_1_all.cpp:11616)
11606 void pd_test_groupby_first_last() {
11607 std::cout << "========= GroupBy first/last ====================";
11608
11609 std::map<std::string, std::vector<double>> data = {
11610 {"category", {1.0, 1.0, 2.0, 2.0}},
11611 {"value", {10.0, 20.0, 30.0, 40.0}}
11612 };
11613 pandas::DataFrame df(data);
11614
11615 auto first_result = df.groupby("category").first();
11616 auto last_result = df.groupby("category").last();
11617
11618 // First for group 1: 10, group 2: 30
11619 // Last for group 1: 20, group 2: 40
11620 double first1 = std::stod(first_result["value"].get_value_str(0));
11621 double first2 = std::stod(first_result["value"].get_value_str(1));
11622
11623 bool passed = ((std::abs(first1 - 10.0) < 0.001 && std::abs(first2 - 30.0) < 0.001) ||
11624 (std::abs(first1 - 30.0) < 0.001 && std::abs(first2 - 10.0) < 0.001));
11625 if (!passed) {
get_group (pd_test_2_all.cpp:20487)
20477 ++g_fail;
20478 }
20479}
20480
20481static bool approx_eq(double a, double b, double tol = 1e-9) {
20482 if (std::isnan(a) && std::isnan(b)) return true;
20483 return std::abs(a - b) < tol;
20484}
20485
20486// =====================================================================
20487// Test: get_group() with exclude_cols removes groupby columns
20488// =====================================================================
20489
20490void pd_test_groupby_apply_get_group_exclude() {
20491 std::cout << " -- pd_test_groupby_apply_get_group_exclude --" << std::endl;
20492
20493 pandas::DataFrame df;
20494 df.add_column("key", std::vector<std::string>{"a", "a", "b", "b"});
20495 df.add_column("val1", std::vector<double>{1.0, 2.0, 3.0, 4.0});
20496 df.add_column("val2", std::vector<double>{10.0, 20.0, 30.0, 40.0});
get_group (pd_test_2_all.cpp:20487)
20477 ++g_fail;
20478 }
20479}
20480
20481static bool approx_eq(double a, double b, double tol = 1e-9) {
20482 if (std::isnan(a) && std::isnan(b)) return true;
20483 return std::abs(a - b) < tol;
20484}
20485
20486// =====================================================================
20487// Test: get_group() with exclude_cols removes groupby columns
20488// =====================================================================
20489
20490void pd_test_groupby_apply_get_group_exclude() {
20491 std::cout << " -- pd_test_groupby_apply_get_group_exclude --" << std::endl;
20492
20493 pandas::DataFrame df;
20494 df.add_column("key", std::vector<std::string>{"a", "a", "b", "b"});
20495 df.add_column("val1", std::vector<double>{1.0, 2.0, 3.0, 4.0});
20496 df.add_column("val2", std::vector<double>{10.0, 20.0, 30.0, 40.0});
get_numeric_value_columns (pd_test_5_all.cpp:36793)
36783}
36784
36785void case_1_groupby_numeric_columns_Int64() {
36786 const std::string tag = "[X1]";
36787 try {
36788 pandas::DataFrame df;
36789 df.add_column<std::string>("g", {"a","a","b","b"});
36790 df.add_column_nullable<int64_t>("v_Int64", {1, 2, 3, 4});
36791 df.add_column<double>("v_Float64", {1.0, 2.0, 3.0, 4.0});
36792 auto gb = df.groupby(std::vector<std::string>{"g"});
36793 auto cols = gb.get_numeric_value_columns();
36794 std::cout << tag << " numeric_cols.size=" << cols.size();
36795 for (auto& c : cols) std::cout << " [" << c << "]";
36796 std::cout << "\n";
36797 bool has_Int64 = std::find(cols.begin(), cols.end(), std::string("v_Int64")) != cols.end();
36798 std::cout << tag << " has_Int64=" << has_Int64 << "\n";
36799 } catch (const std::exception& e) {
36800 std::cout << tag << " exception: " << e.what() << "\n";
36801 }
36802}
head (pd_test_1_all.cpp:6301)
6291 void pd_test_dataframe_indexing() {
6292 std::cout << "========= indexing (loc/iloc) ==============";
6293
6294 std::map<std::string, std::vector<numpy::float64>> data;
6295 data["A"] = {10.0, 20.0, 30.0, 40.0, 50.0};
6296 data["B"] = {1.0, 2.0, 3.0, 4.0, 5.0};
6297
6298 pandas::DataFrame df(data);
6299
6300 // Test head
6301 auto head_df = df.head(3);
6302 if (head_df.nrows() != 3) {
6303 std::cout << " [FAIL] : in pd_test_dataframe_indexing() : head(3) nrows != 3" << std::endl;
6304 throw std::runtime_error("pd_test_dataframe_indexing failed: head(3) nrows != 3");
6305 }
6306
6307 // Test tail
6308 auto tail_df = df.tail(2);
6309 if (tail_df.nrows() != 2) {
6310 std::cout << " [FAIL] : in pd_test_dataframe_indexing() : tail(2) nrows != 2" << std::endl;
6311 throw std::runtime_error("pd_test_dataframe_indexing failed: tail(2) nrows != 2");
idxmax (pd_test_1_all.cpp:23956)
23946 std::cout << "====================================== [OK] pd_test_ffill_bfill test suite ========================== " << std::endl;
23947 return 0;
23948 }
23949
23950} // namespace dataframe_tests
23951// ------------------- pd_test_ffill_bfill.cpp (end) -----------------------------
23952
23953// ------------------- pd_test_idxmax_idxmin.cpp (start) -----------------------------
23954// dataframe_tests/pd_test_idxmax_idxmin.cpp
23955// Test for DataFrame.idxmax() and idxmin() methods
23956
23957#include <iostream>
23958#include <stdexcept>
23959#include <cmath>
23960#include <limits>
23961#include "../pandas/pd_dataframe.h"
23962
23963// CRITICAL: No using namespace directives
23964
23965namespace dataframe_tests {
idxmin (pd_test_1_all.cpp:23956)
23946 std::cout << "====================================== [OK] pd_test_ffill_bfill test suite ========================== " << std::endl;
23947 return 0;
23948 }
23949
23950} // namespace dataframe_tests
23951// ------------------- pd_test_ffill_bfill.cpp (end) -----------------------------
23952
23953// ------------------- pd_test_idxmax_idxmin.cpp (start) -----------------------------
23954// dataframe_tests/pd_test_idxmax_idxmin.cpp
23955// Test for DataFrame.idxmax() and idxmin() methods
23956
23957#include <iostream>
23958#include <stdexcept>
23959#include <cmath>
23960#include <limits>
23961#include "../pandas/pd_dataframe.h"
23962
23963// CRITICAL: No using namespace directives
23964
23965namespace dataframe_tests {
idxmin_with_dtype (pd_test_5_all.cpp:95397)
95387void case_701_dfgb_idxmin_rangeindex(int& local_fail) {
95388 std::cout << "-- case_701_dfgb_idxmin_rangeindex\n";
95389 // Default RangeIndex (int64). Result columns must keep int64 dtype.
95390 pandas::DataFrame df;
95391 df.add_column<double>("v", std::vector<double>{3.0, 1.0, 2.0, 0.5});
95392 df.add_column<int64_t>("key", std::vector<int64_t>{0, 0, 1, 1});
95393 auto gb = df.groupby("key");
95394 pandas::DataFrame out;
95395 std::string err;
95396 try { out = gb.idxmin_with_dtype(); }
95397 catch (const std::exception& e) { err = e.what(); }
95398 catch (...) { err = "<unknown>"; }
95399 pandas_tests::check(err.empty(),
95400 "C_26_case_701_dfgb_idxmin_rangeindex()_no_throw", local_fail);
95401 if (!err.empty()) { std::cout << " err: " << err << "\n"; return; }
95402 std::string got = df_col_dtype(out, "v");
95403 bool ok = (got == "int64");
95404 pandas_tests::check(ok,
95405 "C_26_case_701_dfgb_idxmin_rangeindex()_dtype", local_fail);
95406 if (!ok) std::cout << " got=[" << got << "] expected=[int64]\n";
last (pd_test_1_all.cpp:11617)
11607 void pd_test_groupby_first_last() {
11608 std::cout << "========= GroupBy first/last ====================";
11609
11610 std::map<std::string, std::vector<double>> data = {
11611 {"category", {1.0, 1.0, 2.0, 2.0}},
11612 {"value", {10.0, 20.0, 30.0, 40.0}}
11613 };
11614 pandas::DataFrame df(data);
11615
11616 auto first_result = df.groupby("category").first();
11617 auto last_result = df.groupby("category").last();
11618
11619 // First for group 1: 10, group 2: 30
11620 // Last for group 1: 20, group 2: 40
11621 double first1 = std::stod(first_result["value"].get_value_str(0));
11622 double first2 = std::stod(first_result["value"].get_value_str(1));
11623
11624 bool passed = ((std::abs(first1 - 10.0) < 0.001 && std::abs(first2 - 30.0) < 0.001) ||
11625 (std::abs(first1 - 30.0) < 0.001 && std::abs(first2 - 10.0) < 0.001));
11626 if (!passed) {
11627 std::cout << " [FAIL] : in pd_test_groupby_first_last() : first values incorrect" << std::endl;
tail (pd_test_1_all.cpp:6308)
6298 pandas::DataFrame df(data);
6299
6300 // Test head
6301 auto head_df = df.head(3);
6302 if (head_df.nrows() != 3) {
6303 std::cout << " [FAIL] : in pd_test_dataframe_indexing() : head(3) nrows != 3" << std::endl;
6304 throw std::runtime_error("pd_test_dataframe_indexing failed: head(3) nrows != 3");
6305 }
6306
6307 // Test tail
6308 auto tail_df = df.tail(2);
6309 if (tail_df.nrows() != 2) {
6310 std::cout << " [FAIL] : in pd_test_dataframe_indexing() : tail(2) nrows != 2" << std::endl;
6311 throw std::runtime_error("pd_test_dataframe_indexing failed: tail(2) nrows != 2");
6312 }
6313
6314 // Test iloc_rows range
6315 auto slice = df.iloc_rows(1, 4);
6316 if (slice.nrows() != 3) {
6317 std::cout << " [FAIL] : in pd_test_dataframe_indexing() : iloc_rows(1,4) nrows != 3" << std::endl;
6318 throw std::runtime_error("pd_test_dataframe_indexing failed: iloc_rows(1,4) nrows != 3");
dropna (pd_test_1_all.cpp:531)
521 }
522
523 // Test isna array
524 numpy::NDArray<numpy::bool_> na_mask = arr.isna();
525 if (na_mask.getSize() != 4) {
526 std::cout << " [FAIL] : in pd_test_categorical_array_na_handling() : isna size != 4" << std::endl;
527 throw std::runtime_error("pd_test_categorical_array_na_handling failed: isna size != 4");
528 }
529
530 // Test dropna
531 pandas::CategoricalArray dropped = arr.dropna();
532 if (dropped.size() != 2) {
533 std::cout << " [FAIL] : in pd_test_categorical_array_na_handling() : dropna size != 2" << std::endl;
534 throw std::runtime_error("pd_test_categorical_array_na_handling failed: dropna size != 2");
535 }
536
537 // Test fillna (fill with existing category)
538 pandas::CategoricalArray filled = arr.fillna("a"); // 'a' is in categories
539 if (filled.has_na()) {
540 std::cout << " [FAIL] : in pd_test_categorical_array_na_handling() : fillna should have no NA" << std::endl;
541 throw std::runtime_error("pd_test_categorical_array_na_handling failed: fillna should have no NA");
count (pd_test_1_all.cpp:66)
56 if (arr.is_na(0)) {
57 std::cout << " [FAIL] : in pd_test_boolean_array_na_handling() : is_na(0) should be false" << std::endl;
58 throw std::runtime_error("pd_test_boolean_array_na_handling failed: is_na(0) should be false");
59 }
60
61 if (!arr.has_na()) {
62 std::cout << " [FAIL] : in pd_test_boolean_array_na_handling() : has_na() should be true" << std::endl;
63 throw std::runtime_error("pd_test_boolean_array_na_handling failed: has_na() should be true");
64 }
65
66 if (arr.count() != 2) {
67 std::cout << " [FAIL] : in pd_test_boolean_array_na_handling() : count() should be 2" << std::endl;
68 throw std::runtime_error("pd_test_boolean_array_na_handling failed: count() should be 2");
69 }
70
71 std::cout << " -> tests passed" << std::endl;
72 }
73
74 void pd_test_boolean_array_kleene_and() {
75 std::cout << "========= BooleanArray: Kleene AND ======================= ";
describe (pd_test_2_all.cpp:19793)
19783 ++g_fail;
19784 }
19785}
19786
19787static bool approx_eq(double a, double b, double tol = 1e-9) {
19788 if (std::isnan(a) && std::isnan(b)) return true;
19789 return std::abs(a - b) < tol;
19790}
19791
19792// =====================================================================
19793// Test: describe() default mode — numeric columns only
19794// =====================================================================
19795
19796void pd_test_describe_numeric_only() {
19797 std::cout << " -- pd_test_describe_numeric_only --" << std::endl;
19798
19799 pandas::DataFrame df;
19800 df.add_column("A", std::vector<double>{1.0, 2.0, 3.0, 4.0, 5.0});
19801 df.add_column("B", std::vector<double>{10.0, 20.0, 30.0, 40.0, 50.0});
19802 df.add_column("Name", std::vector<std::string>{"a", "b", "c", "d", "e"});
max (pd_test_1_all.cpp:771)
761 pandas::CategoricalArray arr = pandas::CategoricalArray::from_codes(codes, cats, true); // ordered
762
763 // Test min
764 std::optional<std::string> min_val = arr.min();
765 if (!min_val.has_value() || *min_val != "low") {
766 std::cout << " [FAIL] : in pd_test_categorical_array_ordered_operations() : min != 'low'" << std::endl;
767 throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: min != 'low'");
768 }
769
770 // Test max
771 std::optional<std::string> max_val = arr.max();
772 if (!max_val.has_value() || *max_val != "high") {
773 std::cout << " [FAIL] : in pd_test_categorical_array_ordered_operations() : max != 'high'" << std::endl;
774 throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: max != 'high'");
775 }
776
777 // Test unordered throws for min/max
778 pandas::CategoricalArray unordered = arr.as_unordered();
779 bool threw = false;
780 try {
781 unordered.min();
mean (pd_test_1_all.cpp:282)
272 std::optional<bool>(true),
273 std::optional<bool>(true)
274 });
275
276 auto s = arr.sum();
277 if (!s.has_value() || s.value() != 3) {
278 std::cout << " [FAIL] : in pd_test_boolean_array_reductions() : sum should be 3" << std::endl;
279 throw std::runtime_error("pd_test_boolean_array_reductions failed: sum");
280 }
281
282 auto m = arr.mean();
283 if (!m.has_value() || std::abs(m.value() - 0.75) > 0.001) {
284 std::cout << " [FAIL] : in pd_test_boolean_array_reductions() : mean should be 0.75" << std::endl;
285 throw std::runtime_error("pd_test_boolean_array_reductions failed: mean");
286 }
287
288 std::cout << " -> tests passed" << std::endl;
289 }
290
291 void pd_test_boolean_array_dtype() {
292 std::cout << "========= BooleanArray: dtype ======================= ";
median (pd_test_1_all.cpp:20910)
20900 throw std::runtime_error("pd_test_expanding_var failed: expanding var values incorrect");
20901 }
20902
20903 std::cout << " -> tests passed" << std::endl;
20904 }
20905
20906 void pd_test_expanding_median() {
20907 std::cout << "========= Expanding median ======================";
20908
20909 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
20910 auto result = s.expanding().median();
20911
20912 // Expanding median: 1, 1.5, 2, 2.5, 3
20913 bool passed = std::abs(result[0] - 1.0) < 0.001 &&
20914 std::abs(result[1] - 1.5) < 0.001 &&
20915 std::abs(result[2] - 2.0) < 0.001 &&
20916 std::abs(result[3] - 2.5) < 0.001 &&
20917 std::abs(result[4] - 3.0) < 0.001;
20918 if (!passed) {
20919 std::cout << " [FAIL] : in pd_test_expanding_median() : expanding median values incorrect" << std::endl;
20920 throw std::runtime_error("pd_test_expanding_median failed: expanding median values incorrect");
min (pd_test_1_all.cpp:764)
754 }
755
756 void pd_test_categorical_array_ordered_operations() {
757 std::cout << "========= CategoricalArray: ordered operations (min/max) ======================= ";
758
759 std::vector<std::string> cats = {"low", "medium", "high"};
760 std::vector<numpy::int32> codes = {0, 2, 1, 0, -1}; // low, high, medium, low, NA
761 pandas::CategoricalArray arr = pandas::CategoricalArray::from_codes(codes, cats, true); // ordered
762
763 // Test min
764 std::optional<std::string> min_val = arr.min();
765 if (!min_val.has_value() || *min_val != "low") {
766 std::cout << " [FAIL] : in pd_test_categorical_array_ordered_operations() : min != 'low'" << std::endl;
767 throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: min != 'low'");
768 }
769
770 // Test max
771 std::optional<std::string> max_val = arr.max();
772 if (!max_val.has_value() || *max_val != "high") {
773 std::cout << " [FAIL] : in pd_test_categorical_array_ordered_operations() : max != 'high'" << std::endl;
774 throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: max != 'high'");
nunique (pd_test_1_all.cpp:10604)
10594 std::cout << " -> tests passed" << std::endl;
10595}
10596
10597void pd_test_extension_index_nunique() {
10598 std::cout << "========= nunique =========================";
10599
10600 pandas::CategoricalArray arr({"a", "b", "a", "c", "b", std::nullopt});
10601 pandas::CategoricalIndex idx(arr);
10602
10603 bool passed = (idx.nunique(true) == 3 && idx.nunique(false) == 4);
10604 if (!passed) {
10605 std::cout << " [FAIL] : in pd_test_extension_index_nunique() : nunique check failed" << std::endl;
10606 throw std::runtime_error("pd_test_extension_index_nunique failed");
10607 }
10608
10609 std::cout << " -> tests passed" << std::endl;
10610}
10611
10612void pd_test_extension_index_factorize() {
10613 std::cout << "========= factorize =========================";
prod (pd_test_1_all.cpp:26082)
26072 std::cout << "====================================== [OK] pd_test_pivot_table test suite ========================== " << std::endl;
26073 return 0;
26074 }
26075
26076} // namespace dataframe_tests
26077// ------------------- pd_test_pivot_table.cpp (end) -----------------------------
26078
26079// ------------------- pd_test_prod.cpp (start) -----------------------------
26080// dataframe_tests/pd_test_prod.cpp
26081// Tests for DataFrame.prod() and DataFrame.prod_cols() methods
26082
26083#include <iostream>
26084#include <stdexcept>
26085#include <cmath>
26086#include <limits>
26087#include "../pandas/pd_dataframe.h"
26088
26089// CRITICAL: No using namespace directives
26090
26091namespace dataframe_tests {
sem (pd_test_1_all.cpp:4525)
4515#include "../pandas/pd_dataframe.h"
4516#include "../pandas/pd_series.h"
4517
4518namespace dataframe_tests {
4519 namespace dataframe_tests_aggregation {
4520
4521 void pd_test_aggregation_series_sem() {
4522 std::cout << "========= Series sem ============================";
4523
4524 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
4525 auto sem_val = s.sem();
4526 // std(ddof=1) = sqrt(2.5), sem = sqrt(2.5)/sqrt(5) ≈ 0.707
4527 bool passed = sem_val.has_value() && std::abs(*sem_val - 0.707) < 0.01;
4528 if (!passed) {
4529 std::cout << " [FAIL] : in pd_test_aggregation_series_sem() : sem value incorrect" << std::endl;
4530 throw std::runtime_error("pd_test_aggregation_series_sem failed: sem value incorrect");
4531 }
4532
4533 std::cout << " -> tests passed" << std::endl;
4534 }
std_ (pd_test_1_all.cpp:20752)
20742 throw std::runtime_error("pd_test_rolling_min_periods failed: with min_periods=1, idx 1 should be 3.0");
20743 }
20744
20745 std::cout << " -> tests passed" << std::endl;
20746 }
20747
20748 void pd_test_rolling_std() {
20749 std::cout << "========= Rolling std ===========================";
20750
20751 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
20752 auto result = s.rolling(3).std_();
20753
20754 // std([1,2,3]) = 1.0 (ddof=1)
20755 // std([2,3,4]) = 1.0
20756 // std([3,4,5]) = 1.0
20757 bool passed = std::abs(result[2] - 1.0) < 0.001;
20758 if (!passed) {
20759 std::cout << " [FAIL] : in pd_test_rolling_std() : rolling std should be 1.0" << std::endl;
20760 throw std::runtime_error("pd_test_rolling_std failed: rolling std should be 1.0");
20761 }
sum (pd_test_1_all.cpp:276)
266 }
267
268 // Test sum/mean
269 pandas::BooleanArray arr({
270 std::optional<bool>(true),
271 std::optional<bool>(false),
272 std::optional<bool>(true),
273 std::optional<bool>(true)
274 });
275
276 auto s = arr.sum();
277 if (!s.has_value() || s.value() != 3) {
278 std::cout << " [FAIL] : in pd_test_boolean_array_reductions() : sum should be 3" << std::endl;
279 throw std::runtime_error("pd_test_boolean_array_reductions failed: sum");
280 }
281
282 auto m = arr.mean();
283 if (!m.has_value() || std::abs(m.value() - 0.75) > 0.001) {
284 std::cout << " [FAIL] : in pd_test_boolean_array_reductions() : mean should be 0.75" << std::endl;
285 throw std::runtime_error("pd_test_boolean_array_reductions failed: mean");
286 }
var (pd_test_1_all.cpp:20890)
20880 throw std::runtime_error("pd_test_expanding_std failed: expanding std values incorrect");
20881 }
20882
20883 std::cout << " -> tests passed" << std::endl;
20884 }
20885
20886 void pd_test_expanding_var() {
20887 std::cout << "========= Expanding var =========================";
20888
20889 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
20890 auto result = s.expanding().var();
20891
20892 // Expanding var (ddof=1): NaN, 0.5, 1.0, 1.6667, 2.5
20893 bool passed = std::isnan(result[0]) &&
20894 std::abs(result[1] - 0.5) < 0.001 &&
20895 std::abs(result[2] - 1.0) < 0.001 &&
20896 std::abs(result[3] - 1.6667) < 0.001 &&
20897 std::abs(result[4] - 2.5) < 0.001;
20898 if (!passed) {
20899 std::cout << " [FAIL] : in pd_test_expanding_var() : expanding var values incorrect" << std::endl;
20900 throw std::runtime_error("pd_test_expanding_var failed: expanding var values incorrect");
agg (pd_test_1_all.cpp:11100)
11090 }
11091
11092 void pd_test_func_apply_series_agg() {
11093 std::cout << "========= Series agg ==================================";
11094
11095 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097 bool passed = true;
11098
11099 // Test string-based aggregation
11100 auto sum_result = s.agg("sum");
11101 if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102 passed = false;
11103 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104 throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105 }
11106
11107 auto mean_result = s.agg("mean");
11108 if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109 passed = false;
11110 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090 }
11091
11092 void pd_test_func_apply_series_agg() {
11093 std::cout << "========= Series agg ==================================";
11094
11095 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097 bool passed = true;
11098
11099 // Test string-based aggregation
11100 auto sum_result = s.agg("sum");
11101 if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102 passed = false;
11103 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104 throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105 }
11106
11107 auto mean_result = s.agg("mean");
11108 if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109 passed = false;
11110 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090 }
11091
11092 void pd_test_func_apply_series_agg() {
11093 std::cout << "========= Series agg ==================================";
11094
11095 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097 bool passed = true;
11098
11099 // Test string-based aggregation
11100 auto sum_result = s.agg("sum");
11101 if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102 passed = false;
11103 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104 throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105 }
11106
11107 auto mean_result = s.agg("mean");
11108 if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109 passed = false;
11110 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090 }
11091
11092 void pd_test_func_apply_series_agg() {
11093 std::cout << "========= Series agg ==================================";
11094
11095 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097 bool passed = true;
11098
11099 // Test string-based aggregation
11100 auto sum_result = s.agg("sum");
11101 if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102 passed = false;
11103 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104 throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105 }
11106
11107 auto mean_result = s.agg("mean");
11108 if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109 passed = false;
11110 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090 }
11091
11092 void pd_test_func_apply_series_agg() {
11093 std::cout << "========= Series agg ==================================";
11094
11095 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097 bool passed = true;
11098
11099 // Test string-based aggregation
11100 auto sum_result = s.agg("sum");
11101 if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102 passed = false;
11103 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104 throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105 }
11106
11107 auto mean_result = s.agg("mean");
11108 if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109 passed = false;
11110 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090 }
11091
11092 void pd_test_func_apply_series_agg() {
11093 std::cout << "========= Series agg ==================================";
11094
11095 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097 bool passed = true;
11098
11099 // Test string-based aggregation
11100 auto sum_result = s.agg("sum");
11101 if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102 passed = false;
11103 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104 throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105 }
11106
11107 auto mean_result = s.agg("mean");
11108 if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109 passed = false;
11110 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg_callable_with_dtype (pd_test_5_all.cpp:95045)
95035 run_sgb_case("count", "object:bool", "int64",
95036 "C_26_case_412_sgb_count_objbool()", lf); }
95037
95038void case_501_callable_int_returns_int64(int& local_fail) {
95039 std::cout << "-- case_501_callable_int_returns_int64\n";
95040 pandas::DataFrame df = make_mixed_df();
95041 auto gb = df.groupby("key");
95042 pandas::DataFrame out;
95043 std::string err;
95044 try {
95045 out = gb.agg_callable_with_dtype(make_int_callable(42));
95046 } catch (const std::exception& e) {
95047 err = e.what();
95048 } catch (...) {
95049 err = "<unknown>";
95050 }
95051 pandas_tests::check(err.empty(),
95052 "C_26_case_501_callable_int_returns_int64()_no_throw",
95053 local_fail);
95054 if (!err.empty()) {
95055 std::cout << " err: " << err << "\n";
agg_named (pd_test_2_all.cpp:20534)
20524 check(approx_eq(sub_b["val1"].get_value_double(0), 3.0), "get_group_b_val1_r0");
20525 check(approx_eq(sub_b["val1"].get_value_double(1), 4.0), "get_group_b_val1_r1");
20526
20527 // Empty exclude_cols: same as no-exclude overload
20528 std::set<std::string> empty_exclude;
20529 auto sub_empty = gb.get_group("a", empty_exclude);
20530 check(sub_empty.ncols() == 3, "get_group_empty_excl_cols_3");
20531}
20532
20533// =====================================================================
20534// Test: agg_named() basic execution
20535// =====================================================================
20536
20537void pd_test_groupby_apply_named_agg_basic() {
20538 std::cout << " -- pd_test_groupby_apply_named_agg_basic --" << std::endl;
20539
20540 pandas::DataFrame df;
20541 df.add_column("key", std::vector<std::string>{"a", "a", "b", "b"});
20542 df.add_column("val", std::vector<double>{1.0, 3.0, 5.0, 7.0});
20543
20544 auto gb = df.groupby("key");
agg_with_dtype (pd_test_5_all.cpp:94652)
94642static void run_dfgb_case(const std::string& fn,
94643 const std::string& col,
94644 const std::string& expected_dtype,
94645 const std::string& label,
94646 int& local_fail) {
94647 pandas::DataFrame df = make_mixed_df();
94648 auto gb = df.groupby("key");
94649 pandas::DataFrame out;
94650 std::string err;
94651 try {
94652 out = gb.agg_with_dtype(fn);
94653 } catch (const std::exception& e) {
94654 err = e.what();
94655 } catch (...) {
94656 err = "<unknown>";
94657 }
94658 pandas_tests::check(err.empty(),
94659 label + "_no_throw",
94660 local_fail);
94661 if (!err.empty()) {
94662 std::cout << " err: " << err << "\n";
agg_with_dtype_list (pd_test_5_all.cpp:94682)
94672static void run_dfgb_list_case(const std::vector<std::string>& fns,
94673 const std::string& src_col,
94674 const std::vector<std::string>& expected,
94675 const std::string& label,
94676 int& local_fail) {
94677 pandas::DataFrame df = make_mixed_df();
94678 auto gb = df.groupby("key");
94679 pandas::DataFrame out;
94680 std::string err;
94681 try {
94682 out = gb.agg_with_dtype_list(fns);
94683 } catch (const std::exception& e) {
94684 err = e.what();
94685 } catch (...) {
94686 err = "<unknown>";
94687 }
94688 pandas_tests::check(err.empty(),
94689 label + "_no_throw",
94690 local_fail);
94691 if (!err.empty()) {
94692 std::cout << " err: " << err << "\n";
apply (pd_test_1_all.cpp:11244)
11234 void pd_test_func_apply_dataframe_apply_axis0() {
11235 std::cout << "========= DataFrame apply axis=0 ======================";
11236
11237 std::map<std::string, std::vector<double>> data = {
11238 {"A", {1.0, 2.0, 3.0}},
11239 {"B", {4.0, 5.0, 6.0}}
11240 };
11241 pandas::DataFrame df(data);
11242
11243 // apply axis=0 applies function to each column
11244 auto result = df.apply([](const std::vector<double>& col) {
11245 return std::accumulate(col.begin(), col.end(), 0.0);
11246 }, 0);
11247
11248 bool passed = true;
11249
11250 // Plan F·dtype: axis=0 reduce now returns a single "result" column
11251 // with the original column names ("A", "B") as the row index.
11252 // Sum of A: 1+2+3=6, Sum of B: 4+5+6=15
11253 const auto& result_col = result["result"];
11254 double sum_a = std::stod(result_col.get_value_str(0));
apply_collect_scalar_results (pd_test_3_all.cpp:27341)
27331 std::vector<double> values;
27332 for (const auto& key : keys) {
27333 auto sub = gb.get_group(key);
27334 double sum = 0;
27335 for (size_t r = 0; r < sub.nrows(); ++r) {
27336 sum += sub["B"].get_value_double(r);
27337 }
27338 values.push_back(sum);
27339 }
27340
27341 auto result = gb.apply_collect_scalar_results(keys, values);
27342 check(result.size() == keys.size(), "scalar results size matches keys size");
27343
27344 bool found_bar = false, found_foo = false;
27345 for (size_t i = 0; i < result.size(); ++i) {
27346 std::string idx = result.index().get_value_str(i);
27347 if (idx == "bar") { check(result[i] == 6.0, "bar sum = 6"); found_bar = true; }
27348 if (idx == "foo") { check(result[i] == 9.0, "foo sum = 9"); found_foo = true; }
27349 }
27350 check(found_bar, "bar key found");
27351 check(found_foo, "foo key found");
apply_collect_series_results (pd_test_3_all.cpp:27376)
27366 auto sub = gb.get_group(key);
27367 double b_sum = 0, c_sum = 0;
27368 for (size_t r = 0; r < sub.nrows(); ++r) {
27369 b_sum += sub["B"].get_value_double(r);
27370 c_sum += sub["C"].get_value_double(r);
27371 }
27372 num_cols["B"].push_back(b_sum / sub.nrows());
27373 num_cols["C"].push_back(c_sum / sub.nrows());
27374 }
27375
27376 auto result = gb.apply_collect_series_results(keys, col_names, num_cols, str_cols);
27377 check(result.ncols() == 2, "series results has 2 columns");
27378 check(result.nrows() == keys.size(), "series results has correct rows");
27379 check(result.has_column("B"), "has column B");
27380 check(result.has_column("C"), "has column C");
27381}
27382
27383void pd_test_gb_apply_dataframe_results() {
27384 std::cout << " -- pd_test_gb_apply_dataframe_results --" << std::endl;
27385
27386 auto df = make_test_df();
apply_concat_dataframe_results (pd_test_3_all.cpp:27397)
27388 std::vector<std::string> keys = gb.group_keys_order();
27389 std::vector<pandas::DataFrame> dfs;
27390 std::set<std::string> exclude;
27391 exclude.insert("A");
27392
27393 for (const auto& key : keys) {
27394 dfs.push_back(gb.get_group(key, exclude));
27395 }
27396
27397 auto result_gk = gb.apply_concat_dataframe_results(keys, dfs, true);
27398 check(result_gk.nrows() == df.nrows(), "concat with MI has all rows");
27399 check(result_gk.has_multiindex(), "concat with group_keys=true has MultiIndex");
27400
27401 auto result_no_gk = gb.apply_concat_dataframe_results(keys, dfs, false);
27402 check(result_no_gk.nrows() == df.nrows(), "concat without MI has all rows");
27403}
27404
27405void pd_test_gb_filter_basic() {
27406 std::cout << " -- pd_test_gb_filter_basic --" << std::endl;
resample (pd_test_1_all.cpp:20321)
20311 "2020-01-01 00:00:00",
20312 "2020-01-01 12:00:00",
20313 "2020-01-02 00:00:00",
20314 "2020-01-02 12:00:00",
20315 "2020-01-03 00:00:00",
20316 "2020-01-03 12:00:00"
20317 };
20318 df.set_index(std::make_unique<pandas::Index<std::string>>(dates));
20319
20320 // Resample to daily
20321 auto resampler = df.resample("D");
20322 pandas::DataFrame result = resampler.sum();
20323
20324 // Check that we got aggregated results
20325 bool passed = (result.nrows() <= df.nrows());
20326
20327 if (!passed) {
20328 std::cout << " [FAIL] : in pd_test_timeseries_resample_basic() : resample didn't reduce rows" << std::endl;
20329 throw std::runtime_error("pd_test_timeseries_resample_basic failed");
20330 }
transform_named (pd_test_3_all.cpp:27465)
27455 auto result_nodrop = gb.filter_by_group_mask(mask, false);
27456 check(result_nodrop.nrows() == 5, "dropna=false keeps all rows");
27457}
27458
27459void pd_test_gb_transform_same_shape() {
27460 std::cout << " -- pd_test_gb_transform_same_shape --" << std::endl;
27461
27462 auto df = make_test_df();
27463 auto gb = df.groupby("A");
27464
27465 auto result = gb.transform_named("sum");
27466 check(result.nrows() == df.nrows(), "transform sum same nrows as input");
27467 check(result["B"].get_value_double(0) == 9.0, "row 0 (foo) B sum = 9");
27468 check(result["B"].get_value_double(1) == 6.0, "row 1 (bar) B sum = 6");
27469 check(result["B"].get_value_double(2) == 9.0, "row 2 (foo) B sum = 9");
27470
27471 auto result_mean = gb.transform_named("mean");
27472 check(result_mean.nrows() == df.nrows(), "transform mean same nrows");
27473 check(result_mean["B"].get_value_double(0) == 3.0, "row 0 (foo) B mean = 3");
27474 check(result_mean["B"].get_value_double(1) == 3.0, "row 1 (bar) B mean = 3");
squeeze_result (pd_test_2_all.cpp:20697)
20687 std::cout << " -- test_groupby_squeeze_single_col --" << std::endl;
20688
20689 pandas::DataFrame df;
20690 df.add_column("key", std::vector<std::string>{"A", "A", "B", "B"});
20691 df.add_column("val", std::vector<numpy::float64>{1.0, 2.0, 3.0, 4.0});
20692
20693 auto gb = df.groupby("key");
20694 auto gb_sel = gb.select({"val"}); // single col, not list
20695 pandas::DataFrame result = gb_sel.sum();
20696
20697 auto squeezed = gb_sel.squeeze_result(result);
20698
20699 // Should be a Series<float64>
20700 check(std::holds_alternative<pandas::Series<numpy::float64>>(squeezed), "is_float64_series");
20701
20702 auto& s = std::get<pandas::Series<numpy::float64>>(squeezed);
20703 check(s.size() == 2, "size_2");
20704 check(s.name() == "val", "name_val");
20705 check(approx_eq(s[0], 3.0), "A_sum_3");
20706 check(approx_eq(s[1], 7.0), "B_sum_7");
20707}
column (pd_test_1_all.cpp:22039)
22029 std::string a1 = result.iat<double>(1, col_a_idx) == -1.0 ? "ok" : "fail";
22030 std::string a2 = result.iat<double>(2, col_a_idx) == 3.0 ? "ok" : "fail";
22031 std::string a3 = result.iat<double>(3, col_a_idx) == 4.0 ? "ok" : "fail";
22032
22033 if (a0 != "ok" || a1 != "ok" || a2 != "ok" || a3 != "ok") {
22034 passed = false;
22035 error_msg = "Column A values incorrect: A[0]=" + a0 + ", A[1]=" + a1 +
22036 ", A[2]=" + a2 + ", A[3]=" + a3;
22037 }
22038
22039 // Check B column (all should be original)
22040 double b0 = result.iat<double>(0, col_b_idx);
22041 if (b0 != 5.0) {
22042 passed = false;
22043 error_msg = "B[0] should be 5, got " + std::to_string(b0);
22044 }
22045
22046 if (!passed) {
22047 std::cout << " [FAIL] : in pd_test_where_basic() : " << error_msg << std::endl;
22048 throw std::runtime_error("pd_test_where_basic failed: " + error_msg);
22049 }
compute_agg (pd_test_5_all.cpp:112204)
112194 // Default signature is groupby(by, axis, level, as_index, sort, group_keys, observed, dropna).
112195 auto gb = df_in.groupby("k", 0, std::nullopt, /*as_index=*/true,
112196 /*sort=*/true, /*group_keys=*/true,
112197 /*observed=*/false, /*dropna=*/true);
112198 pandas::DataFrame df = gb.agg("sum");
112199 std::string actual = df.to_string();
112200
112201 // Pandas oracle (verified by analysis1 H3 logic + compute_agg empty=0.0):
112202 // - "a" observed, sum=10
112203 // - "b" observed, sum=20
112204 // - "c" unobserved -> compute_agg(empty, "sum") -> 0
112205 // Plan 12 (Logic-C int widening) has landed: aggregate_column now
112206 // preserves int64 for integer inputs, so the oracle is int64 with
112207 // integer literal display (no .0 suffix).
112208 std::string expected =
112209 " v\n"
112210 "k \n"
112211 "a 10\n"
112212 "b 20\n"
112213 "c 0";
112214 check_case("groupby_agg_dispatch_7c3a91_case_41",
dataframe (pd_test_2_all.cpp:11742)
11732 std::cout << " [FAIL] : wrong dimensions" << std::endl;
11733 std::remove(temp_path.c_str());
11734 throw std::runtime_error("pd_test_to_hdf_mixed_types failed");
11735 }
11736
11737 std::remove(temp_path.c_str());
11738 std::cout << " -> tests passed" << std::endl;
11739 }
11740
11741 void pd_test_to_hdf_empty_dataframe() {
11742 std::cout << "========= to_hdf empty dataframe (real HDF5) ===================";
11743
11744 pandas::DataFrame df;
11745 std::string temp_path = "temp/test_hdf5_empty.h5";
11746 df.to_hdf(temp_path, "df", "w");
11747
11748 // Just verify file was created
11749 std::ifstream file(temp_path);
11750 if (!file.is_open()) {
11751 std::cout << " [FAIL] : file not created" << std::endl;
11752 throw std::runtime_error("pd_test_to_hdf_empty_dataframe failed");
filter (pd_test_3_all.cpp:2805)
2795 threw = true;
2796 }
2797 if (!threw) {
2798 throw std::runtime_error("bool_() should throw for multi-element DataFrame");
2799 }
2800
2801 std::cout << " -> tests passed" << std::endl;
2802}
2803
2804void pd_test_3_all_df_filter() {
2805 std::cout << "========= DataFrame.filter() =============================";
2806
2807 std::map<std::string, std::vector<double>> data = {
2808 {"col_a", {1.0, 2.0, 3.0}},
2809 {"col_b", {4.0, 5.0, 6.0}},
2810 {"other", {7.0, 8.0, 9.0}}
2811 };
2812 pandas::DataFrame df(data);
2813
2814 // Test filter by items
2815 pandas::DataFrame filtered_items = df.filter({"col_a", "col_b"});
filter_by_group_mask (pd_test_3_all.cpp:27422)
27412 std::map<std::string, bool> mask;
27413 for (const auto& key : gb.group_keys_order()) {
27414 auto sub = gb.get_group(key);
27415 double sum = 0;
27416 for (size_t r = 0; r < sub.nrows(); ++r) {
27417 sum += sub["B"].get_value_double(r);
27418 }
27419 mask[key] = (sum > 5);
27420 }
27421
27422 auto result = gb.filter_by_group_mask(mask, true);
27423 check(result.nrows() == 5, "all rows pass filter (both groups sum > 5)");
27424
27425 std::map<std::string, bool> mask3;
27426 mask3["bar"] = false;
27427 mask3["foo"] = true;
27428 auto result3 = gb.filter_by_group_mask(mask3, true);
27429 check(result3.nrows() == 3, "only foo rows kept (3 rows)");
27430}
27431
27432void pd_test_gb_filter_preserves_order() {
group_keys_order (pd_test_3_all.cpp:23392)
23383 pandas::Series<numpy::float64> s({10.0, 20.0, 30.0, 40.0});
23384 std::vector<std::vector<std::string>> level_values = {
23385 {"a", "a", "b", "b"}, {"x", "y", "x", "y"}
23386 };
23387 std::vector<std::optional<std::string>> level_names = {"first", "second"};
23388 auto mi = pandas::MultiIndex::from_arrays<std::string>(level_values, level_names);
23389 s.set_multiindex(mi);
23390
23391 auto gb = s.groupby_by_level(static_cast<size_t>(0), true);
23392 if (gb.group_keys_order().size() != 2)
23393 throw std::runtime_error("expected 2 groups");
23394 auto sums = gb.sum();
23395 if (sums[0] != 30.0 || sums[1] != 70.0)
23396 throw std::runtime_error("sum mismatch");
23397 if (!gb.get_index_name().has_value() || *gb.get_index_name() != "first")
23398 throw std::runtime_error("index name mismatch");
23399
23400 std::cout << " -> tests passed" << std::endl;
23401}
groups (pd_test_2_all.cpp:20864)
20854// =====================================================================
20855// Per-group expanding tests
20856// =====================================================================
20857
20858void test_series_groupby_expanding_sum() {
20859 std::cout << " -- test_series_groupby_expanding_sum --" << std::endl;
20860
20861 // Two groups: A=[1,2,3], B=[10,20]
20862 std::vector<numpy::float64> vals = {1.0, 10.0, 2.0, 20.0, 3.0};
20863 pandas::Series<numpy::float64> data(vals);
20864 pandas::Series<std::string> groups({"A", "B", "A", "B", "A"});
20865
20866 auto sgb = data.groupby(groups);
20867 pandas::SeriesGroupByExpandingWindow ew(sgb, 1);
20868 auto result = ew.sum();
20869
20870 check(result.size() == 5, "size_5");
20871 // A group: expanding sum = 1, 3, 6
20872 // B group: expanding sum = 10, 30
20873 // Original order: [A:1, B:10, A:3, B:30, A:6]
20874 check(approx_eq(result[0], 1.0), "A_exp_sum_0");
list_selected (pd_test_5_all.cpp:28524)
28514}
28515
28516void case_1_squeeze_flag_state_machine(int& local_fail) {
28517 std::cout << "-- H1 squeeze flag state machine\n";
28518 auto df = make_df_std();
28519 auto gb0 = df.groupby("key");
28520
28521 // (a) Base gb -> no selection -> squeeze false.
28522 pandas_tests::check(!gb0.should_squeeze_to_series(),
28523 "H1.a.base_no_select_squeeze_false", local_fail);
28524 pandas_tests::check(!gb0.list_selected(),
28525 "H1.a.base_list_selected_false", local_fail);
28526 check_eq("H1.a.base_selected_size_zero", 0,
28527 (long long)gb0.selected_columns().size(), local_fail);
28528
28529 // (b) select({c}) -> squeeze true.
28530 auto gb1 = gb0.select({"v_int"});
28531 pandas_tests::check(gb1.should_squeeze_to_series(),
28532 "H1.b.select_single_squeeze_true", local_fail);
28533 pandas_tests::check(!gb1.list_selected(),
28534 "H1.b.select_list_selected_false", local_fail);
ngroups (pd_test_1_all.cpp:11497)
11487 // Create DataFrame with category column
11488 std::map<std::string, std::vector<double>> data = {
11489 {"category", {1.0, 1.0, 2.0, 2.0, 2.0}},
11490 {"value", {10.0, 20.0, 30.0, 40.0, 50.0}}
11491 };
11492 pandas::DataFrame df(data);
11493
11494 // Test groupby
11495 auto grouped = df.groupby("category");
11496
11497 bool passed = grouped.ngroups() == 2;
11498 if (!passed) {
11499 std::cout << " [FAIL] : in pd_test_groupby_basic() : ngroups should be 2" << std::endl;
11500 throw std::runtime_error("pd_test_groupby_basic failed: ngroups should be 2");
11501 }
11502
11503 std::cout << " -> tests passed" << std::endl;
11504 }
11505
11506 void pd_test_groupby_multiple_columns() {
11507 std::cout << "========= GroupBy multiple columns ==============";
nth (pd_test_3_all.cpp:27491)
27481 check(result_cumsum["B"].get_value_double(1) == 2.0, "row 1 (bar) cumsum B = 2");
27482 check(result_cumsum["B"].get_value_double(3) == 6.0, "row 3 (bar) cumsum B = 6");
27483}
27484
27485void pd_test_gb_nth_basic() {
27486 std::cout << " -- pd_test_gb_nth_basic --" << std::endl;
27487
27488 auto df = make_test_df();
27489 auto gb = df.groupby("A");
27490
27491 auto result = gb.nth(0);
27492 check(result.nrows() == 2, "nth(0) returns 2 rows (one per group)");
27493
27494 auto result_last = gb.nth(-1);
27495 check(result_last.nrows() == 2, "nth(-1) returns 2 rows");
27496
27497 auto result_multi = gb.nth(std::vector<int>{0, -1});
27498 check(result_multi.nrows() == 4, "nth([0,-1]) returns 4 rows");
27499}
27500
27501void pd_test_gb_nth_slice() {
nth (pd_test_3_all.cpp:27491)
27481 check(result_cumsum["B"].get_value_double(1) == 2.0, "row 1 (bar) cumsum B = 2");
27482 check(result_cumsum["B"].get_value_double(3) == 6.0, "row 3 (bar) cumsum B = 6");
27483}
27484
27485void pd_test_gb_nth_basic() {
27486 std::cout << " -- pd_test_gb_nth_basic --" << std::endl;
27487
27488 auto df = make_test_df();
27489 auto gb = df.groupby("A");
27490
27491 auto result = gb.nth(0);
27492 check(result.nrows() == 2, "nth(0) returns 2 rows (one per group)");
27493
27494 auto result_last = gb.nth(-1);
27495 check(result_last.nrows() == 2, "nth(-1) returns 2 rows");
27496
27497 auto result_multi = gb.nth(std::vector<int>{0, -1});
27498 check(result_multi.nrows() == 4, "nth([0,-1]) returns 4 rows");
27499}
27500
27501void pd_test_gb_nth_slice() {
select (pd_test_2_all.cpp:20694)
20684// =====================================================================
20685
20686void test_groupby_squeeze_single_col() {
20687 std::cout << " -- test_groupby_squeeze_single_col --" << std::endl;
20688
20689 pandas::DataFrame df;
20690 df.add_column("key", std::vector<std::string>{"A", "A", "B", "B"});
20691 df.add_column("val", std::vector<numpy::float64>{1.0, 2.0, 3.0, 4.0});
20692
20693 auto gb = df.groupby("key");
20694 auto gb_sel = gb.select({"val"}); // single col, not list
20695 pandas::DataFrame result = gb_sel.sum();
20696
20697 auto squeezed = gb_sel.squeeze_result(result);
20698
20699 // Should be a Series<float64>
20700 check(std::holds_alternative<pandas::Series<numpy::float64>>(squeezed), "is_float64_series");
20701
20702 auto& s = std::get<pandas::Series<numpy::float64>>(squeezed);
20703 check(s.size() == 2, "size_2");
20704 check(s.name() == "val", "name_val");
select_as_list (pd_test_2_all.cpp:20751)
20741}
20742
20743void test_groupby_no_squeeze_list_key() {
20744 std::cout << " -- test_groupby_no_squeeze_list_key --" << std::endl;
20745
20746 pandas::DataFrame df;
20747 df.add_column("key", std::vector<std::string>{"A", "A", "B", "B"});
20748 df.add_column("val", std::vector<numpy::float64>{1.0, 2.0, 3.0, 4.0});
20749
20750 auto gb = df.groupby("key");
20751 auto gb_sel = gb.select_as_list({"val"}); // list selection -> no squeeze
20752 pandas::DataFrame result = gb_sel.sum();
20753
20754 auto squeezed = gb_sel.squeeze_result(result);
20755 check(std::holds_alternative<std::monostate>(squeezed), "is_monostate_list_sel");
20756}
20757
20758// =====================================================================
20759// apply_result_index tests (MultiIndex reconstruction)
20760// =====================================================================
select_rows_by_indices (pd_test_3_all.cpp:27515)
27505 auto gb = df.groupby("A");
27506
27507 std::vector<size_t> selected;
27508 for (const auto& key : gb.group_keys_order()) {
27509 const auto& indices = gb.groups().at(key);
27510 for (size_t i = 0; i < std::min(size_t(2), indices.size()); ++i) {
27511 selected.push_back(indices[i]);
27512 }
27513 }
27514
27515 auto result = gb.select_rows_by_indices(selected);
27516 check(result.nrows() == 4, "slice [0:2] returns 4 rows");
27517}
27518
27519void pd_test_gb_nth_dropna() {
27520 std::cout << " -- pd_test_gb_nth_dropna --" << std::endl;
27521
27522 std::map<std::string, std::vector<double>> data;
27523 data["B"] = {std::numeric_limits<double>::quiet_NaN(), 2.0, 3.0, 4.0, 5.0};
27524 data["C"] = {10.0, 20.0, 30.0, 40.0, 50.0};
27525 pandas::DataFrame df(data);
selected_columns (pd_test_5_all.cpp:28527)
28517 std::cout << "-- H1 squeeze flag state machine\n";
28518 auto df = make_df_std();
28519 auto gb0 = df.groupby("key");
28520
28521 // (a) Base gb -> no selection -> squeeze false.
28522 pandas_tests::check(!gb0.should_squeeze_to_series(),
28523 "H1.a.base_no_select_squeeze_false", local_fail);
28524 pandas_tests::check(!gb0.list_selected(),
28525 "H1.a.base_list_selected_false", local_fail);
28526 check_eq("H1.a.base_selected_size_zero", 0,
28527 (long long)gb0.selected_columns().size(), local_fail);
28528
28529 // (b) select({c}) -> squeeze true.
28530 auto gb1 = gb0.select({"v_int"});
28531 pandas_tests::check(gb1.should_squeeze_to_series(),
28532 "H1.b.select_single_squeeze_true", local_fail);
28533 pandas_tests::check(!gb1.list_selected(),
28534 "H1.b.select_list_selected_false", local_fail);
28535
28536 // (c) select_as_list({c}) 1-col -> squeeze false (DataFrame-style).
28537 auto gb2 = gb0.select_as_list({"v_int"});
should_squeeze_to_series (pd_test_5_all.cpp:28522)
28512 std::vector<std::string>{"level_0", "level_1"});
28513 return df;
28514}
28515
28516void case_1_squeeze_flag_state_machine(int& local_fail) {
28517 std::cout << "-- H1 squeeze flag state machine\n";
28518 auto df = make_df_std();
28519 auto gb0 = df.groupby("key");
28520
28521 // (a) Base gb -> no selection -> squeeze false.
28522 pandas_tests::check(!gb0.should_squeeze_to_series(),
28523 "H1.a.base_no_select_squeeze_false", local_fail);
28524 pandas_tests::check(!gb0.list_selected(),
28525 "H1.a.base_list_selected_false", local_fail);
28526 check_eq("H1.a.base_selected_size_zero", 0,
28527 (long long)gb0.selected_columns().size(), local_fail);
28528
28529 // (b) select({c}) -> squeeze true.
28530 auto gb1 = gb0.select({"v_int"});
28531 pandas_tests::check(gb1.should_squeeze_to_series(),
28532 "H1.b.select_single_squeeze_true", local_fail);
size (pd_test_1_all.cpp:22)
12#include "../pandas/pd_boolean_array.h"
13
14namespace dataframe_tests {
15
16namespace dataframe_tests_boolean_array {
17 void pd_test_boolean_array_constructors() {
18 std::cout << "========= BooleanArray: constructors ======================= ";
19
20 // Default constructor
21 pandas::BooleanArray arr1;
22 if (arr1.size() != 0) {
23 std::cout << " [FAIL] : in pd_test_boolean_array_constructors() : default constructor size != 0" << std::endl;
24 throw std::runtime_error("pd_test_boolean_array_constructors failed: default constructor size != 0");
25 }
26
27 // Initializer list constructor
28 pandas::BooleanArray arr2({
29 std::optional<bool>(true),
30 std::optional<bool>(false),
31 std::nullopt,
32 std::optional<bool>(true)