SeriesGroupBy#
-
class pandas::SeriesGroupBy#
GroupBy class for split-apply-combine operations.
Example#
#include <pandas/pandas.h>
using namespace pandas;
// Use SeriesGroupBy
SeriesGroupBy obj;
// ... operations ...
Constructors#
Signature |
Location |
Example |
|---|---|---|
|
pd_series_groupby.h:78 |
|
|
pd_series_groupby.h:89 |
Construction#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
Series<T> |
pd_series_groupby.h:1157 |
|
|
Series<double> |
pd_series_groupby.h:1174 |
|
|
Series<int64_t> |
pd_series_groupby.h:1193 |
Indexing / Selection#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
Series<T> |
pd_series_groupby.h:380 |
|
|
Series<T> |
pd_series_groupby.h:1063 |
|
|
std::optional<std::string> |
pd_series_groupby.h:128 |
|
|
cat_values, cats, false, |
pd_series_groupby.h:1108 |
|
|
std::optional<std::string> |
pd_series_groupby.h:131 |
|
|
pandas::Result |
pd_series_groupby.h:728 |
|
|
Series<T> |
pd_series_groupby.h:472 |
Data Manipulation#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
void |
pd_series_groupby.h:120 |
Statistics#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
Series<int64_t> |
pd_series_groupby.h:276 |
|
|
Series<double> |
pd_series_groupby.h:852 |
|
|
Series<double> |
pd_series_groupby.h:877 |
|
|
Series<double> |
pd_series_groupby.h:827 |
|
|
Series<double> |
pd_series_groupby.h:802 |
|
|
Series<T> |
pd_series_groupby.h:342 |
|
|
Series<double> |
pd_series_groupby.h:240 |
|
|
Series<double> |
pd_series_groupby.h:589 |
|
|
Series<T> |
pd_series_groupby.h:304 |
|
|
Series<int64_t> |
pd_series_groupby.h:955 |
|
|
Series<double> |
pd_series_groupby.h:510 |
|
|
auto |
pd_series_groupby.h:180 |
|
|
Series<int64_t> |
pd_series_groupby.h:220 |
|
|
Series<double> |
pd_series_groupby.h:550 |
Aggregation#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
Series<double> |
pd_series_groupby.h:634 |
|
|
DataFrame |
pd_series_groupby.h:708 |
|
|
pandas::Result |
pd_series_groupby.h:716 |
|
|
pandas::Result |
pd_series_groupby.h:722 |
|
|
auto |
pd_series_groupby.h:737 |
|
|
void |
pd_series_groupby.h:1098 |
|
|
Series<T> |
pd_series_groupby.h:767 |
Arithmetic#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
const std::vector<std::string>& |
pd_series_groupby.h:137 |
Comparison#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
std::vector<std::vector<std::string>> |
pd_series_groupby.h:1114 |
Time Series#
Other Methods#
Signature |
Return Type |
Location |
Example |
|---|---|---|---|
|
void |
pd_series_groupby.h:1136 |
|
|
const std::vector<std::string>& |
pd_series_groupby.h:143 |
|
|
Series<int64_t> |
pd_series_groupby.h:991 |
|
|
const std::map<GroupT, std::vector<size_t>>& |
pd_series_groupby.h:107 |
|
|
const std::vector<GroupT>& |
pd_series_groupby.h:112 |
|
|
const std::string& |
pd_series_groupby.h:149 |
|
|
std::vector<GroupT> |
pd_series_groupby.h:1046 |
|
|
const std::map<GroupT, std::vector<size_t>>& |
pd_series_groupby.h:1054 |
|
|
Series<int64_t> |
pd_series_groupby.h:1015 |
|
|
size_t |
pd_series_groupby.h:1038 |
|
|
Series<T> |
pd_series_groupby.h:419 |
|
|
const Series<T>& |
pd_series_groupby.h:117 |
|
|
bool |
pd_series_groupby.h:125 |
|
|
void |
pd_series_groupby.h:153 |
|
|
void |
pd_series_groupby.h:140 |
|
|
void |
pd_series_groupby.h:146 |
|
|
void |
pd_series_groupby.h:134 |
|
|
void |
pd_series_groupby.h:123 |
|
|
void |
pd_series_groupby.h:124 |
|
|
Series<int64_t> |
pd_series_groupby.h:1082 |
|
|
std::string |
pd_series_groupby.h:95 |
|
|
std::string |
pd_series_groupby.h:102 |
Code Examples#
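The following examples are extracted from the test suite. Each example is headed by a method name and the test file and line where that name appears, followed by the surrounding source lines prefixed with their original line numbers. Excerpts are truncated, so they may begin or end in the middle of a function. Examples are matched by method name only: several of them (for example count, cummax, max, mean and median) exercise a same-named method on another class, such as DataFrame, BooleanArray, CategoricalArray or an expanding/rolling window, rather than on SeriesGroupBy, and a match may also fall inside a comment (see sum_int64_bool_).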
first (pd_test_1_all.cpp:11616)
11607 void pd_test_groupby_first_last() {
11608 std::cout << "========= GroupBy first/last ====================";
11609
11610 std::map<std::string, std::vector<double>> data = {
11611 {"category", {1.0, 1.0, 2.0, 2.0}},
11612 {"value", {10.0, 20.0, 30.0, 40.0}}
11613 };
11614 pandas::DataFrame df(data);
11615
11616 auto first_result = df.groupby("category").first();
11617 auto last_result = df.groupby("category").last();
11618
11619 // First for group 1: 10, group 2: 30
11620 // Last for group 1: 20, group 2: 40
11621 double first1 = std::stod(first_result["value"].get_value_str(0));
11622 double first2 = std::stod(first_result["value"].get_value_str(1));
11623
11624 bool passed = ((std::abs(first1 - 10.0) < 0.001 && std::abs(first2 - 30.0) < 0.001) ||
11625 (std::abs(first1 - 30.0) < 0.001 && std::abs(first2 - 10.0) < 0.001));
11626 if (!passed) {
get_group (pd_test_2_all.cpp:20487)
20477 ++g_fail;
20478 }
20479}
20480
20481static bool approx_eq(double a, double b, double tol = 1e-9) {
20482 if (std::isnan(a) && std::isnan(b)) return true;
20483 return std::abs(a - b) < tol;
20484}
20485
20486// =====================================================================
20487// Test: get_group() with exclude_cols removes groupby columns
20488// =====================================================================
20489
20490void pd_test_groupby_apply_get_group_exclude() {
20491 std::cout << " -- pd_test_groupby_apply_get_group_exclude --" << std::endl;
20492
20493 pandas::DataFrame df;
20494 df.add_column("key", std::vector<std::string>{"a", "a", "b", "b"});
20495 df.add_column("val1", std::vector<double>{1.0, 2.0, 3.0, 4.0});
20496 df.add_column("val2", std::vector<double>{10.0, 20.0, 30.0, 40.0});
get_index_name (pd_test_3_all.cpp:23398)
23388 std::vector<std::optional<std::string>> level_names = {"first", "second"};
23389 auto mi = pandas::MultiIndex::from_arrays<std::string>(level_values, level_names);
23390 s.set_multiindex(mi);
23391
23392 auto gb = s.groupby_by_level(static_cast<size_t>(0), true);
23393 if (gb.group_keys_order().size() != 2)
23394 throw std::runtime_error("expected 2 groups");
23395 auto sums = gb.sum();
23396 if (sums[0] != 30.0 || sums[1] != 70.0)
23397 throw std::runtime_error("sum mismatch");
23398 if (!gb.get_index_name().has_value() || *gb.get_index_name() != "first")
23399 throw std::runtime_error("index name mismatch");
23400
23401 std::cout << " -> tests passed" << std::endl;
23402}
23403
23404void pd_test_groupby_level_multi() {
23405 std::cout << "========= groupby_by_level(multi) =====================";
23406
23407 pandas::Series<numpy::float64> s({1.0, 2.0, 3.0, 4.0});
23408 std::vector<std::vector<std::string>> level_values = {
get_index_name (pd_test_3_all.cpp:23398)
23388 std::vector<std::optional<std::string>> level_names = {"first", "second"};
23389 auto mi = pandas::MultiIndex::from_arrays<std::string>(level_values, level_names);
23390 s.set_multiindex(mi);
23391
23392 auto gb = s.groupby_by_level(static_cast<size_t>(0), true);
23393 if (gb.group_keys_order().size() != 2)
23394 throw std::runtime_error("expected 2 groups");
23395 auto sums = gb.sum();
23396 if (sums[0] != 30.0 || sums[1] != 70.0)
23397 throw std::runtime_error("sum mismatch");
23398 if (!gb.get_index_name().has_value() || *gb.get_index_name() != "first")
23399 throw std::runtime_error("index name mismatch");
23400
23401 std::cout << " -> tests passed" << std::endl;
23402}
23403
23404void pd_test_groupby_level_multi() {
23405 std::cout << "========= groupby_by_level(multi) =====================";
23406
23407 pandas::Series<numpy::float64> s({1.0, 2.0, 3.0, 4.0});
23408 std::vector<std::vector<std::string>> level_values = {
idxmin_with_dtype (pd_test_5_all.cpp:95397)
95388void case_701_dfgb_idxmin_rangeindex(int& local_fail) {
95389 std::cout << "-- case_701_dfgb_idxmin_rangeindex\n";
95390 // Default RangeIndex (int64). Result columns must keep int64 dtype.
95391 pandas::DataFrame df;
95392 df.add_column<double>("v", std::vector<double>{3.0, 1.0, 2.0, 0.5});
95393 df.add_column<int64_t>("key", std::vector<int64_t>{0, 0, 1, 1});
95394 auto gb = df.groupby("key");
95395 pandas::DataFrame out;
95396 std::string err;
95397 try { out = gb.idxmin_with_dtype(); }
95398 catch (const std::exception& e) { err = e.what(); }
95399 catch (...) { err = "<unknown>"; }
95400 pandas_tests::check(err.empty(),
95401 "C_26_case_701_dfgb_idxmin_rangeindex()_no_throw", local_fail);
95402 if (!err.empty()) { std::cout << " err: " << err << "\n"; return; }
95403 std::string got = df_col_dtype(out, "v");
95404 bool ok = (got == "int64");
95405 pandas_tests::check(ok,
95406 "C_26_case_701_dfgb_idxmin_rangeindex()_dtype", local_fail);
95407 if (!ok) std::cout << " got=[" << got << "] expected=[int64]\n";
last (pd_test_1_all.cpp:11617)
11607 void pd_test_groupby_first_last() {
11608 std::cout << "========= GroupBy first/last ====================";
11609
11610 std::map<std::string, std::vector<double>> data = {
11611 {"category", {1.0, 1.0, 2.0, 2.0}},
11612 {"value", {10.0, 20.0, 30.0, 40.0}}
11613 };
11614 pandas::DataFrame df(data);
11615
11616 auto first_result = df.groupby("category").first();
11617 auto last_result = df.groupby("category").last();
11618
11619 // First for group 1: 10, group 2: 30
11620 // Last for group 1: 20, group 2: 40
11621 double first1 = std::stod(first_result["value"].get_value_str(0));
11622 double first2 = std::stod(first_result["value"].get_value_str(1));
11623
11624 bool passed = ((std::abs(first1 - 10.0) < 0.001 && std::abs(first2 - 30.0) < 0.001) ||
11625 (std::abs(first1 - 30.0) < 0.001 && std::abs(first2 - 10.0) < 0.001));
11626 if (!passed) {
11627 std::cout << " [FAIL] : in pd_test_groupby_first_last() : first values incorrect" << std::endl;
set_index_name (pd_test_2_all.cpp:20842)
20833void test_sgb_apply_result_index_categorical() {
20834 std::cout << " -- test_sgb_apply_result_index_categorical --" << std::endl;
20835
20836 std::vector<numpy::float64> values = {5.0, 10.0};
20837 pandas::Series<std::string> by({"A", "B"});
20838 pandas::Series<numpy::float64> data(values);
20839
20840 auto sgb = data.groupby(by);
20841 sgb.set_categorical_categories({"A", "B", "C"});
20842 sgb.set_index_name("cat_key");
20843
20844 pandas::Series<numpy::float64> result(values);
20845 std::vector<std::string> idx_labels = {"A", "B"};
20846 result.set_index(std::make_unique<pandas::Index<std::string>>(idx_labels));
20847
20848 sgb.apply_result_index(result);
20849
20850 // Should have CategoricalIndex (dtype_name() returns "category")
20851 check(result.index().dtype_name() == "category", "is_categorical_index");
20852}
count (pd_test_1_all.cpp:66)
56 if (arr.is_na(0)) {
57 std::cout << " [FAIL] : in pd_test_boolean_array_na_handling() : is_na(0) should be false" << std::endl;
58 throw std::runtime_error("pd_test_boolean_array_na_handling failed: is_na(0) should be false");
59 }
60
61 if (!arr.has_na()) {
62 std::cout << " [FAIL] : in pd_test_boolean_array_na_handling() : has_na() should be true" << std::endl;
63 throw std::runtime_error("pd_test_boolean_array_na_handling failed: has_na() should be true");
64 }
65
66 if (arr.count() != 2) {
67 std::cout << " [FAIL] : in pd_test_boolean_array_na_handling() : count() should be 2" << std::endl;
68 throw std::runtime_error("pd_test_boolean_array_na_handling failed: count() should be 2");
69 }
70
71 std::cout << " -> tests passed" << std::endl;
72 }
73
74 void pd_test_boolean_array_kleene_and() {
75 std::cout << "========= BooleanArray: Kleene AND ======================= ";
cummax (pd_test_1_all.cpp:5152)
5142 // cummin: [1, 1, 1, 1]
5143 auto cmin = df.cummin();
5144 val = cmin["A"].get_value_str(3);
5145 passed = std::abs(std::stod(val) - 1.0) < 0.001;
5146 if (!passed) {
5147 std::cout << " [FAIL] : in pd_test_arithmetic_dataframe_cumulative() : cummin failed" << std::endl;
5148 throw std::runtime_error("pd_test_arithmetic_dataframe_cumulative failed: cummin failed");
5149 }
5150
5151 // cummax: [1, 2, 3, 4]
5152 auto cmax = df.cummax();
5153 val = cmax["A"].get_value_str(2);
5154 passed = std::abs(std::stod(val) - 3.0) < 0.001;
5155 if (!passed) {
5156 std::cout << " [FAIL] : in pd_test_arithmetic_dataframe_cumulative() : cummax failed" << std::endl;
5157 throw std::runtime_error("pd_test_arithmetic_dataframe_cumulative failed: cummax failed");
5158 }
5159
5160 std::cout << " -> tests passed" << std::endl;
5161 }
cummin (pd_test_1_all.cpp:5143)
5133 // cumprod: [1, 2, 6, 24]
5134 auto cp = df.cumprod();
5135 val = cp["A"].get_value_str(3);
5136 passed = std::abs(std::stod(val) - 24.0) < 0.001;
5137 if (!passed) {
5138 std::cout << " [FAIL] : in pd_test_arithmetic_dataframe_cumulative() : cumprod failed" << std::endl;
5139 throw std::runtime_error("pd_test_arithmetic_dataframe_cumulative failed: cumprod failed");
5140 }
5141
5142 // cummin: [1, 1, 1, 1]
5143 auto cmin = df.cummin();
5144 val = cmin["A"].get_value_str(3);
5145 passed = std::abs(std::stod(val) - 1.0) < 0.001;
5146 if (!passed) {
5147 std::cout << " [FAIL] : in pd_test_arithmetic_dataframe_cumulative() : cummin failed" << std::endl;
5148 throw std::runtime_error("pd_test_arithmetic_dataframe_cumulative failed: cummin failed");
5149 }
5150
5151 // cummax: [1, 2, 3, 4]
5152 auto cmax = df.cummax();
5153 val = cmax["A"].get_value_str(2);
cumprod (pd_test_1_all.cpp:5134)
5124 // cumsum: [1, 3, 6, 10]
5125 auto cs = df.cumsum();
5126 std::string val = cs["A"].get_value_str(2);
5127 bool passed = std::abs(std::stod(val) - 6.0) < 0.001;
5128 if (!passed) {
5129 std::cout << " [FAIL] : in pd_test_arithmetic_dataframe_cumulative() : cumsum failed" << std::endl;
5130 throw std::runtime_error("pd_test_arithmetic_dataframe_cumulative failed: cumsum failed");
5131 }
5132
5133 // cumprod: [1, 2, 6, 24]
5134 auto cp = df.cumprod();
5135 val = cp["A"].get_value_str(3);
5136 passed = std::abs(std::stod(val) - 24.0) < 0.001;
5137 if (!passed) {
5138 std::cout << " [FAIL] : in pd_test_arithmetic_dataframe_cumulative() : cumprod failed" << std::endl;
5139 throw std::runtime_error("pd_test_arithmetic_dataframe_cumulative failed: cumprod failed");
5140 }
5141
5142 // cummin: [1, 1, 1, 1]
5143 auto cmin = df.cummin();
5144 val = cmin["A"].get_value_str(3);
cumsum (pd_test_1_all.cpp:5125)
5115 }
5116
5117 void pd_test_arithmetic_dataframe_cumulative() {
5118 std::cout << "========= DataFrame cumulative ==================";
5119
5120 std::map<std::string, std::vector<double>> data;
5121 data["A"] = {1.0, 2.0, 3.0, 4.0};
5122 pandas::DataFrame df(data);
5123
5124 // cumsum: [1, 3, 6, 10]
5125 auto cs = df.cumsum();
5126 std::string val = cs["A"].get_value_str(2);
5127 bool passed = std::abs(std::stod(val) - 6.0) < 0.001;
5128 if (!passed) {
5129 std::cout << " [FAIL] : in pd_test_arithmetic_dataframe_cumulative() : cumsum failed" << std::endl;
5130 throw std::runtime_error("pd_test_arithmetic_dataframe_cumulative failed: cumsum failed");
5131 }
5132
5133 // cumprod: [1, 2, 6, 24]
5134 auto cp = df.cumprod();
5135 val = cp["A"].get_value_str(3);
max (pd_test_1_all.cpp:771)
761 pandas::CategoricalArray arr = pandas::CategoricalArray::from_codes(codes, cats, true); // ordered
762
763 // Test min
764 std::optional<std::string> min_val = arr.min();
765 if (!min_val.has_value() || *min_val != "low") {
766 std::cout << " [FAIL] : in pd_test_categorical_array_ordered_operations() : min != 'low'" << std::endl;
767 throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: min != 'low'");
768 }
769
770 // Test max
771 std::optional<std::string> max_val = arr.max();
772 if (!max_val.has_value() || *max_val != "high") {
773 std::cout << " [FAIL] : in pd_test_categorical_array_ordered_operations() : max != 'high'" << std::endl;
774 throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: max != 'high'");
775 }
776
777 // Test unordered throws for min/max
778 pandas::CategoricalArray unordered = arr.as_unordered();
779 bool threw = false;
780 try {
781 unordered.min();
mean (pd_test_1_all.cpp:282)
272 std::optional<bool>(true),
273 std::optional<bool>(true)
274 });
275
276 auto s = arr.sum();
277 if (!s.has_value() || s.value() != 3) {
278 std::cout << " [FAIL] : in pd_test_boolean_array_reductions() : sum should be 3" << std::endl;
279 throw std::runtime_error("pd_test_boolean_array_reductions failed: sum");
280 }
281
282 auto m = arr.mean();
283 if (!m.has_value() || std::abs(m.value() - 0.75) > 0.001) {
284 std::cout << " [FAIL] : in pd_test_boolean_array_reductions() : mean should be 0.75" << std::endl;
285 throw std::runtime_error("pd_test_boolean_array_reductions failed: mean");
286 }
287
288 std::cout << " -> tests passed" << std::endl;
289 }
290
291 void pd_test_boolean_array_dtype() {
292 std::cout << "========= BooleanArray: dtype ======================= ";
median (pd_test_1_all.cpp:20910)
20900 throw std::runtime_error("pd_test_expanding_var failed: expanding var values incorrect");
20901 }
20902
20903 std::cout << " -> tests passed" << std::endl;
20904 }
20905
20906 void pd_test_expanding_median() {
20907 std::cout << "========= Expanding median ======================";
20908
20909 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
20910 auto result = s.expanding().median();
20911
20912 // Expanding median: 1, 1.5, 2, 2.5, 3
20913 bool passed = std::abs(result[0] - 1.0) < 0.001 &&
20914 std::abs(result[1] - 1.5) < 0.001 &&
20915 std::abs(result[2] - 2.0) < 0.001 &&
20916 std::abs(result[3] - 2.5) < 0.001 &&
20917 std::abs(result[4] - 3.0) < 0.001;
20918 if (!passed) {
20919 std::cout << " [FAIL] : in pd_test_expanding_median() : expanding median values incorrect" << std::endl;
20920 throw std::runtime_error("pd_test_expanding_median failed: expanding median values incorrect");
min (pd_test_1_all.cpp:764)
754 }
755
756 void pd_test_categorical_array_ordered_operations() {
757 std::cout << "========= CategoricalArray: ordered operations (min/max) ======================= ";
758
759 std::vector<std::string> cats = {"low", "medium", "high"};
760 std::vector<numpy::int32> codes = {0, 2, 1, 0, -1}; // low, high, medium, low, NA
761 pandas::CategoricalArray arr = pandas::CategoricalArray::from_codes(codes, cats, true); // ordered
762
763 // Test min
764 std::optional<std::string> min_val = arr.min();
765 if (!min_val.has_value() || *min_val != "low") {
766 std::cout << " [FAIL] : in pd_test_categorical_array_ordered_operations() : min != 'low'" << std::endl;
767 throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: min != 'low'");
768 }
769
770 // Test max
771 std::optional<std::string> max_val = arr.max();
772 if (!max_val.has_value() || *max_val != "high") {
773 std::cout << " [FAIL] : in pd_test_categorical_array_ordered_operations() : max != 'high'" << std::endl;
774 throw std::runtime_error("pd_test_categorical_array_ordered_operations failed: max != 'high'");
nunique (pd_test_1_all.cpp:10604)
10595 std::cout << " -> tests passed" << std::endl;
10596}
10597
10598void pd_test_extension_index_nunique() {
10599 std::cout << "========= nunique =========================";
10600
10601 pandas::CategoricalArray arr({"a", "b", "a", "c", "b", std::nullopt});
10602 pandas::CategoricalIndex idx(arr);
10603
10604 bool passed = (idx.nunique(true) == 3 && idx.nunique(false) == 4);
10605 if (!passed) {
10606 std::cout << " [FAIL] : in pd_test_extension_index_nunique() : nunique check failed" << std::endl;
10607 throw std::runtime_error("pd_test_extension_index_nunique failed");
10608 }
10609
10610 std::cout << " -> tests passed" << std::endl;
10611}
10612
10613void pd_test_extension_index_factorize() {
10614 std::cout << "========= factorize =========================";
std_ (pd_test_1_all.cpp:20752)
20742 throw std::runtime_error("pd_test_rolling_min_periods failed: with min_periods=1, idx 1 should be 3.0");
20743 }
20744
20745 std::cout << " -> tests passed" << std::endl;
20746 }
20747
20748 void pd_test_rolling_std() {
20749 std::cout << "========= Rolling std ===========================";
20750
20751 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
20752 auto result = s.rolling(3).std_();
20753
20754 // std([1,2,3]) = 1.0 (ddof=1)
20755 // std([2,3,4]) = 1.0
20756 // std([3,4,5]) = 1.0
20757 bool passed = std::abs(result[2] - 1.0) < 0.001;
20758 if (!passed) {
20759 std::cout << " [FAIL] : in pd_test_rolling_std() : rolling std should be 1.0" << std::endl;
20760 throw std::runtime_error("pd_test_rolling_std failed: rolling std should be 1.0");
20761 }
sum (pd_test_1_all.cpp:276)
266 }
267
268 // Test sum/mean
269 pandas::BooleanArray arr({
270 std::optional<bool>(true),
271 std::optional<bool>(false),
272 std::optional<bool>(true),
273 std::optional<bool>(true)
274 });
275
276 auto s = arr.sum();
277 if (!s.has_value() || s.value() != 3) {
278 std::cout << " [FAIL] : in pd_test_boolean_array_reductions() : sum should be 3" << std::endl;
279 throw std::runtime_error("pd_test_boolean_array_reductions failed: sum");
280 }
281
282 auto m = arr.mean();
283 if (!m.has_value() || std::abs(m.value() - 0.75) > 0.001) {
284 std::cout << " [FAIL] : in pd_test_boolean_array_reductions() : mean should be 0.75" << std::endl;
285 throw std::runtime_error("pd_test_boolean_array_reductions failed: mean");
286 }
sum_int64_bool_ (pd_test_5_all.cpp:55457)
55447 check_col_dtype("caseG3", df, 0, "float64", local_fail);
55448}
55449
55450static void f_seriesgroupby_agg_list_dtype_12_b3d2f7_caseH1_bool_sum(int& local_fail) {
55451 std::cout << "-- caseH1_bool_sum\n";
55452 pandas::Series<bool> v({true, false, true, true});
55453 pandas::Series<std::string> by({"a", "a", "b", "b"});
55454 auto sgb = v.groupby(by);
55455 pandas::DataFrame df = sgb.agg(std::vector<std::string>{"sum"});
55456 check_ncols("caseH1", df, 1, local_fail);
55457 // Plan 21: pandas widens bool sum to int64 — fixed via sum_int64_bool_().
55458 check_col_dtype("caseH1", df, 0, "int64", local_fail);
55459}
55460
55461static void f_seriesgroupby_agg_list_dtype_12_b3d2f7_caseH2_bool_first(int& local_fail) {
55462 std::cout << "-- caseH2_bool_first\n";
55463 pandas::Series<bool> v({true, false, true, true});
55464 pandas::Series<std::string> by({"a", "a", "b", "b"});
55465 auto sgb = v.groupby(by);
55466 pandas::DataFrame df = sgb.agg(std::vector<std::string>{"first"});
55467 check_ncols("caseH2", df, 1, local_fail);
var (pd_test_1_all.cpp:20890)
20880 throw std::runtime_error("pd_test_expanding_std failed: expanding std values incorrect");
20881 }
20882
20883 std::cout << " -> tests passed" << std::endl;
20884 }
20885
20886 void pd_test_expanding_var() {
20887 std::cout << "========= Expanding var =========================";
20888
20889 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0});
20890 auto result = s.expanding().var();
20891
20892 // Expanding var (ddof=1): NaN, 0.5, 1.0, 1.6667, 2.5
20893 bool passed = std::isnan(result[0]) &&
20894 std::abs(result[1] - 0.5) < 0.001 &&
20895 std::abs(result[2] - 1.0) < 0.001 &&
20896 std::abs(result[3] - 1.6667) < 0.001 &&
20897 std::abs(result[4] - 2.5) < 0.001;
20898 if (!passed) {
20899 std::cout << " [FAIL] : in pd_test_expanding_var() : expanding var values incorrect" << std::endl;
20900 throw std::runtime_error("pd_test_expanding_var failed: expanding var values incorrect");
agg (pd_test_1_all.cpp:11100)
11090 }
11091
11092 void pd_test_func_apply_series_agg() {
11093 std::cout << "========= Series agg ==================================";
11094
11095 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097 bool passed = true;
11098
11099 // Test string-based aggregation
11100 auto sum_result = s.agg("sum");
11101 if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102 passed = false;
11103 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104 throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105 }
11106
11107 auto mean_result = s.agg("mean");
11108 if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109 passed = false;
11110 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg (pd_test_1_all.cpp:11100)
11090 }
11091
11092 void pd_test_func_apply_series_agg() {
11093 std::cout << "========= Series agg ==================================";
11094
11095 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0, 5.0}, "values");
11096
11097 bool passed = true;
11098
11099 // Test string-based aggregation
11100 auto sum_result = s.agg("sum");
11101 if (!sum_result.has_value() || !approx_equal(sum_result.value(), 15.0)) {
11102 passed = false;
11103 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : sum failed" << std::endl;
11104 throw std::runtime_error("pd_test_func_apply_series_agg failed: sum failed");
11105 }
11106
11107 auto mean_result = s.agg("mean");
11108 if (!mean_result.has_value() || !approx_equal(mean_result.value(), 3.0)) {
11109 passed = false;
11110 std::cout << " [FAIL] : in pd_test_func_apply_series_agg() : mean failed" << std::endl;
agg_with_dtype (pd_test_5_all.cpp:94652)
94642static void run_dfgb_case(const std::string& fn,
94643 const std::string& col,
94644 const std::string& expected_dtype,
94645 const std::string& label,
94646 int& local_fail) {
94647 pandas::DataFrame df = make_mixed_df();
94648 auto gb = df.groupby("key");
94649 pandas::DataFrame out;
94650 std::string err;
94651 try {
94652 out = gb.agg_with_dtype(fn);
94653 } catch (const std::exception& e) {
94654 err = e.what();
94655 } catch (...) {
94656 err = "<unknown>";
94657 }
94658 pandas_tests::check(err.empty(),
94659 label + "_no_throw",
94660 local_fail);
94661 if (!err.empty()) {
94662 std::cout << " err: " << err << "\n";
agg_with_dtype_list (pd_test_5_all.cpp:94682)
94672static void run_dfgb_list_case(const std::vector<std::string>& fns,
94673 const std::string& src_col,
94674 const std::vector<std::string>& expected,
94675 const std::string& label,
94676 int& local_fail) {
94677 pandas::DataFrame df = make_mixed_df();
94678 auto gb = df.groupby("key");
94679 pandas::DataFrame out;
94680 std::string err;
94681 try {
94682 out = gb.agg_with_dtype_list(fns);
94683 } catch (const std::exception& e) {
94684 err = e.what();
94685 } catch (...) {
94686 err = "<unknown>";
94687 }
94688 pandas_tests::check(err.empty(),
94689 label + "_no_throw",
94690 local_fail);
94691 if (!err.empty()) {
94692 std::cout << " err: " << err << "\n";
apply (pd_test_1_all.cpp:11244)
11234 void pd_test_func_apply_dataframe_apply_axis0() {
11235 std::cout << "========= DataFrame apply axis=0 ======================";
11236
11237 std::map<std::string, std::vector<double>> data = {
11238 {"A", {1.0, 2.0, 3.0}},
11239 {"B", {4.0, 5.0, 6.0}}
11240 };
11241 pandas::DataFrame df(data);
11242
11243 // apply axis=0 applies function to each column
11244 auto result = df.apply([](const std::vector<double>& col) {
11245 return std::accumulate(col.begin(), col.end(), 0.0);
11246 }, 0);
11247
11248 bool passed = true;
11249
11250 // Plan F·dtype: axis=0 reduce now returns a single "result" column
11251 // with the original column names ("A", "B") as the row index.
11252 // Sum of A: 1+2+3=6, Sum of B: 4+5+6=15
11253 const auto& result_col = result["result"];
11254 double sum_a = std::stod(result_col.get_value_str(0));
apply_result_index (pd_test_2_all.cpp:20781)
20771 pandas::Series<std::string> by(keys);
20772 pandas::Series<numpy::float64> data(values);
20773
20774 auto sgb = data.groupby(by);
20775 sgb.set_multiindex_names({"level0", "level1"});
20776
20777 // Create a "result" series with composite index
20778 pandas::Series<numpy::float64> result(values);
20779 result.set_index(std::make_unique<pandas::Index<std::string>>(keys));
20780
20781 sgb.apply_result_index(result);
20782
20783 // Should now have a MultiIndex
20784 check(result.has_multiindex(), "has_multiindex");
20785 check(result.multiindex().nlevels() == 2, "nlevels_2");
20786}
20787
20788void test_sgb_apply_result_index_3level() {
20789 std::cout << " -- test_sgb_apply_result_index_3level --" << std::endl;
20790
20791 using std::string;
transform (pd_test_1_all.cpp:11071)
11062 std::cout << " -> tests passed" << std::endl;
11063 }
11064
11065 void pd_test_func_apply_series_transform() {
11066 std::cout << "========= Series transform ============================";
11067
11068 pandas::Series<double> s({1.0, 2.0, 3.0, 4.0}, "values");
11069
11070 // Transform must return same shape
11071 auto result = s.transform([](double x) { return x * 2 + 1; });
11072
11073 bool passed = true;
11074 if (result.size() != s.size()) {
11075 passed = false;
11076 std::cout << " [FAIL] : in pd_test_func_apply_series_transform() : size changed" << std::endl;
11077 throw std::runtime_error("pd_test_func_apply_series_transform failed: size changed");
11078 }
11079
11080 std::vector<double> expected = {3.0, 5.0, 7.0, 9.0};
11081 for (size_t i = 0; i < result.size(); ++i) {
multiindex_names (pd_test_3_all.cpp:23419)
23409 {"a", "a", "b", "b"}, {"x", "y", "x", "y"}
23410 };
23411 std::vector<std::optional<std::string>> level_names = {"L0", "L1"};
23412 auto mi = pandas::MultiIndex::from_arrays<std::string>(level_values, level_names);
23413 s.set_multiindex(mi);
23414
23415 std::vector<size_t> levels = {0, 1};
23416 auto gb = s.groupby_by_level(levels, true);
23417 if (gb.group_keys_order().size() != 4)
23418 throw std::runtime_error("expected 4 composite groups");
23419 if (gb.multiindex_names().size() != 2 || gb.multiindex_names()[0] != "L0" || gb.multiindex_names()[1] != "L1")
23420 throw std::runtime_error("multiindex names mismatch");
23421
23422 std::cout << " -> tests passed" << std::endl;
23423}
23424
23425void pd_test_groupby_by_index() {
23426 std::cout << "========= groupby_by_index() ==========================";
23427
23428 pandas::Series<numpy::float64> s({10.0, 20.0, 30.0});
23429 s.set_index(pandas::Index<std::string>({"a", "b", "a"}));
diff (pd_test_1_all.cpp:5171)
5161 }
5162
5163 void pd_test_arithmetic_dataframe_diff_shift() {
5164 std::cout << "========= DataFrame diff/shift ==================";
5165
5166 std::map<std::string, std::vector<double>> data;
5167 data["A"] = {1.0, 3.0, 6.0, 10.0};
5168 pandas::DataFrame df(data);
5169
5170 // diff: [NaN, 2, 3, 4]
5171 auto d = df.diff();
5172 std::string val = d["A"].get_value_str(1);
5173 bool passed = std::abs(std::stod(val) - 2.0) < 0.001;
5174 if (!passed) {
5175 std::cout << " [FAIL] : in pd_test_arithmetic_dataframe_diff_shift() : diff failed" << std::endl;
5176 throw std::runtime_error("pd_test_arithmetic_dataframe_diff_shift failed: diff failed");
5177 }
5178
5179 // First element should be NaN
5180 val = d["A"].get_value_str(0);
5181 passed = std::isnan(std::stod(val));
shift (pd_test_1_all.cpp:5188)
5179 // First element should be NaN
5180 val = d["A"].get_value_str(0);
5181 passed = std::isnan(std::stod(val));
5182 if (!passed) {
5183 std::cout << " [FAIL] : in pd_test_arithmetic_dataframe_diff_shift() : diff NaN failed" << std::endl;
5184 throw std::runtime_error("pd_test_arithmetic_dataframe_diff_shift failed: diff NaN failed");
5185 }
5186
5187 // shift: [NaN, 1, 3, 6]
5188 auto s = df.shift();
5189 val = s["A"].get_value_str(1);
5190 passed = std::abs(std::stod(val) - 1.0) < 0.001;
5191 if (!passed) {
5192 std::cout << " [FAIL] : in pd_test_arithmetic_dataframe_diff_shift() : shift failed" << std::endl;
5193 throw std::runtime_error("pd_test_arithmetic_dataframe_diff_shift failed: shift failed");
5194 }
5195
5196 std::cout << " -> tests passed" << std::endl;
5197 }
categorical_categories (pd_test_3_all.cpp:23513)
23503 pandas::CategoricalArray cat({"a", "b", "a"}, {"a", "b", "c"});
23504
23505 auto gb_obs = s.groupby_by_categorical(cat, true, true);
23506 if (gb_obs.group_keys_order().size() != 2)
23507 throw std::runtime_error("expected 2 observed groups");
23508
23509 auto gb_all = s.groupby_by_categorical(cat, true, false);
23510 if (gb_all.group_keys_order().size() != 3)
23511 throw std::runtime_error("expected 3 groups with observed=false");
23512
23513 if (gb_obs.categorical_categories().size() != 3)
23514 throw std::runtime_error("categorical_categories not set");
23515
23516 std::cout << " -> tests passed" << std::endl;
23517}
23518
23519void pd_test_groupby_by_labels() {
23520 std::cout << "========= groupby_by_labels() =========================";
23521
23522 pandas::Series<numpy::float64> s({1.0, 2.0, 3.0, 4.0});
23523 std::vector<std::string> labels = {"X", "Y", "X", "Y"};
group_keys_order (pd_test_3_all.cpp:23393)
23383 pandas::Series<numpy::float64> s({10.0, 20.0, 30.0, 40.0});
23384 std::vector<std::vector<std::string>> level_values = {
23385 {"a", "a", "b", "b"}, {"x", "y", "x", "y"}
23386 };
23387 std::vector<std::optional<std::string>> level_names = {"first", "second"};
23388 auto mi = pandas::MultiIndex::from_arrays<std::string>(level_values, level_names);
23389 s.set_multiindex(mi);
23390
23391 auto gb = s.groupby_by_level(static_cast<size_t>(0), true);
23392 if (gb.group_keys_order().size() != 2)
23393 throw std::runtime_error("expected 2 groups");
23394 auto sums = gb.sum();
23395 if (sums[0] != 30.0 || sums[1] != 70.0)
23396 throw std::runtime_error("sum mismatch");
23397 if (!gb.get_index_name().has_value() || *gb.get_index_name() != "first")
23398 throw std::runtime_error("index name mismatch");
23399
23400 std::cout << " -> tests passed" << std::endl;
23401 }
grouper_dtype (pd_test_3_all.cpp:23493)
23483 std::cout << "========= groupby_by_numeric() ========================";
23484
23485 pandas::Series<numpy::float64> s({10.0, 20.0, 30.0, 40.0});
23486 pandas::Series<numpy::float64> by_s({1.0, 2.0, 1.0, 2.0});
23487 auto gb = s.groupby_by_numeric(by_s, true);
23488 if (gb.group_keys_order().size() != 2)
23489 throw std::runtime_error("expected 2 groups");
23490 auto sums = gb.sum();
23491 if (sums[0] != 40.0 || sums[1] != 60.0)
23492 throw std::runtime_error("sum mismatch");
23493 if (gb.grouper_dtype() != "float64")
23494 throw std::runtime_error("grouper_dtype mismatch");
23495
23496 std::cout << " -> tests passed" << std::endl;
23497 }
23498
23499 void pd_test_groupby_by_categorical() {
23500 std::cout << "========= groupby_by_categorical() ====================";
23501
23502 pandas::Series<numpy::float64> s({10.0, 20.0, 30.0});
23503 pandas::CategoricalArray cat({"a", "b", "a"}, {"a", "b", "c"});
groups (pd_test_2_all.cpp:20864)
20854 // =====================================================================
20855 // Per-group expanding tests
20856 // =====================================================================
20857
20858 void test_series_groupby_expanding_sum() {
20859 std::cout << " -- test_series_groupby_expanding_sum --" << std::endl;
20860
20861 // Two groups: A=[1,2,3], B=[10,20]
20862 std::vector<numpy::float64> vals = {1.0, 10.0, 2.0, 20.0, 3.0};
20863 pandas::Series<numpy::float64> data(vals);
20864 pandas::Series<std::string> groups({"A", "B", "A", "B", "A"});
20865
20866 auto sgb = data.groupby(groups);
20867 pandas::SeriesGroupByExpandingWindow ew(sgb, 1);
20868 auto result = ew.sum();
20869
20870 check(result.size() == 5, "size_5");
20871 // A group: expanding sum = 1, 3, 6
20872 // B group: expanding sum = 10, 30
20873 // Original order: [A:1, B:10, A:3, B:30, A:6]
20874 check(approx_eq(result[0], 1.0), "A_exp_sum_0");
indices (pd_test_1_all.cpp:14921)
14911 passed = passed && r2_tup1[0] == "b" && r2_tup1[1] == "x";
14912 passed = passed && r2_tup2[0] == "c" && r2_tup2[1] == "x";
14913 }
14914
14915 // Test empty vector (no deletion)
14916 std::cout << " Test 3: Empty delete_(std::vector<size_t>{})..." << std::endl;
14917 auto result3 = mi.delete_(std::vector<size_t>{});
14918 std::cout << " Result size: " << result3.size() << " (expected " << mi.size() << ")" << std::endl;
14919 passed = passed && result3.size() == mi.size();
14920
14921 // Test duplicate indices (should be deduplicated)
14922 std::cout << " Test 4: Duplicate delete_({1, 1, 2})..." << std::endl;
14923 auto result4 = mi.delete_({1, 1, 2});
14924 std::cout << " Result size: " << result4.size() << " (expected 3)" << std::endl;
14925 passed = passed && result4.size() == 3;
14926
14927 // Test deleting all elements
14928 std::cout << " Test 5: Delete all delete_({0,1,2,3,4})..." << std::endl;
14929 auto result5 = mi.delete_({0, 1, 2, 3, 4});
14930 std::cout << " Result size: " << result5.size() << " (expected 0)" << std::endl;
14931 passed = passed && result5.size() == 0;
ngroups (pd_test_1_all.cpp:11497)
11487 // Create DataFrame with category column
11488 std::map<std::string, std::vector<double>> data = {
11489 {"category", {1.0, 1.0, 2.0, 2.0, 2.0}},
11490 {"value", {10.0, 20.0, 30.0, 40.0, 50.0}}
11491 };
11492 pandas::DataFrame df(data);
11493
11494 // Test groupby
11495 auto grouped = df.groupby("category");
11496
11497 bool passed = grouped.ngroups() == 2;
11498 if (!passed) {
11499 std::cout << " [FAIL] : in pd_test_groupby_basic() : ngroups should be 2" << std::endl;
11500 throw std::runtime_error("pd_test_groupby_basic failed: ngroups should be 2");
11501 }
11502
11503 std::cout << " -> tests passed" << std::endl;
11504 }
11505
11506 void pd_test_groupby_multiple_columns() {
11507 std::cout << "========= GroupBy multiple columns ==============";
nth (pd_test_3_all.cpp:27491)
27481 check(result_cumsum["B"].get_value_double(1) == 2.0, "row 1 (bar) cumsum B = 2");
27482 check(result_cumsum["B"].get_value_double(3) == 6.0, "row 3 (bar) cumsum B = 6");
27483 }
27484
27485 void pd_test_gb_nth_basic() {
27486 std::cout << " -- pd_test_gb_nth_basic --" << std::endl;
27487
27488 auto df = make_test_df();
27489 auto gb = df.groupby("A");
27490
27491 auto result = gb.nth(0);
27492 check(result.nrows() == 2, "nth(0) returns 2 rows (one per group)");
27493
27494 auto result_last = gb.nth(-1);
27495 check(result_last.nrows() == 2, "nth(-1) returns 2 rows");
27496
27497 auto result_multi = gb.nth(std::vector<int>{0, -1});
27498 check(result_multi.nrows() == 4, "nth([0,-1]) returns 4 rows");
27499 }
27500
27501 void pd_test_gb_nth_slice() {
series (pd_test_2_all.cpp:2307)
2297 std::vector<std::string> index = {"a", "b", "c", "d", "e"};
2298
2299 std::map<std::string, std::vector<numpy::float64>> data1;
2300 data1["col1"] = {1.0, 2.0, 3.0, 4.0, 5.0};
2301 data1["col2"] = {2.0, 4.0, 6.0, 8.0, 10.0}; // Perfectly correlated with col1
2302
2303 pandas::DataFrame df1(data1, std::make_unique<pandas::Index<std::string>>(index));
2304
2305 // Series with same index and values that correlate with df columns
2306 pandas::Series<numpy::float64> series({1.0, 2.0, 3.0, 4.0, 5.0});
2307 series.set_index(pandas::Index<std::string>(index));
2308
2309 pandas::Series<numpy::float64> result = df1.corrwith(series);
2310
2311 bool passed = true;
2312 // col1 should have correlation 1.0 with series
2313 if (!approx_equal(result[0], 1.0)) {
2314 std::cout << "\n [FAIL] : Expected correlation 1.0 for col1, got " << result[0] << std::endl;
2315 passed = false;
2316 }
set_categorical_categories (pd_test_2_all.cpp:20841)
20831 }
20832
20833 void test_sgb_apply_result_index_categorical() {
20834 std::cout << " -- test_sgb_apply_result_index_categorical --" << std::endl;
20835
20836 std::vector<numpy::float64> values = {5.0, 10.0};
20837 pandas::Series<std::string> by({"A", "B"});
20838 pandas::Series<numpy::float64> data(values);
20839
20840 auto sgb = data.groupby(by);
20841 sgb.set_categorical_categories({"A", "B", "C"});
20842 sgb.set_index_name("cat_key");
20843
20844 pandas::Series<numpy::float64> result(values);
20845 std::vector<std::string> idx_labels = {"A", "B"};
20846 result.set_index(std::make_unique<pandas::Index<std::string>>(idx_labels));
20847
20848 sgb.apply_result_index(result);
20849
20850 // Should have CategoricalIndex (dtype_name() returns "category")
20851 check(result.index().dtype_name() == "category", "is_categorical_index");
set_multiindex_names (pd_test_2_all.cpp:20775)
20765 // Simulate a 2-level groupby result with composite \x1f keys
20766 using std::string;
20767 string sep(1, '\x1f');
20768 std::vector<string> keys = {"A" + sep + "X", "A" + sep + "Y", "B" + sep + "X", "B" + sep + "Y"};
20769
20770 std::vector<numpy::float64> values = {1.0, 2.0, 3.0, 4.0};
20771 pandas::Series<std::string> by(keys);
20772 pandas::Series<numpy::float64> data(values);
20773
20774 auto sgb = data.groupby(by);
20775 sgb.set_multiindex_names({"level0", "level1"});
20776
20777 // Create a "result" series with composite index
20778 pandas::Series<numpy::float64> result(values);
20779 result.set_index(std::make_unique<pandas::Index<std::string>>(keys));
20780
20781 sgb.apply_result_index(result);
20782
20783 // Should now have a MultiIndex
20784 check(result.has_multiindex(), "has_multiindex");
20785 check(result.multiindex().nlevels() == 2, "nlevels_2");
size (pd_test_1_all.cpp:22)
12 #include "../pandas/pd_boolean_array.h"
13
14 namespace dataframe_tests {
15
16 namespace dataframe_tests_boolean_array {
17 void pd_test_boolean_array_constructors() {
18 std::cout << "========= BooleanArray: constructors ======================= ";
19
20 // Default constructor
21 pandas::BooleanArray arr1;
22 if (arr1.size() != 0) {
23 std::cout << " [FAIL] : in pd_test_boolean_array_constructors() : default constructor size != 0" << std::endl;
24 throw std::runtime_error("pd_test_boolean_array_constructors failed: default constructor size != 0");
25 }
26
27 // Initializer list constructor
28 pandas::BooleanArray arr2({
29 std::optional<bool>(true),
30 std::optional<bool>(false),
31 std::nullopt,
32 std::optional<bool>(true)