diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 60a791d3deaee..9f459f094a9b5 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -806,28 +806,92 @@ def std(col: Column) -> Column:
             bool_to_numeric=True,
         )
 
-    def sum(self) -> FrameLike:
+    def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameLike:
         """
         Compute sum of group values
 
+        .. versionadded:: 3.3.0
+
+        Parameters
+        ----------
+        numeric_only : bool, default True
+            Include only float, int, boolean columns. If None, will attempt to use
+            everything, then use only numeric data.
+            It has no effect since only numeric columns can be supported here.
+
+            .. versionadded:: 3.4.0
+        min_count : int, default 0
+            The required number of valid values to perform the operation.
+            If fewer than min_count non-NA values are present, the result will be NA.
+
+            .. versionadded:: 3.4.0
+
         Examples
         --------
         >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],
-        ...                    "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})
+        ...                    "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]})
 
-        >>> df.groupby("A").sum()
+        >>> df.groupby("A").sum().sort_index()
            B  C
         A
         1  1  6
         2  1  8
 
+        >>> df.groupby("D").sum().sort_index()
+           A  B   C
+        D
+        a  5  2  11
+        b  1  0   3
+
+        >>> df.groupby("D").sum(min_count=3).sort_index()
+             A    B     C
+        D
+        a  5.0  2.0  11.0
+        b  NaN  NaN   NaN
+
+        Notes
+        -----
+        There is a behavior difference between pandas-on-Spark and pandas:
+
+        * when there is a non-numeric aggregation column, it will be ignored
+          even if `numeric_only` is False.
+
         See Also
         --------
         pyspark.pandas.Series.groupby
         pyspark.pandas.DataFrame.groupby
         """
+        if numeric_only is not None and not isinstance(numeric_only, bool):
+            raise TypeError("numeric_only must be None or bool")
+        if not isinstance(min_count, int):
+            raise TypeError("min_count must be an integer")
+
+        if numeric_only is not None and not numeric_only:
+            unsupported = [
+                col.name
+                for col in self._agg_columns
+                if not isinstance(col.spark.data_type, (NumericType, BooleanType))
+            ]
+            if len(unsupported) > 0:
+                log_advice(
+                    "GroupBy.sum() can only support numeric and bool columns even if "
+                    f"numeric_only=False, skipping unsupported columns: {unsupported}"
+                )
+
+        if min_count > 0:
+
+            def sum(col: Column) -> Column:
+                return F.when(
+                    F.count(F.when(~F.isnull(col), F.lit(0))) < min_count, F.lit(None)
+                ).otherwise(F.sum(col))
+
+        else:
+
+            def sum(col: Column) -> Column:
+                return F.sum(col)
+
         return self._reduce_for_stat_function(
-            F.sum, accepted_spark_types=(NumericType,), bool_to_numeric=True
+            sum, accepted_spark_types=(NumericType,), bool_to_numeric=True
         )
 
     # TODO: sync the doc.
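For reviewers: the `min_count` masking in the new `sum` closure can be exercised directly with plain PySpark. Below is a minimal, self-contained sketch of the same expression; the example DataFrame and the `masked_sum` name are illustrative only and not part of the patch.

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Group "a" has two non-NA values of C, group "b" has one.
df = spark.createDataFrame(
    [("a", -1.5), ("a", None), ("a", 0.1), ("b", -3.2)],
    ["D", "C"],
)

min_count = 2
# Same shape as the patch: the inner F.when yields a non-null marker (0)
# only for non-NA rows, so F.count over it equals the group's count of
# valid values; when that count is below min_count, emit NULL instead of
# the sum.
masked_sum = F.when(
    F.count(F.when(~F.isnull(F.col("C")), F.lit(0))) < min_count,
    F.lit(None),
).otherwise(F.sum("C"))

df.groupBy("D").agg(masked_sum.alias("C")).show()
# Group "a" has 2 non-NA values (>= 2), so its sum (-1.4) is kept;
# group "b" has only 1, so its result is NULL.
```

Because `F.count` already ignores NULL inputs, counting the marker column is equivalent to counting the non-NA rows per group, which is exactly what pandas compares `min_count` against.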
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
index 14e699793f318..ba283cc9a0564 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -1413,6 +1413,25 @@ def test_max(self):
         self._test_stat_func(lambda groupby_obj: groupby_obj.max(numeric_only=True))
         self._test_stat_func(lambda groupby_obj: groupby_obj.max(numeric_only=True, min_count=2))
 
+    def test_sum(self):
+        pdf = pd.DataFrame(
+            {
+                "A": ["a", "a", "b", "a"],
+                "B": [1, 2, 1, 2],
+                "C": [-1.5, np.nan, -3.2, 0.1],
+            }
+        )
+        psdf = ps.from_pandas(pdf)
+        self.assert_eq(pdf.groupby("A").sum().sort_index(), psdf.groupby("A").sum().sort_index())
+        self.assert_eq(
+            pdf.groupby("A").sum(min_count=2).sort_index(),
+            psdf.groupby("A").sum(min_count=2).sort_index(),
+        )
+        self.assert_eq(
+            pdf.groupby("A").sum(min_count=3).sort_index(),
+            psdf.groupby("A").sum(min_count=3).sort_index(),
+        )
+
     def test_mad(self):
         self._test_stat_func(lambda groupby_obj: groupby_obj.mad())