narwhals-dev · dangotbanned · Jul 14, 2025 · Jun 15, 2025 · Jun 15, 2025 · Jun 15, 2025
diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py
@@ -2,23 +2,104 @@
 
 import collections
 import warnings
-from typing import TYPE_CHECKING, Any, ClassVar
+from functools import lru_cache
+from itertools import chain
+from operator import methodcaller
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
 
 from narwhals._compliant import EagerGroupBy
 from narwhals._expression_parsing import evaluate_output_names_and_aliases
 from narwhals._pandas_like.utils import select_columns_by_name
 from narwhals._utils import find_stacklevel
 
 if TYPE_CHECKING:
-    from collections.abc import Iterator, Mapping, Sequence
+    from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
+
+    import pandas as pd
+    from pandas.api.typing import DataFrameGroupBy as _NativeGroupBy
+    from typing_extensions import TypeAlias, Unpack
 
     from narwhals._compliant.group_by import NarwhalsAggregation
+    from narwhals._compliant.typing import ScalarKwargs
     from narwhals._pandas_like.dataframe import PandasLikeDataFrame
     from narwhals._pandas_like.expr import PandasLikeExpr
 
+    NativeGroupBy: TypeAlias = "_NativeGroupBy[tuple[str, ...], Literal[True]]"
+
+# Signature of the per-group callable handed to `DataFrameGroupBy.apply`
+# (this is the return type of `PandasLikeGroupBy._apply_exprs`).
+# NOTE(review): `TypeAlias` is imported only under `TYPE_CHECKING`, so these
+# module-level annotations must never be evaluated at runtime - presumably the
+# file starts with `from __future__ import annotations` (not visible here); confirm.
+NativeApply: TypeAlias = "Callable[[pd.DataFrame], pd.Series[Any]]"
+# Subset of the native aggregation names that is singled out as inefficient.
+# NOTE(review): why these two are considered inefficient is not shown here - confirm.
+InefficientNativeAggregation: TypeAlias = Literal["cov", "skew"]
+# Names of pandas group-by aggregation methods; used as the value type of
+# `PandasLikeGroupBy._REMAP_AGGS` and as the input of `_agg_func`.
+NativeAggregation: TypeAlias = Literal[
+    "any",
+    "all",
+    "count",
+    "first",
+    "idxmax",
+    "idxmin",
+    "last",
+    "max",
+    "mean",
+    "median",
+    "min",
+    "nunique",
+    "prod",
+    "quantile",
+    "sem",
+    "size",
+    "std",
+    "sum",
+    "var",
+    # Nested `Literal` alias: its members ("cov", "skew") are part of this `Literal` too.
+    InefficientNativeAggregation,
+]
+"""https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#built-in-aggregation-methods"""
+
+# Either the name of a native aggregation or an arbitrary callable
+# (`_agg_func` returns a `methodcaller` for the callable case).
+_AggFunc: TypeAlias = "NativeAggregation | Callable[..., Any]"
+"""Equivalent to `pd.NamedAgg.aggfunc`."""
+
+# `_named_aggs` builds these as `(output_name, aggfunc)` pairs.
+_NamedAgg: TypeAlias = "tuple[str, _AggFunc]"
+"""Equivalent to `pd.NamedAgg`."""
+
+
+@lru_cache(maxsize=32)
+def _agg_func(
+    name: NativeAggregation, /, **kwds: Unpack[ScalarKwargs]
+) -> _AggFunc:  # pragma: no cover
+    """Return the aggregation to use for `name`, given the expression's scalar kwargs.
+
+    The result is either `name` itself (a plain string) or an
+    `operator.methodcaller` that calls the method called `name` with keyword
+    arguments. Results are memoized by `lru_cache` (at most 32 entries), so
+    `name` and every value in `kwds` must be hashable.
+
+    Arguments:
+        name: Native aggregation method name (positional-only).
+        **kwds: Keyword arguments to forward to the native method.
+
+    Returns:
+        `name` when no keyword arguments are needed, otherwise a `methodcaller`.
+    """
+    if name == "nunique":
+        # `kwds` are ignored for "nunique"; `dropna=False` is always passed.
+        return methodcaller(name, dropna=False)
+    # NOTE(review): `ddof == 1` is presumably the native default, so the bare
+    # name is enough - confirm. Any *other* keys in `kwds` are also dropped
+    # whenever `ddof == 1`; verify that is intended.
+    if not kwds or kwds.get("ddof") == 1:
+        return name
+    return methodcaller(name, **kwds)
+
+
+def _named_aggs(
+    gb: PandasLikeGroupBy, /, expr: PandasLikeExpr, exclude: Sequence[str]
+) -> Iterator[tuple[str, _NamedAgg]]:  # pragma: no cover
+    """Yield `(alias, (output_name, aggfunc))` pairs for a single expression.
+
+    Arguments:
+        gb: Group-by object (positional-only); supplies `compliant` and the
+            `_leaf_name` / `_remap_expr_name` helpers.
+        expr: Expression to translate.
+        exclude: Passed through to `evaluate_output_names_and_aliases`.
+
+    Yields:
+        One pair per `(output_name, alias)` produced for `expr`.
+    """
+    output_names, aliases = evaluate_output_names_and_aliases(expr, gb.compliant, exclude)
+    function_name = gb._remap_expr_name(gb._leaf_name(expr))
+    # Resolved once: every output column of `expr` shares the same aggregation.
+    aggfunc = _agg_func(function_name, **expr._scalar_kwargs)
+    for output_name, alias in zip(output_names, aliases):
+        yield alias, (output_name, aggfunc)
+
+
+def named_aggs(
+    gb: PandasLikeGroupBy, *exprs: PandasLikeExpr, exclude: Sequence[str]
+) -> dict[str, _NamedAgg]:  # pragma: no cover
+    """**Very early draft** for named agg-like input.
 
-class PandasLikeGroupBy(EagerGroupBy["PandasLikeDataFrame", "PandasLikeExpr", str]):
-    _REMAP_AGGS: ClassVar[Mapping[NarwhalsAggregation, Any]] = {
+    Ignoring most special-casing for now, just trying to work out the right shape.
+
+    The idea would be using this like:
+
+        df.groupby(...).agg(**named_aggs(..., ..., exclude=...)).reset_index()
+
+    Looks entirely different to the current `PandasLikeGroupBy` 🤔
+
+    Returns a mapping of output alias to `(column name, aggregation)`, built by
+    chaining `_named_aggs` over every expression in `exprs`. When two expressions
+    produce the same alias, the later entry replaces the earlier one.
+    """
+    return dict(chain.from_iterable(_named_aggs(gb, expr, exclude) for expr in exprs))
+
+
+class PandasLikeGroupBy(
+    EagerGroupBy["PandasLikeDataFrame", "PandasLikeExpr", NativeAggregation]
+):
+    _REMAP_AGGS: ClassVar[Mapping[NarwhalsAggregation, NativeAggregation]] = {
         "sum": "sum",
         "mean": "mean",
         "median": "median",
@@ -51,17 +132,21 @@ def __init__(
         else:
             native_frame = self.compliant.native
 
-        self._grouped = native_frame.groupby(
+        self._grouped: NativeGroupBy = native_frame.groupby(
             list(self._keys),
             sort=False,
             as_index=True,
             dropna=drop_null_keys,
             observed=True,
         )
 
+    # NOTE: Still have *quite* a bit of work to do here!
+    # -------------------------------------------------------
+    # NOTE: `C901`      Too complex                 (25 > 10)
+    # NOTE: `PLR0912`   Too many branches           (28 > 12)
+    # NOTE: `PLR0914`   Too many local variables    (27 > 15)
+    # NOTE: `PLR0915`   Too many statements         (83 > 50)
     def agg(self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:  # noqa: C901, PLR0912, PLR0914, PLR0915
-        implementation = self.compliant._implementation
-        backend_version = self.compliant._backend_version
         new_names: list[str] = self._keys.copy()
 
         all_aggs_are_simple = True
@@ -76,8 +161,8 @@ def agg(self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:  # noqa: C901, PLR
         # We need to do this separately from the rest so that we
         # can pass the `dropna` kwargs.
         nunique_aggs: dict[str, str] = {}
-        simple_aggs: dict[str, list[str]] = collections.defaultdict(list)
-        simple_aggs_functions: set[str] = set()
+        simple_aggs: dict[str, list[NativeAggregation]] = collections.defaultdict(list)
+        simple_aggs_functions: set[NativeAggregation] = set()
 
         # ddof to (output_names, aliases) mapping
         std_aggs: dict[int, tuple[list[str], list[str]]] = collections.defaultdict(
@@ -126,10 +211,11 @@ def agg(self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:  # noqa: C901, PLR
                         simple_agg_new_names.append(alias)
                         simple_aggs_functions.add(function_name)
 
-            result_aggs = []
+            result_aggs: list[pd.DataFrame] = []
 
             if simple_aggs:
                 # Fast path for single aggregation such as `df.groupby(...).mean()`
+                result_simple_aggs: pd.DataFrame
                 if (
                     len(simple_aggs_functions) == 1
                     and (agg_method := simple_aggs_functions.pop()) != "size"
@@ -138,24 +224,22 @@ def agg(self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:  # noqa: C901, PLR
                     result_simple_aggs = getattr(
                         self._grouped[list(simple_aggs.keys())], agg_method
                     )()
-                    result_simple_aggs.columns = [
+                    result_simple_aggs.columns = [  # type: ignore[assignment]
                         f"{a}_{agg_method}" for a in result_simple_aggs.columns
                     ]
                 else:
-                    result_simple_aggs = self._grouped.agg(simple_aggs)
-                    result_simple_aggs.columns = [
-                        f"{a}_{b}" for a, b in result_simple_aggs.columns
+                    result_simple_aggs = self._grouped.agg(simple_aggs)  # type: ignore[arg-type]
+                    result_simple_aggs.columns = [  # type: ignore[assignment,misc]
+                        f"{a}_{b}"  # type: ignore[has-type]
+                        for a, b in result_simple_aggs.columns
                     ]
                 if not (
                     set(result_simple_aggs.columns) == set(expected_old_names)
                     and len(result_simple_aggs.columns) == len(expected_old_names)
                 ):  # pragma: no cover
-                    msg = (
-                        f"Safety assertion failed, expected {expected_old_names} "
-                        f"got {result_simple_aggs.columns}, "
-                        "please report a bug at https://github.com/narwhals-dev/narwhals/issues"
+                    raise safety_assertion_error(
+                        expected_old_names, result_simple_aggs.columns
                     )
-                    raise AssertionError(msg)
 
                 # Rename columns, being very careful
                 expected_old_names_indices: dict[str, list[int]] = (
@@ -167,30 +251,31 @@ def agg(self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:  # noqa: C901, PLR
                     expected_old_names_indices[item].pop(0)
                     for item in result_simple_aggs.columns
                 ]
-                result_simple_aggs.columns = [simple_agg_new_names[i] for i in index_map]
+                result_simple_aggs.columns = [simple_agg_new_names[i] for i in index_map]  # type: ignore[assignment]
                 result_aggs.append(result_simple_aggs)
 
             if nunique_aggs:
                 result_nunique_aggs = self._grouped[list(nunique_aggs.values())].nunique(
                     dropna=False
                 )
-                result_nunique_aggs.columns = list(nunique_aggs.keys())
+                result_nunique_aggs.columns = list(nunique_aggs.keys())  # type: ignore[assignment]
 
                 result_aggs.append(result_nunique_aggs)
 
             if std_aggs:
                 for ddof, (std_output_names, std_aliases) in std_aggs.items():
                     _aggregation = self._grouped[std_output_names].std(ddof=ddof)
                     # `_aggregation` is a new object so it's OK to operate inplace.
-                    _aggregation.columns = std_aliases
+                    _aggregation.columns = std_aliases  # type: ignore[assignment]
                     result_aggs.append(_aggregation)
             if var_aggs:
                 for ddof, (var_output_names, var_aliases) in var_aggs.items():
                     _aggregation = self._grouped[var_output_names].var(ddof=ddof)
                     # `_aggregation` is a new object so it's OK to operate inplace.
-                    _aggregation.columns = var_aliases
+                    _aggregation.columns = var_aliases  # type: ignore[assignment]
                     result_aggs.append(_aggregation)
 
+            result: pd.DataFrame
             if result_aggs:
                 output_names_counter = collections.Counter(
                     c for frame in result_aggs for c in frame
@@ -211,61 +296,53 @@ def agg(self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame:  # noqa: C901, PLR
                 result = self.compliant.__native_namespace__().DataFrame(
                     list(self._grouped.groups.keys()), columns=self._keys
                 )
-            # Keep inplace=True to avoid making a redundant copy.
-            # This may need updating, depending on https://github.com/pandas-dev/pandas/pull/51466/files
-            result.reset_index(inplace=True)  # noqa: PD002
-            return self.compliant._with_native(
-                select_columns_by_name(result, new_names, backend_version, implementation)
-            ).rename(dict(zip(self._keys, self._output_key_names)))
+            return self._select_results(result, new_names)
 
         if self.compliant.native.empty:
-            # Don't even attempt this, it's way too inconsistent across pandas versions.
-            msg = (
-                "No results for group-by aggregation.\n\n"
-                "Hint: you were probably trying to apply a non-elementary aggregation with a "
-                "pandas-like API.\n"
-                "Please rewrite your query such that group-by aggregations "
-                "are elementary. For example, instead of:\n\n"
-                "    df.group_by('a').agg(nw.col('b').round(2).mean())\n\n"
-                "use:\n\n"
-                "    df.with_columns(nw.col('b').round(2)).group_by('a').agg(nw.col('b').mean())\n\n"
-            )
-            raise ValueError(msg)
-
-        warnings.warn(
-            "Found complex group-by expression, which can't be expressed efficiently with the "
-            "pandas API. If you can, please rewrite your query such that group-by aggregations "
-            "are simple (e.g. mean, std, min, max, ...). \n\n"
-            "Please see: "
-            "https://narwhals-dev.github.io/narwhals/concepts/improve_group_by_operation/",
-            UserWarning,
-            stacklevel=find_stacklevel(),
+            raise empty_results_error()
+        return self._agg_complex(exprs, new_names)
+
+    def _select_results(
+        self, df: pd.DataFrame, /, new_names: list[str]
+    ) -> PandasLikeDataFrame:
+        """Convert a native aggregation result into the compliant frame returned by `agg`.
+
+        Steps, in order: reset the index of `df`, keep only the `new_names`
+        columns, wrap the result with `compliant._with_native`, then rename the
+        `self._keys` columns to `self._output_key_names`.
+
+        Arguments:
+            df: Native aggregation result (positional-only). It is modified
+                **in place** by the index reset, so the caller's object changes.
+            new_names: Columns to select, passed to `select_columns_by_name`.
+        """
+        compliant = self.compliant
+        # NOTE: Keep `inplace=True` to avoid making a redundant copy.
+        # This may need updating, depending on https://github.com/pandas-dev/pandas/pull/51466/files
+        df.reset_index(inplace=True)  # noqa: PD002
+        native = select_columns_by_name(
+            df, new_names, compliant._backend_version, compliant._implementation
         )
+        # Pairs each internal key name with its user-facing output name.
+        rename = dict(zip(self._keys, self._output_key_names))
+        return compliant._with_native(native).rename(rename)
 
-        def func(df: Any) -> Any:
+    def _agg_complex(
+        self, exprs: Iterable[PandasLikeExpr], new_names: list[str]
+    ) -> PandasLikeDataFrame:
+        """Aggregate via `DataFrameGroupBy.apply`, after warning about the complex path.
+
+        Emits the complex group-by warning, builds a per-group callable with
+        `_apply_exprs`, applies it to `self._grouped`, and passes the result to
+        `_select_results`.
+
+        Arguments:
+            exprs: Expressions to evaluate on every group.
+            new_names: Columns to keep in the final result.
+        """
+        warn_complex_group_by()
+        implementation = self.compliant._implementation
+        backend_version = self.compliant._backend_version
+        func = self._apply_exprs(exprs)
+        if implementation.is_pandas() and backend_version >= (2, 2):
+            # NOTE(review): presumably `include_groups` is only accepted by
+            # pandas >= 2.2, hence the version gate - confirm against pandas docs.
+            result = self._grouped.apply(func, include_groups=False)
+        else:  # pragma: no cover
+            result = self._grouped.apply(func)
+        return self._select_results(result, new_names)
+
+    def _apply_exprs(self, exprs: Iterable[PandasLikeExpr]) -> NativeApply:
+        ns = self.compliant.__narwhals_namespace__()
+        into_series = ns._series.from_iterable
+
+        def fn(df: pd.DataFrame) -> pd.Series[Any]:
             out_group = []
             out_names = []
             for expr in exprs:
                 results_keys = expr(self.compliant._with_native(df))
 def func(df: Any) -> Any: 
     out_group = [] 
     out_names = [] 
     for expr in exprs: 
         results_keys = expr(self.compliant._with_native(df)) 
         for result_keys in results_keys: 
             out_group.append(result_keys.native.iloc[0]) 
             out_names.append(result_keys.name) 
     ns = self.compliant.__narwhals_namespace__() 
     return ns._series.from_iterable(out_group, index=out_names, context=ns).native 
 def _apply_exprs(self, exprs: Iterable[PandasLikeExpr]) -> NativeApply: 
     ns = self.compliant.__narwhals_namespace__() 
     into_series = ns._series.from_iterable 
     def fn(df: pd.DataFrame) -> pd.Series[Any]: 
         out_group = [] 
         out_names = [] 
         for expr in exprs: 
             results_keys = expr(self.compliant._with_native(df)) 
             for keys in results_keys: 
                 out_group.append(keys.native.iloc[0]) 
                 out_names.append(keys.name) 
         return into_series(out_group, index=out_names, context=ns).native 
     return fn 
 def func(df: Any) -> Any: 
     out_group = [] 
     out_names = [] 
     for expr in exprs: 
         results_keys = expr(self.compliant._with_native(df)) 
         for result_keys in results_keys: 
             out_group.append(result_keys.native.iloc[0]) 
             out_names.append(result_keys.name) 
     ns = self.compliant.__narwhals_namespace__() 
     return ns._series.from_iterable(out_group, index=out_names, context=ns).native 
 def _apply_exprs(self, exprs: Iterable[PandasLikeExpr]) -> NativeApply: 
     ns = self.compliant.__narwhals_namespace__() 
     into_series = ns._series.from_iterable 
  
     def fn(df: pd.DataFrame) -> pd.Series[Any]: 
         out_group = [] 
         out_names = [] 
         for expr in exprs: 
             results_keys = expr(self.compliant._with_native(df)) 
             for keys in results_keys: 
                 out_group.append(keys.native.iloc[0]) 
                 out_names.append(keys.name) 
         return into_series(out_group, index=out_names, context=ns).native 
  
     return fn 
-                for result_keys in results_keys:
-                    out_group.append(result_keys.native.iloc[0])
-                    out_names.append(result_keys.name)
-            ns = self.compliant.__narwhals_namespace__()
-            return ns._series.from_iterable(out_group, index=out_names, context=ns).native
-
-        if implementation.is_pandas() and backend_version >= (2, 2):
-            result_complex = self._grouped.apply(func, include_groups=False)
-        else:  # pragma: no cover
-            result_complex = self._grouped.apply(func)
+                for keys in results_keys:
+                    out_group.append(keys.native.iloc[0])
+                    out_names.append(keys.name)
+            return into_series(out_group, index=out_names, context=ns).native
 
-        # Keep inplace=True to avoid making a redundant copy.
-        # This may need updating, depending on https://github.com/pandas-dev/pandas/pull/51466/files
-        result_complex.reset_index(inplace=True)  # noqa: PD002
-        return self.compliant._with_native(
-            select_columns_by_name(
-                result_complex, new_names, backend_version, implementation
-            )
-        ).rename(dict(zip(self._keys, self._output_key_names)))
+        return fn
 
     def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]:
         with warnings.catch_warnings():
@@ -280,3 +357,41 @@ def __iter__(self) -> Iterator[tuple[Any, PandasLikeDataFrame]]:
                     key,
                     self.compliant._with_native(group).simple_select(*self._df.columns),
                 )
+
+
+def safety_assertion_error(
+    old_names: Sequence[str], new_names: Sequence[str] | pd.Index[str]
+) -> AssertionError:  # pragma: no cover
+    """Build (but do not raise) the `AssertionError` for mismatched column names.
+
+    Arguments:
+        old_names: Names that were expected; shown after "expected" in the message.
+        new_names: Names that were actually found; shown after "got" in the message.
+
+    Returns:
+        The exception instance; the caller is responsible for raising it.
+    """
+    msg = (
+        f"Safety assertion failed, expected {old_names} "
+        f"got {new_names}, "
+        "please report a bug at https://github.com/narwhals-dev/narwhals/issues"
+    )
+    return AssertionError(msg)
+
+
+def empty_results_error() -> ValueError:
+    """Build (but do not raise) the `ValueError` for an empty group-by result.
+
+    Don't even attempt this, it's way too inconsistent across pandas versions.
+
+    The message hints that a non-elementary aggregation was probably used and
+    shows how to rewrite the query. The caller is responsible for raising the
+    returned exception.
+    """
+    msg = (
+        "No results for group-by aggregation.\n\n"
+        "Hint: you were probably trying to apply a non-elementary aggregation with a "
+        "pandas-like API.\n"
+        "Please rewrite your query such that group-by aggregations "
+        "are elementary. For example, instead of:\n\n"
+        "    df.group_by('a').agg(nw.col('b').round(2).mean())\n\n"
+        "use:\n\n"
+        "    df.with_columns(nw.col('b').round(2)).group_by('a').agg(nw.col('b').mean())\n\n"
+    )
+    return ValueError(msg)
+
+
+def warn_complex_group_by() -> None:
+    """Emit a `UserWarning` that a complex group-by expression was found.
+
+    The warning asks the user to rewrite the query with simple aggregations
+    and links to the narwhals documentation page on the topic.
+    """
+    warnings.warn(
+        "Found complex group-by expression, which can't be expressed efficiently with the "
+        "pandas API. If you can, please rewrite your query such that group-by aggregations "
+        "are simple (e.g. mean, std, min, max, ...). \n\n"
+        "Please see: "
+        "https://narwhals-dev.github.io/narwhals/concepts/improve_group_by_operation/",
+        UserWarning,
+        # NOTE(review): presumably picks a stack level that points at user code
+        # rather than at narwhals internals - confirm in `find_stacklevel`.
+        stacklevel=find_stacklevel(),
+    )