From ff661aed496d5f6100dd3cb9373a38dfcb16a280 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 20:34:31 +0100 Subject: [PATCH 001/101] chore: Add `CompliantExpr.first` Towards (#2526) --- narwhals/_compliant/expr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py index cd089a25ae..0b977dbe66 100644 --- a/narwhals/_compliant/expr.py +++ b/narwhals/_compliant/expr.py @@ -160,6 +160,7 @@ def cum_max(self, *, reverse: bool) -> Self: ... def cum_prod(self, *, reverse: bool) -> Self: ... def is_in(self, other: Any) -> Self: ... def sort(self, *, descending: bool, nulls_last: bool) -> Self: ... + def first(self) -> Self: ... def rank(self, method: RankMethod, *, descending: bool) -> Self: ... def replace_strict( self, From 1b77bd79d85f1445cb610ffffcfc56d3ab043a63 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 20:38:23 +0100 Subject: [PATCH 002/101] feat: "Implement" `PolarsExpr.First` --- narwhals/_polars/expr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/narwhals/_polars/expr.py b/narwhals/_polars/expr.py index c46aede652..e618621601 100644 --- a/narwhals/_polars/expr.py +++ b/narwhals/_polars/expr.py @@ -281,6 +281,7 @@ def struct(self) -> PolarsExprStructNamespace: diff: Method[Self] drop_nulls: Method[Self] fill_null: Method[Self] + first: Method[Self] gather_every: Method[Self] head: Method[Self] is_finite: Method[Self] From e84cba38b88456e146b9fa457be194fc634337f3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 20:41:41 +0100 Subject: [PATCH 003/101] feat: Add `EagerExpr.first` --- narwhals/_compliant/expr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py index 0b977dbe66..9e5bbceb52 100644 --- a/narwhals/_compliant/expr.py +++ 
b/narwhals/_compliant/expr.py @@ -852,6 +852,9 @@ def func(df: EagerDataFrameT) -> Sequence[EagerSeriesT]: context=self, ) + def first(self) -> Self: + return self._reuse_series("first", returns_scalar=True) + @property def cat(self) -> EagerExprCatNamespace[Self]: return EagerExprCatNamespace(self) From 25ef2415ddd0be46c0a2d15801ba9aeb280cf5e7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 20:44:48 +0100 Subject: [PATCH 004/101] chore: Repeat for `*Series` --- narwhals/_compliant/series.py | 1 + narwhals/_polars/series.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/narwhals/_compliant/series.py b/narwhals/_compliant/series.py index f1c9451346..b51abedd13 100644 --- a/narwhals/_compliant/series.py +++ b/narwhals/_compliant/series.py @@ -177,6 +177,7 @@ def fill_null( limit: int | None, ) -> Self: ... def filter(self, predicate: Any) -> Self: ... + def first(self) -> Any: ... def gather_every(self, n: int, offset: int) -> Self: ... 
@unstable def hist( diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index b75b238147..ccb847a446 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -83,6 +83,7 @@ "drop_nulls", "fill_null", "filter", + "first", "gather_every", "head", "is_between", @@ -662,6 +663,7 @@ def struct(self) -> PolarsSeriesStructNamespace: drop_nulls: Method[Self] fill_null: Method[Self] filter: Method[Self] + first: Method[Any] gather_every: Method[Self] head: Method[Self] is_between: Method[Self] From 78822aafdf4ba70afc5869150fb38706f2f0a795 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 20:57:52 +0100 Subject: [PATCH 005/101] feat: Add `(Arrow|PandasLike)Series.first()` --- narwhals/_arrow/series.py | 3 +++ narwhals/_pandas_like/series.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 83a478b5a8..d15bbe3936 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -315,6 +315,9 @@ def filter(self, predicate: ArrowSeries | list[bool | None]) -> Self: other_native = predicate return self._with_native(self.native.filter(other_native)) + def first(self, *, _return_py_scalar: bool = True) -> Any: + return maybe_extract_py_scalar(self.native.getitem(0), _return_py_scalar) + def mean(self, *, _return_py_scalar: bool = True) -> float: return maybe_extract_py_scalar(pc.mean(self.native), _return_py_scalar) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 005c1cfb46..b1e3a7257c 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -381,6 +381,9 @@ def filter(self, predicate: Any) -> PandasLikeSeries: other_native = predicate return self._with_native(self.native.loc[other_native]).alias(self.name) + def first(self) -> Any: + return self.native.iloc[0] + def __eq__(self, other: object) -> PandasLikeSeries: # type: ignore[override] ser, 
other = align_and_extract_native(self, other) return self._with_native(ser == other).alias(self.name) From 4075c50f2496ab9908b25dc15e240650bc686dc0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 21:00:04 +0100 Subject: [PATCH 006/101] chore: Mark `LazyExpr.first` as `not_implemented` for now See https://github.com/narwhals-dev/narwhals/issues/2526#issuecomment-2869123455 --- narwhals/_compliant/expr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py index 9e5bbceb52..05a794d096 100644 --- a/narwhals/_compliant/expr.py +++ b/narwhals/_compliant/expr.py @@ -897,6 +897,7 @@ class LazyExpr( # type: ignore[misc] gather_every: not_implemented = not_implemented() replace_strict: not_implemented = not_implemented() cat: not_implemented = not_implemented() # pyright: ignore[reportAssignmentType] + first: not_implemented = not_implemented() @classmethod def _is_expr(cls, obj: Self | Any) -> TypeIs[Self]: From 45f24b9904009081cb165345056cb0d3a1986b78 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 21:08:55 +0100 Subject: [PATCH 007/101] feat: Add `SparkLikeExpr.first` --- narwhals/_spark_like/expr.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index a9d38213a1..189937bbea 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -568,6 +568,12 @@ def _clip_both( _clip_both, lower_bound=lower_bound, upper_bound=upper_bound ) + def first(self) -> Self: + def fn(_input: Column) -> Column: + return self._F.first(_input, ignorenulls=False) + + return self._with_callable(fn) + def is_finite(self) -> Self: def _is_finite(_input: Column) -> Column: # A value is finite if it's not NaN, and not infinite, while NULLs should be From 4041dd18f6539c3483bd11f94cc4dc1939934a89 Mon Sep 17 00:00:00 2001 
From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 21:13:30 +0100 Subject: [PATCH 008/101] feat: Add `DuckDBExpr.first` https://duckdb.org/docs/stable/sql/functions/aggregates#firstarg --- narwhals/_duckdb/expr.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index 2cc0d797f8..75e34bc13a 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -408,6 +408,12 @@ def _clip_both( _clip_both, lower_bound=lower_bound, upper_bound=upper_bound ) + def first(self) -> Self: + def fn(_input: duckdb.Expression) -> duckdb.Expression: + return FunctionExpression("first", _input) + + return self._with_callable(fn) + def sum(self) -> Self: return self._with_callable(lambda _input: FunctionExpression("sum", _input)) From bb9912d76927b527962d400159549be8328a094e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 21:24:14 +0100 Subject: [PATCH 009/101] feat: Add `DaskExpr.first` - Less sure about this one - `head(1)` also seemed like an option --- narwhals/_dask/expr.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index d5c8ff6081..63648f5b58 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -663,6 +663,12 @@ def is_finite(self) -> Self: return self._with_callable(da.isfinite, "is_finite") + def first(self) -> Self: + def fn(_input: dx.Series) -> dx.Series: + return _input[0].to_series() + + return self._with_callable(fn, "first") + @property def str(self) -> DaskExprStringNamespace: return DaskExprStringNamespace(self) From 6a53aa1a42baf260167714b831bd3ebdaa78522d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 21:25:41 +0100 Subject: [PATCH 010/101] revert: 4075c50f2496ab9908b25dc15e240650bc686dc0 All have *an* implementation now --- 
narwhals/_compliant/expr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py index 05a794d096..9e5bbceb52 100644 --- a/narwhals/_compliant/expr.py +++ b/narwhals/_compliant/expr.py @@ -897,7 +897,6 @@ class LazyExpr( # type: ignore[misc] gather_every: not_implemented = not_implemented() replace_strict: not_implemented = not_implemented() cat: not_implemented = not_implemented() # pyright: ignore[reportAssignmentType] - first: not_implemented = not_implemented() @classmethod def _is_expr(cls, obj: Self | Any) -> TypeIs[Self]: From 4efc939b73d1b6c92e6ce97b223a5b8fa183b414 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 22:26:35 +0100 Subject: [PATCH 011/101] feat: Add `nw.Series.first` --- narwhals/series.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/narwhals/series.py b/narwhals/series.py index 1f3c1881bb..36cd42d4a6 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -801,6 +801,25 @@ def clip( ) ) + def first(self) -> Any: + """Get the first element of the Series. + + Returns: + A scalar value or `None` if the Series is empty. + + Examples: + >>> import polars as pl + >>> import narwhals as nw + >>> + >>> s_native = pl.Series([1, 2, 3]) + >>> s_nw = nw.from_native(s_native, series_only=True) + >>> s_nw.first() + 1 + >>> s_nw.filter(s_nw > 5).first() is None + True + """ + return self._compliant_series.first() + def is_in(self, other: Any) -> Self: """Check if the elements of this Series are in the other sequence. 
From fc149c1845722d816ee1689e82294d9e2144cbfe Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 22:40:14 +0100 Subject: [PATCH 012/101] test: Add `Series.first` tests --- tests/expr_and_series/first_test.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 tests/expr_and_series/first_test.py diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py new file mode 100644 index 0000000000..0c6a75d6d8 --- /dev/null +++ b/tests/expr_and_series/first_test.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +import narwhals as nw +from tests.utils import assert_equal_data + +if TYPE_CHECKING: + from tests.utils import ConstructorEager + +data = {"a": [8, 2, 1], "b": [58, 5, 6], "c": [2.5, 1.0, 3.0]} + + +@pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) +def test_first_series( + constructor_eager: ConstructorEager, col: str, expected: float +) -> None: + series = nw.from_native(constructor_eager(data), eager_only=True)[col] + result = series.first() + assert_equal_data({col: [result]}, {col: [expected]}) + + +def test_first_series_empty(constructor_eager: ConstructorEager) -> None: + series = nw.from_native(constructor_eager(data), eager_only=True)["a"] + series = series.filter(series > 50) + result = series.first() + assert result is None From 7489e61b92fd8f918c278fac2fb5cfce63aad026 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 22:41:53 +0100 Subject: [PATCH 013/101] fix: I guess the stubs were wrong then? 
--- narwhals/_arrow/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index d15bbe3936..2fa82cbe5a 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -316,7 +316,7 @@ def filter(self, predicate: ArrowSeries | list[bool | None]) -> Self: return self._with_native(self.native.filter(other_native)) def first(self, *, _return_py_scalar: bool = True) -> Any: - return maybe_extract_py_scalar(self.native.getitem(0), _return_py_scalar) + return maybe_extract_py_scalar(self.native[0], _return_py_scalar) def mean(self, *, _return_py_scalar: bool = True) -> float: return maybe_extract_py_scalar(pc.mean(self.native), _return_py_scalar) From d2719a40e4334531f2a1a9d210df646a5a83343c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 22:48:20 +0100 Subject: [PATCH 014/101] fix: Handle the out-of-bounds case https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.first.html --- narwhals/_arrow/series.py | 3 ++- narwhals/_pandas_like/series.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 2fa82cbe5a..4f4ba029f8 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -316,7 +316,8 @@ def filter(self, predicate: ArrowSeries | list[bool | None]) -> Self: return self._with_native(self.native.filter(other_native)) def first(self, *, _return_py_scalar: bool = True) -> Any: - return maybe_extract_py_scalar(self.native[0], _return_py_scalar) + result = self.native[0] if len(self.native) else None + return maybe_extract_py_scalar(result, _return_py_scalar) def mean(self, *, _return_py_scalar: bool = True) -> float: return maybe_extract_py_scalar(pc.mean(self.native), _return_py_scalar) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index b1e3a7257c..82f397d0cf 100644 --- 
a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -382,7 +382,7 @@ def filter(self, predicate: Any) -> PandasLikeSeries: return self._with_native(self.native.loc[other_native]).alias(self.name) def first(self) -> Any: - return self.native.iloc[0] + return self.native.iloc[0] if len(self.native) else None def __eq__(self, other: object) -> PandasLikeSeries: # type: ignore[override] ser, other = align_and_extract_native(self, other) From 0af11dbb90a90c5f6dfb2700ec5b82be1ecf9ba2 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 23:02:52 +0100 Subject: [PATCH 015/101] fix: `polars` backcompat https://github.com/pola-rs/polars/pull/19093 --- narwhals/_polars/series.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index ccb847a446..9d6fc56dfd 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -83,7 +83,6 @@ "drop_nulls", "fill_null", "filter", - "first", "gather_every", "head", "is_between", @@ -612,6 +611,13 @@ def hist( # noqa: C901, PLR0912 def to_polars(self) -> pl.Series: return self.native + def first(self) -> Any: + if self._backend_version >= (1, 1): + return self.native.first() + elif len(self): + return self.native.item(0) + return None + @property def dt(self) -> PolarsSeriesDateTimeNamespace: return PolarsSeriesDateTimeNamespace(self) @@ -663,7 +669,6 @@ def struct(self) -> PolarsSeriesStructNamespace: drop_nulls: Method[Self] fill_null: Method[Self] filter: Method[Self] - first: Method[Any] gather_every: Method[Self] head: Method[Self] is_between: Method[Self] From afe20f055c0539a83e8588a82cafd58f049db21d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 23:03:43 +0100 Subject: [PATCH 016/101] docs: Add `Series.first` --- docs/api-reference/series.md | 1 + 1 file changed, 1 insertion(+) diff 
--git a/docs/api-reference/series.md b/docs/api-reference/series.md index 95a6539af6..c79b548dc8 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -28,6 +28,7 @@ - ewm_mean - fill_null - filter + - first - gather_every - head - hist From 6c0bd6facc4cc1d3520a0cf92d271ceae0cfb5fd Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 23:06:25 +0100 Subject: [PATCH 017/101] lol version typo https://github.com/narwhals-dev/narwhals/actions/runs/14949511597/job/41997113260?pr=2528 --- narwhals/_polars/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 9d6fc56dfd..5ff41bfb81 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -612,7 +612,7 @@ def to_polars(self) -> pl.Series: return self.native def first(self) -> Any: - if self._backend_version >= (1, 1): + if self._backend_version >= (1, 10): return self.native.first() elif len(self): return self.native.item(0) From e0fdf78866b147a2eb8c45318bed37d52173b5c4 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 10 May 2025 23:13:30 +0100 Subject: [PATCH 018/101] cov https://github.com/narwhals-dev/narwhals/actions/runs/14949533546/job/41997163953?pr=2528 --- narwhals/_polars/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 5ff41bfb81..e24f07a867 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -614,9 +614,9 @@ def to_polars(self) -> pl.Series: def first(self) -> Any: if self._backend_version >= (1, 10): return self.native.first() - elif len(self): + elif len(self): # pragma: no cover return self.native.item(0) - return None + return None # pragma: no cover @property def dt(self) -> PolarsSeriesDateTimeNamespace: From 
aa7c5104aedf2d8ea41f1e65414cebca0b3e2300 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 12:11:14 +0100 Subject: [PATCH 019/101] chore: Add `nw.Expr.first` The description is exactly https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.first.html --- docs/api-reference/expr.md | 1 + narwhals/expr.py | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index e0f7b65788..bc0690ed62 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -23,6 +23,7 @@ - ewm_mean - fill_null - filter + - first - gather_every - head - clip diff --git a/narwhals/expr.py b/narwhals/expr.py index a1abfba7f7..aacbc2c2ee 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1965,6 +1965,16 @@ def clip( ), ) + def first(self) -> Self: + """Get the first value. + + Returns: + A new expression. + """ + return self._with_orderable_aggregation( + lambda plx: self._to_compliant_expr(plx).first() + ) + def mode(self) -> Self: r"""Compute the most occurring value(s). From bd4ab89c55f749d214ac5bd82d365d9c686483b8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 13:26:12 +0100 Subject: [PATCH 020/101] feat: Maybe `SparkLike` requires `order_by`? 
https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083291874 --- narwhals/_spark_like/expr.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 8fe3192c29..3c6715591d 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -561,10 +561,12 @@ def _clip_both( ) def first(self) -> Self: - def fn(_input: Column) -> Column: - return self._F.first(_input, ignorenulls=False) + def fn(inputs: WindowInputs) -> Column: + return self._F.first(inputs.expr, ignorenulls=False).over( + self.partition_by(inputs).orderBy(*self._sort(inputs)) + ) - return self._with_callable(fn) + return self._with_window_function(fn) def is_finite(self) -> Self: def _is_finite(_input: Column) -> Column: From 9f7f5a9098f83d67b5097dd8e898c219f93709bf Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 14:30:12 +0100 Subject: [PATCH 021/101] test: Try out eager backends Still need to add `dask`, `duckdb` equivalent of (https://github.com/narwhals-dev/narwhals/pull/2528/commits/bd4ab89c55f749d214ac5bd82d365d9c686483b8) --- tests/expr_and_series/first_test.py | 43 +++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index 0c6a75d6d8..ccfa12ae97 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -1,6 +1,8 @@ from __future__ import annotations from typing import TYPE_CHECKING +from typing import Mapping +from typing import Sequence import pytest @@ -8,14 +10,20 @@ from tests.utils import assert_equal_data if TYPE_CHECKING: + from narwhals.typing import PythonLiteral from tests.utils import ConstructorEager -data = {"a": [8, 2, 1], "b": [58, 5, 6], "c": [2.5, 1.0, 3.0]} +data = { + "a": [8, 2, 1, None], + "b": [58, 5, 6, 12], + "c": [2.5, 1.0, 3.0, 0.9], + "d": [2, 
1, 4, 3], +} @pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) def test_first_series( - constructor_eager: ConstructorEager, col: str, expected: float + constructor_eager: ConstructorEager, col: str, expected: PythonLiteral ) -> None: series = nw.from_native(constructor_eager(data), eager_only=True)[col] result = series.first() @@ -27,3 +35,34 @@ def test_first_series_empty(constructor_eager: ConstructorEager) -> None: series = series.filter(series > 50) result = series.first() assert result is None + + +@pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) +def test_first_expr_eager( + constructor_eager: ConstructorEager, col: str, expected: PythonLiteral +) -> None: + df = nw.from_native(constructor_eager(data)) + expr = nw.col(col).first() + result = df.select(expr) + assert_equal_data(result, {col: [expected]}) + + +@pytest.mark.parametrize( + "expected", + [{"a": [8], "c": [2.5]}, {"d": [2], "b": [58]}, {"c": [2.5], "a": [8], "d": [2]}], +) +def test_first_expr_eager_expand( + constructor_eager: ConstructorEager, expected: Mapping[str, Sequence[PythonLiteral]] +) -> None: + df = nw.from_native(constructor_eager(data)) + expr = nw.col(expected).first() + result = df.select(expr) + assert_equal_data(result, expected) + + +def test_first_expr_eager_expand_sort(constructor_eager: ConstructorEager) -> None: + df = nw.from_native(constructor_eager(data)) + expr = nw.col("d", "a", "b", "c").first() + result = df.sort("d").select(expr) + expected = {"d": [1], "a": [2], "b": [5], "c": [1.0]} + assert_equal_data(result, expected) From 7146f60208afe764fed841c671d559fd69f45681 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 19:01:58 +0100 Subject: [PATCH 022/101] =?UTF-8?q?test:=20Add=20mostly=20broken=20lazy=20?= =?UTF-8?q?tests=20=F0=9F=98=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083584575 --- tests/expr_and_series/first_test.py | 40 ++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index ccfa12ae97..8ed420771b 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -11,13 +11,15 @@ if TYPE_CHECKING: from narwhals.typing import PythonLiteral + from tests.utils import Constructor from tests.utils import ConstructorEager -data = { +data: dict[str, list[PythonLiteral]] = { "a": [8, 2, 1, None], "b": [58, 5, 6, 12], "c": [2.5, 1.0, 3.0, 0.9], "d": [2, 1, 4, 3], + "idx": [0, 1, 2, 3], } @@ -47,6 +49,42 @@ def test_first_expr_eager( assert_equal_data(result, {col: [expected]}) +@pytest.mark.skip( + reason="https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083557149" +) +@pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) +def test_first_expr_lazy_select( + constructor: Constructor, col: str, expected: PythonLiteral +) -> None: + frame = nw.from_native(constructor(data)) + expr = nw.col(col).first().over(order_by="idx") + result = frame.select(expr) + assert_equal_data(result, {col: [expected]}) + + +@pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) +def test_first_expr_lazy_with_columns( + constructor: Constructor, + col: str, + expected: PythonLiteral, + request: pytest.FixtureRequest, +) -> None: + if any(x in str(constructor) for x in ("pyarrow_table", "pandas")): + request.applymarker( + pytest.mark.xfail( + reason="Some kind of index error, see https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083582828" + ) + ) + if any(x in str(constructor) for x in ("dask", "duckdb")): + request.applymarker(pytest.mark.xfail(reason="Need to add the over support")) + + frame = nw.from_native(constructor(data)) + expr = 
nw.col(col).first().over(order_by="idx").alias("result") + result = frame.with_columns(expr).select("result") + expected_broadcast = len(data[col]) * [expected] + assert_equal_data(result, {"result": expected_broadcast}) + + @pytest.mark.parametrize( "expected", [{"a": [8], "c": [2.5]}, {"d": [2], "b": [58]}, {"c": [2.5], "a": [8], "d": [2]}], From 8c24e6e29a7a705899335a784f248a9f4c285b3b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 20:10:33 +0100 Subject: [PATCH 023/101] feat: `duckdb` support? https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083291874 --- narwhals/_duckdb/expr.py | 10 +++++++--- tests/expr_and_series/first_test.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index 75e34bc13a..dd832dc2a3 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -408,11 +408,15 @@ def _clip_both( _clip_both, lower_bound=lower_bound, upper_bound=upper_bound ) + @requires.backend_version((1, 3)) def first(self) -> Self: - def fn(_input: duckdb.Expression) -> duckdb.Expression: - return FunctionExpression("first", _input) + def fn(window_inputs: WindowInputs) -> duckdb.Expression: + order_by = generate_order_by_sql(*window_inputs.order_by, ascending=True) + partition_by = generate_partition_by_sql(*window_inputs.partition_by) + window = f"({partition_by} {order_by})" + return SQLExpression(f"first({window_inputs.expr}) over {window}") - return self._with_callable(fn) + return self._with_window_function(fn) def sum(self) -> Self: return self._with_callable(lambda _input: FunctionExpression("sum", _input)) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index 8ed420771b..29f1e3adf3 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -75,7 +75,7 @@ def test_first_expr_lazy_with_columns( reason="Some kind of index 
error, see https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083582828" ) ) - if any(x in str(constructor) for x in ("dask", "duckdb")): + if any(x in str(constructor) for x in ("dask",)): request.applymarker(pytest.mark.xfail(reason="Need to add the over support")) frame = nw.from_native(constructor(data)) From 54a4cb47c685ad65897a736cc9623f6677fa209c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 20:32:19 +0100 Subject: [PATCH 024/101] test: Update xfails --- tests/expr_and_series/first_test.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index 29f1e3adf3..b065076c32 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -7,6 +7,8 @@ import pytest import narwhals as nw +from tests.utils import DUCKDB_VERSION +from tests.utils import POLARS_VERSION from tests.utils import assert_equal_data if TYPE_CHECKING: @@ -69,7 +71,7 @@ def test_first_expr_lazy_with_columns( expected: PythonLiteral, request: pytest.FixtureRequest, ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "pandas")): + if any(x in str(constructor) for x in ("pyarrow_table", "pandas", "modin")): request.applymarker( pytest.mark.xfail( reason="Some kind of index error, see https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083582828" @@ -78,6 +80,21 @@ def test_first_expr_lazy_with_columns( if any(x in str(constructor) for x in ("dask",)): request.applymarker(pytest.mark.xfail(reason="Need to add the over support")) + request.applymarker( + pytest.mark.xfail( + ("duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3)), + reason="Needs `SQLExpression`", + raises=NotImplementedError, + ) + ) + request.applymarker( + pytest.mark.xfail( + ("polars" in str(constructor) and POLARS_VERSION < (1, 10)), + reason="Needs `order_by`", + 
raises=NotImplementedError, + ) + ) + frame = nw.from_native(constructor(data)) expr = nw.col(col).first().over(order_by="idx").alias("result") result = frame.with_columns(expr).select("result") From 63e045924e97ea30c070858e0b39c3e353359237 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 20:39:55 +0100 Subject: [PATCH 025/101] fix: Use `head(1)` in `DaskExpr` https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083596257 --- narwhals/_dask/expr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 63648f5b58..8705701ef0 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -665,7 +665,7 @@ def is_finite(self) -> Self: def first(self) -> Self: def fn(_input: dx.Series) -> dx.Series: - return _input[0].to_series() + return _input.head(1) return self._with_callable(fn, "first") From 9493aad754543154bcfdb0c21d24b530146f3de7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 20:40:45 +0100 Subject: [PATCH 026/101] ignore cov https://github.com/narwhals-dev/narwhals/actions/runs/14959039229/job/42018537389?pr=2528 --- tests/expr_and_series/first_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index b065076c32..3ac2ff8f15 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -57,7 +57,7 @@ def test_first_expr_eager( @pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) def test_first_expr_lazy_select( constructor: Constructor, col: str, expected: PythonLiteral -) -> None: +) -> None: # pragma: no cover frame = nw.from_native(constructor(data)) expr = nw.col(col).first().over(order_by="idx") result = frame.select(expr) From 88535a4ef2a57d7a64bbe3b934f7bca5003a5df7 Mon Sep 17 00:00:00 
2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 20:48:56 +0100 Subject: [PATCH 027/101] Apply suggestion Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- tests/expr_and_series/first_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index 3ac2ff8f15..3d2bcf7a8c 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -71,7 +71,7 @@ def test_first_expr_lazy_with_columns( expected: PythonLiteral, request: pytest.FixtureRequest, ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "pandas", "modin")): + if any(x in str(constructor) for x in ("pyarrow_table", "pandas", "modin", "cudf")): request.applymarker( pytest.mark.xfail( reason="Some kind of index error, see https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083582828" From 77ae9c0300efa126a39bf5812832397e32fc4394 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 20:54:08 +0100 Subject: [PATCH 028/101] test: Remove dask `xfail` It still fails, but I'd rather it show in CI --- tests/expr_and_series/first_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index 3d2bcf7a8c..a808de3a4e 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -77,8 +77,6 @@ def test_first_expr_lazy_with_columns( reason="Some kind of index error, see https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083582828" ) ) - if any(x in str(constructor) for x in ("dask",)): - request.applymarker(pytest.mark.xfail(reason="Need to add the over support")) request.applymarker( pytest.mark.xfail( From c1a61734c79a212b19dbbf50bd3ba61c82793ab4 Mon Sep 17 00:00:00 2001 From: dangotbanned 
<125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 21:58:52 +0100 Subject: [PATCH 029/101] revert: Remove `dask` implementation Tried but not worth the stress - other support is more important --- narwhals/_dask/expr.py | 7 +------ tests/expr_and_series/first_test.py | 7 +++++++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 8705701ef0..d1ce0b1e70 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -663,12 +663,6 @@ def is_finite(self) -> Self: return self._with_callable(da.isfinite, "is_finite") - def first(self) -> Self: - def fn(_input: dx.Series) -> dx.Series: - return _input.head(1) - - return self._with_callable(fn, "first") - @property def str(self) -> DaskExprStringNamespace: return DaskExprStringNamespace(self) @@ -680,3 +674,4 @@ def dt(self) -> DaskExprDateTimeNamespace: list = not_implemented() # pyright: ignore[reportAssignmentType] struct = not_implemented() # pyright: ignore[reportAssignmentType] rank = not_implemented() # pyright: ignore[reportAssignmentType] + first = not_implemented() diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index a808de3a4e..899e9ccbf9 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -92,6 +92,13 @@ def test_first_expr_lazy_with_columns( raises=NotImplementedError, ) ) + request.applymarker( + pytest.mark.xfail( + ("dask" in str(constructor)), + reason="Abandoned https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083585895", + raises=NotImplementedError, + ) + ) frame = nw.from_native(constructor(data)) expr = nw.col(col).first().over(order_by="idx").alias("result") From 3c4ff9b0af3200b7a31aa4d22ea81e4c57a32d80 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 11 May 2025 22:04:55 +0100 Subject: [PATCH 030/101] refactor(typing): Use `PythonLiteral` for `Series` return 
Resolves (https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083298573) --- narwhals/_arrow/series.py | 2 +- narwhals/_compliant/series.py | 3 ++- narwhals/_pandas_like/series.py | 3 ++- narwhals/_polars/series.py | 5 +++-- narwhals/series.py | 3 ++- 5 files changed, 10 insertions(+), 6 deletions(-) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 4f4ba029f8..07c847fc55 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -315,7 +315,7 @@ def filter(self, predicate: ArrowSeries | list[bool | None]) -> Self: other_native = predicate return self._with_native(self.native.filter(other_native)) - def first(self, *, _return_py_scalar: bool = True) -> Any: + def first(self, *, _return_py_scalar: bool = True) -> PythonLiteral: result = self.native[0] if len(self.native) else None return maybe_extract_py_scalar(result, _return_py_scalar) diff --git a/narwhals/_compliant/series.py b/narwhals/_compliant/series.py index b51abedd13..620ababece 100644 --- a/narwhals/_compliant/series.py +++ b/narwhals/_compliant/series.py @@ -49,6 +49,7 @@ from narwhals.typing import MultiIndexSelector from narwhals.typing import NonNestedLiteral from narwhals.typing import NumericLiteral + from narwhals.typing import PythonLiteral from narwhals.typing import RankMethod from narwhals.typing import RollingInterpolationMethod from narwhals.typing import SizedMultiIndexSelector @@ -177,7 +178,7 @@ def fill_null( limit: int | None, ) -> Self: ... def filter(self, predicate: Any) -> Self: ... - def first(self) -> Any: ... + def first(self) -> PythonLiteral: ... def gather_every(self, n: int, offset: int) -> Self: ... 
@unstable def hist( diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 82f397d0cf..b23b3cd10c 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -50,6 +50,7 @@ from narwhals.typing import Into1DArray from narwhals.typing import NonNestedLiteral from narwhals.typing import NumericLiteral + from narwhals.typing import PythonLiteral from narwhals.typing import RankMethod from narwhals.typing import RollingInterpolationMethod from narwhals.typing import SizedMultiIndexSelector @@ -381,7 +382,7 @@ def filter(self, predicate: Any) -> PandasLikeSeries: other_native = predicate return self._with_native(self.native.loc[other_native]).alias(self.name) - def first(self) -> Any: + def first(self) -> PythonLiteral: return self.native.iloc[0] if len(self.native) else None def __eq__(self, other: object) -> PandasLikeSeries: # type: ignore[override] diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index e24f07a867..e60908b1cc 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -38,6 +38,7 @@ from narwhals.series import Series from narwhals.typing import Into1DArray from narwhals.typing import MultiIndexSelector + from narwhals.typing import PythonLiteral from narwhals.typing import _1DArray from narwhals.utils import Version from narwhals.utils import _FullContext @@ -611,11 +612,11 @@ def hist( # noqa: C901, PLR0912 def to_polars(self) -> pl.Series: return self.native - def first(self) -> Any: + def first(self) -> PythonLiteral: if self._backend_version >= (1, 10): return self.native.first() elif len(self): # pragma: no cover - return self.native.item(0) + return self.native.item(0) # type: ignore[no-any-return] return None # pragma: no cover @property diff --git a/narwhals/series.py b/narwhals/series.py index 36cd42d4a6..f2548af00d 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -44,6 +44,7 @@ from narwhals.typing import ClosedInterval from 
narwhals.typing import FillNullStrategy from narwhals.typing import NumericLiteral + from narwhals.typing import PythonLiteral from narwhals.typing import RankMethod from narwhals.typing import RollingInterpolationMethod from narwhals.typing import TemporalLiteral @@ -801,7 +802,7 @@ def clip( ) ) - def first(self) -> Any: + def first(self) -> PythonLiteral: """Get the first element of the Series. Returns: From cd002f38d0266928df608cb3ec4c858f0c7e43a2 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 12 May 2025 12:21:24 +0100 Subject: [PATCH 031/101] test: Add `test_group_by_agg_first` - Expecting **at least** `pyarrow` to be possible - Assuming `pandas` might be, but haven't explored yet --- tests/group_by_test.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index d64b3e386d..8f616b53de 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -4,6 +4,7 @@ from contextlib import nullcontext from typing import Any from typing import Mapping +from typing import Sequence import pandas as pd import pyarrow as pa @@ -591,3 +592,34 @@ def test_renaming_edge_case(constructor: Constructor) -> None: result = nw.from_native(constructor(data)).group_by(nw.col("a")).agg(nw.all().min()) expected = {"a": [0], "_a_tmp": [1], "b": [4]} assert_equal_data(result, expected) + + +@pytest.mark.parametrize( + ("keys", "aggs", "expected", "pre_sort"), + [ + (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 2, 4, 6]}, None), + (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 3, 5, 6]}, {"descending": True}), + ], +) +def test_group_by_agg_first( + constructor_eager: ConstructorEager, + request: pytest.FixtureRequest, + keys: Sequence[str], + aggs: Sequence[str], + expected: Mapping[str, Any], + pre_sort: Mapping[str, Any] | None, +) -> None: + if any( + x in str(constructor_eager) for x in ("pyarrow_table", "pandas", "modin", "cudf") + ): + 
request.applymarker( + pytest.mark.xfail( + reason="Not implemented yet.\n `pyarrow` should be possible." + ) + ) + data = {"a": [1, 2, 2, 3, 3, 4], "b": [1, 2, 3, 4, 5, 6]} + df = nw.from_native(constructor_eager(data)) + if pre_sort: + df = df.sort(aggs, **pre_sort) + result = df.group_by(keys).agg(nw.col(aggs).first()).sort(keys) + assert_equal_data(result, expected) From 1458530ce5d0a96176289347a3b6eb0fd49917c5 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 12 May 2025 13:11:23 +0100 Subject: [PATCH 032/101] feat(DRAFT): Start trying `pyarrow` `agg(first())` - When `first` appears, we'll need to backtrack in `agg` to recreate the `TableGroupBy` w/ `use_threads = False` --- narwhals/_arrow/group_by.py | 5 ++++- narwhals/_compliant/group_by.py | 12 +++++++++++- tests/group_by_test.py | 12 ++++++++---- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index 571d5c18d8..8f6fc0c6da 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -39,6 +39,7 @@ class ArrowGroupBy(EagerGroupBy["ArrowDataFrame", "ArrowExpr", "Aggregation"]): "len": "count", "n_unique": "count_distinct", "count": "count", + "first": "first", } _REMAP_UNIQUE: ClassVar[Mapping[UniqueKeepStrategy, Aggregation]] = { "any": "min", @@ -60,7 +61,7 @@ def __init__( self._grouped = pa.TableGroupBy(self.compliant.native, self._keys) self._drop_null_keys = drop_null_keys - def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: + def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: # noqa: C901 self._ensure_all_simple(exprs) aggs: list[tuple[str, Aggregation, AggregateOptions | None]] = [] expected_pyarrow_column_names: list[str] = self._keys.copy() @@ -90,6 +91,8 @@ def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: option = pc.CountOptions(mode="all") elif function_name == "count": option = pc.CountOptions(mode="only_valid") + elif function_name == "first": + 
option = pc.ScalarAggregateOptions(skip_nulls=False) else: option = None diff --git a/narwhals/_compliant/group_by.py b/narwhals/_compliant/group_by.py index b96211a7ab..c482d9c0fb 100644 --- a/narwhals/_compliant/group_by.py +++ b/narwhals/_compliant/group_by.py @@ -56,7 +56,17 @@ "NativeAggregationT_co", bound="str | Callable[..., Any]", covariant=True ) NarwhalsAggregation: TypeAlias = Literal[ - "sum", "mean", "median", "max", "min", "std", "var", "len", "n_unique", "count" + "sum", + "mean", + "median", + "max", + "min", + "std", + "var", + "len", + "n_unique", + "count", + "first", ] diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 8f616b53de..6f3c42714f 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -609,12 +609,16 @@ def test_group_by_agg_first( expected: Mapping[str, Any], pre_sort: Mapping[str, Any] | None, ) -> None: - if any( - x in str(constructor_eager) for x in ("pyarrow_table", "pandas", "modin", "cudf") - ): + if any(x in str(constructor_eager) for x in ("pandas", "modin", "cudf")): + request.applymarker(pytest.mark.xfail(reason="Not implemented yet.")) + if "pyarrow_table" in str(constructor_eager): + from pyarrow import ArrowNotImplementedError + request.applymarker( pytest.mark.xfail( - reason="Not implemented yet.\n `pyarrow` should be possible." + reason="Need to disable threading if this appears\n" + "Using ordered aggregator in multiple threaded execution is not supported", + raises=ArrowNotImplementedError, ) ) data = {"a": [1, 2, 2, 3, 3, 4], "b": [1, 2, 3, 4, 5, 6]} From 962ebcd962e19c3270ef0ed9c98f4935b205b02f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 12 May 2025 13:28:54 +0100 Subject: [PATCH 033/101] fix: Maybe `pyarrow` support? 
https://github.com/narwhals-dev/narwhals/pull/2528/commits/1458530ce5d0a96176289347a3b6eb0fd49917c5 --- narwhals/_arrow/group_by.py | 11 ++++++++++- tests/group_by_test.py | 10 ---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index 8f6fc0c6da..4c9ee1600f 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -67,6 +67,7 @@ def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: # noqa: C901 expected_pyarrow_column_names: list[str] = self._keys.copy() new_column_names: list[str] = self._keys.copy() exclude = (*self._keys, *self._output_key_names) + grouped = self._grouped for expr in exprs: output_names, aliases = evaluate_output_names_and_aliases( @@ -93,6 +94,14 @@ def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: # noqa: C901 option = pc.CountOptions(mode="only_valid") elif function_name == "first": option = pc.ScalarAggregateOptions(skip_nulls=False) + # NOTE: `pyarrow` defaults to multithreading, but is not compatible with ordered aggs + # If we see any that are ordered, the entire aggregation must disable threading + # `ArrowNotImplementedError: Using ordered aggregator in multiple threaded execution is not supported` + # Need to avoid overwriting `self._grouped`, since we don't want to slow down unrelated `.agg(...)` calls + if grouped._use_threads: + grouped = pa.TableGroupBy( + self.compliant.native, grouped.keys, use_threads=False + ) else: option = None @@ -105,7 +114,7 @@ def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: # noqa: C901 [(output_name, function_name, option) for output_name in output_names] ) - result_simple = self._grouped.aggregate(aggs) + result_simple = grouped.aggregate(aggs) # Rename columns, being very careful expected_old_names_indices: dict[str, list[int]] = collections.defaultdict(list) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 6f3c42714f..a0d192aa9e 100644 --- a/tests/group_by_test.py +++ 
b/tests/group_by_test.py @@ -611,16 +611,6 @@ def test_group_by_agg_first( ) -> None: if any(x in str(constructor_eager) for x in ("pandas", "modin", "cudf")): request.applymarker(pytest.mark.xfail(reason="Not implemented yet.")) - if "pyarrow_table" in str(constructor_eager): - from pyarrow import ArrowNotImplementedError - - request.applymarker( - pytest.mark.xfail( - reason="Need to disable threading if this appears\n" - "Using ordered aggregator in multiple threaded execution is not supported", - raises=ArrowNotImplementedError, - ) - ) data = {"a": [1, 2, 2, 3, 3, 4], "b": [1, 2, 3, 4, 5, 6]} df = nw.from_native(constructor_eager(data)) if pre_sort: From 5d310bc90dd487d7e79746a92dd09e3c5f87a918 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 12 May 2025 14:14:39 +0100 Subject: [PATCH 034/101] refactor: Add `ArrowGroupBy._configure_agg` `agg` was already too complex, but we need to start handling even more now for https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2084603274 --- narwhals/_arrow/group_by.py | 52 +++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index 4c9ee1600f..03dadc4911 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -61,7 +61,35 @@ def __init__( self._grouped = pa.TableGroupBy(self.compliant.native, self._keys) self._drop_null_keys = drop_null_keys - def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: # noqa: C901 + def _configure_agg( + self, grouped: pa.TableGroupBy, expr: ArrowExpr, / + ) -> tuple[pa.TableGroupBy, Aggregation, AggregateOptions | None]: + option: AggregateOptions | None + function_name = self._leaf_name(expr) + if function_name in {"std", "var"}: + option = pc.VarianceOptions(ddof=expr._call_kwargs["ddof"]) + elif function_name in {"len", "n_unique"}: + option = pc.CountOptions(mode="all") + elif function_name == 
"count": + option = pc.CountOptions(mode="only_valid") + elif function_name == "first": + # TODO @dangotbanned: split this into an ordered agg method + # - will need the same for `last` + # - and need to handle compat https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2084603274 + option = pc.ScalarAggregateOptions(skip_nulls=False) + # NOTE: `pyarrow` defaults to multi-threading, but is not compatible with ordered aggs + # If we see any that are ordered, the entire aggregation must disable threading + # `ArrowNotImplementedError: Using ordered aggregator in multiple threaded execution is not supported` + # Need to avoid overwriting `self._grouped`, since we don't want to slow down unrelated `.agg(...)` calls + if grouped._use_threads: + grouped = pa.TableGroupBy( + self.compliant.native, grouped.keys, use_threads=False + ) + else: + option = None + return grouped, self._remap_expr_name(function_name), option + + def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: self._ensure_all_simple(exprs) aggs: list[tuple[str, Aggregation, AggregateOptions | None]] = [] expected_pyarrow_column_names: list[str] = self._keys.copy() @@ -85,27 +113,7 @@ def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: # noqa: C901 aggs.append((self._keys[0], "count", pc.CountOptions(mode="all"))) continue - function_name = self._leaf_name(expr) - if function_name in {"std", "var"}: - option: Any = pc.VarianceOptions(ddof=expr._call_kwargs["ddof"]) - elif function_name in {"len", "n_unique"}: - option = pc.CountOptions(mode="all") - elif function_name == "count": - option = pc.CountOptions(mode="only_valid") - elif function_name == "first": - option = pc.ScalarAggregateOptions(skip_nulls=False) - # NOTE: `pyarrow` defaults to multithreading, but is not compatible with ordered aggs - # If we see any that are ordered, the entire aggregation must disable threading - # `ArrowNotImplementedError: Using ordered aggregator in multiple threaded execution is not supported` - # Need to avoid 
overwriting `self._grouped`, since we don't want to slow down unrelated `.agg(...)` calls - if grouped._use_threads: - grouped = pa.TableGroupBy( - self.compliant.native, grouped.keys, use_threads=False - ) - else: - option = None - - function_name = self._remap_expr_name(function_name) + grouped, function_name, option = self._configure_agg(grouped, expr) new_column_names.extend(aliases) expected_pyarrow_column_names.extend( [f"{output_name}_{function_name}" for output_name in output_names] From a417341262b5dd977f3e2438ccebd8ab76ca79fd Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 12 May 2025 15:38:50 +0100 Subject: [PATCH 035/101] fix: Add `pyarrow` compat for `first` Resolves (https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2084575498) --- narwhals/_arrow/group_by.py | 47 +++++++++++++++++++++++-------------- tests/group_by_test.py | 8 +++++++ 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index 03dadc4911..e7c6d354d4 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -46,6 +46,12 @@ class ArrowGroupBy(EagerGroupBy["ArrowDataFrame", "ArrowExpr", "Aggregation"]): "first": "min", "last": "max", } + _OPTION_COUNT_ALL: ClassVar[frozenset[NarwhalsAggregation]] = frozenset( + ("len", "n_unique") + ) + _OPTION_COUNT_VALID: ClassVar[frozenset[NarwhalsAggregation]] = frozenset(("count",)) + _OPTION_ORDERED: ClassVar[frozenset[NarwhalsAggregation]] = frozenset(("first",)) + _OPTION_VARIANCE: ClassVar[frozenset[NarwhalsAggregation]] = frozenset(("std", "var")) def __init__( self, @@ -64,30 +70,37 @@ def __init__( def _configure_agg( self, grouped: pa.TableGroupBy, expr: ArrowExpr, / ) -> tuple[pa.TableGroupBy, Aggregation, AggregateOptions | None]: - option: AggregateOptions | None + option: AggregateOptions | None = None function_name = self._leaf_name(expr) - if function_name in {"std", "var"}: + if 
function_name in self._OPTION_VARIANCE: option = pc.VarianceOptions(ddof=expr._call_kwargs["ddof"]) - elif function_name in {"len", "n_unique"}: + elif function_name in self._OPTION_COUNT_ALL: option = pc.CountOptions(mode="all") - elif function_name == "count": + elif function_name in self._OPTION_COUNT_VALID: option = pc.CountOptions(mode="only_valid") - elif function_name == "first": - # TODO @dangotbanned: split this into an ordered agg method - # - will need the same for `last` - # - and need to handle compat https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2084603274 - option = pc.ScalarAggregateOptions(skip_nulls=False) + elif function_name in self._OPTION_ORDERED: + grouped, option = self._ordered_agg(grouped) + return grouped, self._remap_expr_name(function_name), option + + def _ordered_agg( + self, grouped: pa.TableGroupBy, / + ) -> tuple[pa.TableGroupBy, AggregateOptions]: + # NOTE: Will need the same for `last` + backend_version = self.compliant._backend_version + if backend_version >= (14, 0) and grouped._use_threads: # NOTE: `pyarrow` defaults to multi-threading, but is not compatible with ordered aggs # If we see any that are ordered, the entire aggregation must disable threading # `ArrowNotImplementedError: Using ordered aggregator in multiple threaded execution is not supported` - # Need to avoid overwriting `self._grouped`, since we don't want to slow down unrelated `.agg(...)` calls - if grouped._use_threads: - grouped = pa.TableGroupBy( - self.compliant.native, grouped.keys, use_threads=False - ) - else: - option = None - return grouped, self._remap_expr_name(function_name), option + # Also need to avoid overwriting `self._grouped`, since we don't want to slow down unrelated `.agg(...)` calls + grouped = pa.TableGroupBy( + self.compliant.native, grouped.keys, use_threads=False + ) + elif backend_version < (14, 0) and backend_version >= (13, 0): # pragma: no cover + # TODO @dangotbanned: Write up an message to suggest upgrading to 
`>=14.0.0` + msg = "https://github.com/apache/arrow/issues/36709" + raise NotImplementedError(msg) + # NOTE: Prior to `13.0.0`, threading was disabled + return grouped, pc.ScalarAggregateOptions(skip_nulls=False) def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: self._ensure_all_simple(exprs) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index a0d192aa9e..9a8f6d3991 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -611,6 +611,14 @@ def test_group_by_agg_first( ) -> None: if any(x in str(constructor_eager) for x in ("pandas", "modin", "cudf")): request.applymarker(pytest.mark.xfail(reason="Not implemented yet.")) + request.applymarker( + pytest.mark.xfail( + "pyarrow_table" in str(constructor_eager) + and (PYARROW_VERSION < (14, 0) and PYARROW_VERSION >= (13, 0)), + reason="https://github.com/apache/arrow/issues/36709", + raises=NotImplementedError, + ) + ) data = {"a": [1, 2, 2, 3, 3, 4], "b": [1, 2, 3, 4, 5, 6]} df = nw.from_native(constructor_eager(data)) if pre_sort: From 354da1a909b995c0019db718b12dcb10d5d9ffe3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 12 May 2025 15:50:48 +0100 Subject: [PATCH 036/101] fix: Don't support below `14` ever > pyarrow.lib.ArrowKeyError: No function registered with name: hash_first https://github.com/narwhals-dev/narwhals/actions/runs/14975157220/job/42065491924?pr=2528 --- narwhals/_arrow/group_by.py | 3 +++ tests/group_by_test.py | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index e7c6d354d4..78d70e3339 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -99,6 +99,9 @@ def _ordered_agg( # TODO @dangotbanned: Write up an message to suggest upgrading to `>=14.0.0` msg = "https://github.com/apache/arrow/issues/36709" raise NotImplementedError(msg) + else: # pragma: no cover + msg = "`hash_first` wasn't available until `13` 
https://arrow.apache.org/docs/12.0/python/compute.html" + raise NotImplementedError(msg) # NOTE: Prior to `13.0.0`, threading was disabled return grouped, pc.ScalarAggregateOptions(skip_nulls=False) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 9a8f6d3991..5759bf8fee 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -613,8 +613,7 @@ def test_group_by_agg_first( request.applymarker(pytest.mark.xfail(reason="Not implemented yet.")) request.applymarker( pytest.mark.xfail( - "pyarrow_table" in str(constructor_eager) - and (PYARROW_VERSION < (14, 0) and PYARROW_VERSION >= (13, 0)), + "pyarrow_table" in str(constructor_eager) and (PYARROW_VERSION < (14, 0)), reason="https://github.com/apache/arrow/issues/36709", raises=NotImplementedError, ) From 0cea41b361234952e971682cedec7517a2359b75 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 12 May 2025 16:40:40 +0100 Subject: [PATCH 037/101] test: Add some `None` cases - `polars` always includes nulls - `pyarrow` skips by default - `pandas` says it skips `NaN` by default - maybe also includes `None`? 
--- tests/group_by_test.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 5759bf8fee..ab694d2375 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -599,6 +599,13 @@ def test_renaming_edge_case(constructor: Constructor) -> None: [ (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 2, 4, 6]}, None), (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 3, 5, 6]}, {"descending": True}), + (["a"], ["c"], {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, None), + ( + ["a"], + ["c"], + {"a": [1, 2, 3, 4], "c": [None, "A", "B", "B"]}, + {"nulls_last": True}, + ), ], ) def test_group_by_agg_first( @@ -618,7 +625,11 @@ def test_group_by_agg_first( raises=NotImplementedError, ) ) - data = {"a": [1, 2, 2, 3, 3, 4], "b": [1, 2, 3, 4, 5, 6]} + data = { + "a": [1, 2, 2, 3, 3, 4], + "b": [1, 2, 3, 4, 5, 6], + "c": [None, "A", "A", None, "B", "B"], + } df = nw.from_native(constructor_eager(data)) if pre_sort: df = df.sort(aggs, **pre_sort) From 5229096661f0bd61bc61286c3639e47e6ac845c7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 12 May 2025 17:27:05 +0100 Subject: [PATCH 038/101] feat(DRAFT): Partial support for `pandas` - Will start a thread to see what the best option is - https://github.com/pandas-dev/pandas/issues/57019 --- narwhals/_pandas_like/group_by.py | 1 + tests/group_by_test.py | 36 +++++++++++++++++++++++++------ tests/utils.py | 5 +++++ 3 files changed, 35 insertions(+), 7 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 14ab7f9c97..4149d8c13e 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -33,6 +33,7 @@ class PandasLikeGroupBy(EagerGroupBy["PandasLikeDataFrame", "PandasLikeExpr", st "len": "size", "n_unique": "nunique", "count": "count", + "first": "first", } def __init__( diff --git a/tests/group_by_test.py 
b/tests/group_by_test.py index ab694d2375..000f8600e9 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -20,6 +20,7 @@ from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data +from tests.utils import is_pandas_like data: Mapping[str, Any] = {"a": [1, 1, 3], "b": [4, 4, 6], "c": [7.0, 8.0, 9.0]} @@ -594,30 +595,51 @@ def test_renaming_edge_case(constructor: Constructor) -> None: assert_equal_data(result, expected) +XFAIL_PANDAS_SKIPNA = pytest.mark.xfail( + reason="Requires `skipna=False`, which was introduced in `2.2.1`.\n" + "https://github.com/pandas-dev/pandas/issues/57019\n" +) + + @pytest.mark.parametrize( - ("keys", "aggs", "expected", "pre_sort"), + ("keys", "aggs", "expected", "pre_sort", "marks"), [ - (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 2, 4, 6]}, None), - (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 3, 5, 6]}, {"descending": True}), - (["a"], ["c"], {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, None), + (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 2, 4, 6]}, None, None), + ( + ["a"], + ["b"], + {"a": [1, 2, 3, 4], "b": [1, 3, 5, 6]}, + {"descending": True}, + None, + ), + ( + ["a"], + ["c"], + {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, + None, + [XFAIL_PANDAS_SKIPNA], + ), ( ["a"], ["c"], {"a": [1, 2, 3, 4], "c": [None, "A", "B", "B"]}, {"nulls_last": True}, + None, ), ], ) def test_group_by_agg_first( constructor_eager: ConstructorEager, - request: pytest.FixtureRequest, keys: Sequence[str], aggs: Sequence[str], expected: Mapping[str, Any], pre_sort: Mapping[str, Any] | None, + marks: Sequence[pytest.MarkDecorator] | None, + request: pytest.FixtureRequest, ) -> None: - if any(x in str(constructor_eager) for x in ("pandas", "modin", "cudf")): - request.applymarker(pytest.mark.xfail(reason="Not implemented yet.")) + if is_pandas_like(constructor_eager) and marks is not None: + for mark in marks: + request.applymarker(mark) request.applymarker( 
pytest.mark.xfail( "pyarrow_table" in str(constructor_eager) and (PYARROW_VERSION < (14, 0)), diff --git a/tests/utils.py b/tests/utils.py index 4469dcf0fe..219ea6f93f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -150,3 +150,8 @@ def windows_has_tzdata() -> bool: # pragma: no cover def is_pyarrow_windows_no_tzdata(constructor: Constructor, /) -> bool: """Skip test on Windows when the tz database is not configured.""" return "pyarrow" in str(constructor) and is_windows() and not windows_has_tzdata() + + +def is_pandas_like(constructor: Constructor) -> bool: + """Return `True` when `constructor` produces a native pandas-like object.""" + return any(x in str(constructor) for x in ("pandas", "modin", "cudf")) From 8d3aaec632d1a95973a573e89f1312a0adc3cdb3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 12 May 2025 18:26:36 +0100 Subject: [PATCH 039/101] docs: Tidy error and comments Resolves (https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2084885160) --- narwhals/_arrow/group_by.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index 78d70e3339..89429d87cd 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -16,6 +16,7 @@ from narwhals._compliant import EagerGroupBy from narwhals._expression_parsing import evaluate_output_names_and_aliases from narwhals.utils import generate_temporary_column_name +from narwhals.utils import requires if TYPE_CHECKING: from narwhals._arrow.dataframe import ArrowDataFrame @@ -79,30 +80,34 @@ def _configure_agg( elif function_name in self._OPTION_COUNT_VALID: option = pc.CountOptions(mode="only_valid") elif function_name in self._OPTION_ORDERED: - grouped, option = self._ordered_agg(grouped) + grouped, option = self._ordered_agg(grouped, function_name) return grouped, self._remap_expr_name(function_name), option def 
_ordered_agg( - self, grouped: pa.TableGroupBy, / + self, grouped: pa.TableGroupBy, name: NarwhalsAggregation, / ) -> tuple[pa.TableGroupBy, AggregateOptions]: - # NOTE: Will need the same for `last` + """The default behavior of `pyarrow` raises when `first` or `last` are used. + + You'd see an error like: + + ArrowNotImplementedError: Using ordered aggregator in multiple threaded execution is not supported + + We need to **disable** multi-threading to use them, but the ability to do so + wasn't possible before `14.0.0` ([pyarrow-36709]) + + [pyarrow-36709]: https://github.com/apache/arrow/issues/36709 + """ backend_version = self.compliant._backend_version if backend_version >= (14, 0) and grouped._use_threads: - # NOTE: `pyarrow` defaults to multi-threading, but is not compatible with ordered aggs - # If we see any that are ordered, the entire aggregation must disable threading - # `ArrowNotImplementedError: Using ordered aggregator in multiple threaded execution is not supported` - # Also need to avoid overwriting `self._grouped`, since we don't want to slow down unrelated `.agg(...)` calls - grouped = pa.TableGroupBy( - self.compliant.native, grouped.keys, use_threads=False + native = self.compliant.native + grouped = pa.TableGroupBy(native, grouped.keys, use_threads=False) + elif backend_version < (14, 0): # pragma: no cover + msg = ( + f"Using `{name}()` in a `group_by().agg(...)` context is only available in 'pyarrow>=14.0.0', " + f"found version {requires._unparse_version(backend_version)!r}.\n\n" + f"See https://github.com/apache/arrow/issues/36709" ) - elif backend_version < (14, 0) and backend_version >= (13, 0): # pragma: no cover - # TODO @dangotbanned: Write up an message to suggest upgrading to `>=14.0.0` - msg = "https://github.com/apache/arrow/issues/36709" - raise NotImplementedError(msg) - else: # pragma: no cover - msg = "`hash_first` wasn't available until `13` https://arrow.apache.org/docs/12.0/python/compute.html" raise 
NotImplementedError(msg) - # NOTE: Prior to `13.0.0`, threading was disabled return grouped, pc.ScalarAggregateOptions(skip_nulls=False) def agg(self, *exprs: ArrowExpr) -> ArrowDataFrame: From ad8e3f78fceb2a745f180e7e1b01dc85788553e3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 13 May 2025 16:05:28 +0100 Subject: [PATCH 040/101] test: xfail `ibis` --- tests/expr_and_series/first_test.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index 899e9ccbf9..d23c335751 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -99,6 +99,13 @@ def test_first_expr_lazy_with_columns( raises=NotImplementedError, ) ) + request.applymarker( + pytest.mark.xfail( + ("ibis" in str(constructor)), + reason="Need to implement, following https://github.com/narwhals-dev/narwhals/pull/2000", + raises=NotImplementedError, + ) + ) frame = nw.from_native(constructor(data)) expr = nw.col(col).first().over(order_by="idx").alias("result") From 628f71e8f146fff8a74b4a716e6f7e187eb226c2 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 13 May 2025 16:32:18 +0100 Subject: [PATCH 041/101] feat: Add `IbisExpr.first` --- narwhals/_ibis/expr.py | 14 +++++++++++++- tests/expr_and_series/first_test.py | 7 ------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/narwhals/_ibis/expr.py b/narwhals/_ibis/expr.py index b776448a29..26e0ab398f 100644 --- a/narwhals/_ibis/expr.py +++ b/narwhals/_ibis/expr.py @@ -374,6 +374,19 @@ def _clip(_input: ir.NumericValue, lower: Any, upper: Any) -> ir.NumericValue: def sum(self) -> Self: return self._with_callable(lambda _input: _input.sum()) + def first(self) -> Self: + def fn(window_inputs: WindowInputs) -> ir.Value: + inputs = window_inputs + order_by = [ibis.asc(by, nulls_first=True) for by in window_inputs.order_by] + 
expr = cast("ir.Column", window_inputs.expr) + if partition_by := inputs.partition_by: # pragma: no cover + window = ibis.window(group_by=list(partition_by), order_by=order_by) + return expr.first(include_null=True).over(window) + else: + return expr.first(order_by=order_by, include_null=True) + + return self._with_window_function(fn) + def n_unique(self) -> Self: return self._with_callable( lambda _input: _input.nunique() + _input.isnull().any().cast("int8") @@ -669,4 +682,3 @@ def struct(self) -> IbisExprStructNamespace: # NOTE: https://github.com/ibis-project/ibis/issues/11176 skew = not_implemented() unique = not_implemented() - first = not_implemented() diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index d23c335751..899e9ccbf9 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -99,13 +99,6 @@ def test_first_expr_lazy_with_columns( raises=NotImplementedError, ) ) - request.applymarker( - pytest.mark.xfail( - ("ibis" in str(constructor)), - reason="Need to implement, following https://github.com/narwhals-dev/narwhals/pull/2000", - raises=NotImplementedError, - ) - ) frame = nw.from_native(constructor(data)) expr = nw.col(col).first().over(order_by="idx").alias("result") From deacc7166400da6a738534bf0f39bc559761ad3d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 13 May 2025 16:39:34 +0100 Subject: [PATCH 042/101] test: Don't xfail for `pandas<1.0.0` https://github.com/narwhals-dev/narwhals/actions/runs/15000709204/job/42146640207?pr=2528 --- tests/group_by_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/group_by_test.py b/tests/group_by_test.py index 7bf9953ed6..dc3039e56c 100644 --- a/tests/group_by_test.py +++ b/tests/group_by_test.py @@ -600,8 +600,9 @@ def test_renaming_edge_case(constructor: Constructor) -> None: XFAIL_PANDAS_SKIPNA = pytest.mark.xfail( + PANDAS_VERSION >= (1,), 
reason="Requires `skipna=False`, which was introduced in `2.2.1`.\n" - "https://github.com/pandas-dev/pandas/issues/57019\n" + "https://github.com/pandas-dev/pandas/issues/57019\n", ) From 652615f134682d8c35725fd84386174b1c8811f6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 13 Jun 2025 15:59:00 +0100 Subject: [PATCH 043/101] fix: Use reverted `partition_by`, `_sort` --- narwhals/_spark_like/expr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 155d10a2bc..db622c4764 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -539,7 +539,9 @@ def _clip_both(expr: Column, lower_bound: Column, upper_bound: Column) -> Column def first(self) -> Self: def fn(inputs: SparkWindowInputs) -> Column: return self._F.first(inputs.expr, ignorenulls=False).over( - self.partition_by(inputs).orderBy(*self._sort(inputs)) + self.partition_by(*inputs.partition_by).orderBy( + *self._sort(*inputs.order_by) + ) ) return self._with_window_function(fn) From ecaca9a8c6053e06e4c1e7d2112a373878aae73b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 15 Jun 2025 11:26:04 +0100 Subject: [PATCH 044/101] fix: Update `DuckDBExpr.first` 1/3 for https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2145293106 --- narwhals/_duckdb/expr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index f143dab160..66595bedd1 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -495,11 +495,12 @@ def _clip_both( @requires.backend_version((1, 3)) def first(self) -> Self: - def fn(inputs: DuckDBWindowInputs) -> Expression: + def fn(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> Sequence[Expression]: order_by = generate_order_by_sql(*inputs.order_by, ascending=True) partition_by = 
generate_partition_by_sql(*inputs.partition_by) window = f"({partition_by} {order_by})" - return SQLExpression(f"first({inputs.expr}) over {window}") + sql = f"first({{expr}}) over {window}" + return [SQLExpression(sql.format(expr=expr)) for expr in self(df)] return self._with_window_function(fn) From ea30f26044a610a5f7911e82d4a17cb54ae6eb12 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 15 Jun 2025 11:26:44 +0100 Subject: [PATCH 045/101] fix: Update `IbisExpr.first` 2/3 for https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2145293106 --- narwhals/_ibis/expr.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/narwhals/_ibis/expr.py b/narwhals/_ibis/expr.py index 7484263456..d31a620b48 100644 --- a/narwhals/_ibis/expr.py +++ b/narwhals/_ibis/expr.py @@ -25,8 +25,10 @@ from narwhals._utils import Implementation, not_implemented if TYPE_CHECKING: + from typing import Union + import ibis.expr.types as ir - from typing_extensions import Self + from typing_extensions import Self, TypeAlias from narwhals._compliant.typing import ( AliasNames, @@ -44,6 +46,10 @@ IbisWindowFunction = WindowFunction[IbisLazyFrame, ir.Value] IbisWindowInputs = WindowInputs[ir.Value] + # NOTE: Needed to avoid `ibis` metaclass shenanigans breaking `__or__` + # > Expected class but received "UnionType` Pylance(reportGeneralTypeIssues) + LegacyWindow: TypeAlias = Union[ir.bl.LegacyWindowBuilder, None] + class IbisExpr(LazyExpr["IbisLazyFrame", "ir.Column"]): _implementation = Implementation.IBIS @@ -374,15 +380,22 @@ def sum(self) -> Self: return self._with_callable(lambda expr: expr.sum().fill_null(lit(0))) def first(self) -> Self: - def fn(inputs: IbisWindowInputs) -> ir.Value: - order_by = [ibis.asc(by, nulls_first=True) for by in inputs.order_by] - expr = cast("ir.Column", inputs.expr) - if partition_by := inputs.partition_by: # pragma: no cover - window = 
ibis.window(group_by=list(partition_by), order_by=order_by) - return expr.first(include_null=True).over(window) + def window( + expr: ir.Value, order_by: Sequence[ir.Column], legacy: LegacyWindow + ) -> ir.Value: + expr = cast("ir.Column", expr) + if legacy: # pragma: no cover + return expr.first(include_null=True).over(legacy) else: return expr.first(order_by=order_by, include_null=True) + def fn(df: IbisLazyFrame, inputs: IbisWindowInputs) -> Sequence[ir.Value]: + legacy: LegacyWindow = None + order_by = list(self._sort(*inputs.order_by)) + if group_by := inputs.partition_by: # pragma: no cover + legacy = ibis.window(group_by=list(group_by), order_by=order_by) + return [window(expr, order_by, legacy) for expr in self(df)] + return self._with_window_function(fn) def n_unique(self) -> Self: From 12987eeed6a55afe6623cb3c4fa651cfdcae0712 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 15 Jun 2025 12:08:33 +0100 Subject: [PATCH 046/101] fix: Update `SparkLikeExpr.first` --- narwhals/_spark_like/expr.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 8d8174da4e..b47303e27a 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -610,12 +610,12 @@ def _clip_both(expr: Column, lower_bound: Column, upper_bound: Column) -> Column ) def first(self) -> Self: - def fn(inputs: SparkWindowInputs) -> Column: - return self._F.first(inputs.expr, ignorenulls=False).over( - self.partition_by(*inputs.partition_by).orderBy( - *self._sort(*inputs.order_by) - ) + def fn(df: SparkLikeLazyFrame, inputs: SparkWindowInputs) -> Sequence[Column]: + first = self._F.first + window = self.partition_by(*inputs.partition_by).orderBy( + *self._sort(*inputs.order_by) ) + return [first(expr, ignorenulls=False).over(window) for expr in self(df)] return self._with_window_function(fn) From 5446095ef1d98cd97d4cd748888a59d52c083645 
Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 15 Jun 2025 12:19:57 +0100 Subject: [PATCH 047/101] test: Update `pandas` xfail The version that worked isn't supported by `narwhals` anymore https://github.com/narwhals-dev/narwhals/actions/runs/15662564393/job/44122396713?pr=2528 --- tests/frame/group_by_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/frame/group_by_test.py b/tests/frame/group_by_test.py index a1acc0e803..5c880cc434 100644 --- a/tests/frame/group_by_test.py +++ b/tests/frame/group_by_test.py @@ -538,10 +538,10 @@ def test_renaming_edge_case(constructor: Constructor) -> None: assert_equal_data(result, expected) +# NOTE: Resolving this in the current `PandasLikeGroupBy` might be too complex XFAIL_PANDAS_SKIPNA = pytest.mark.xfail( - PANDAS_VERSION >= (1,), reason="Requires `skipna=False`, which was introduced in `2.2.1`.\n" - "https://github.com/pandas-dev/pandas/issues/57019\n", + "https://github.com/pandas-dev/pandas/issues/57019\n" ) From f62c0855b15aa366e7b374e052298f687068cb75 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 20 Jun 2025 10:38:11 +0100 Subject: [PATCH 048/101] test: Don't xfail for pandas `1.1.3<=...<1.1.5` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One day I'll work out the right condition 😅 https://github.com/narwhals-dev/narwhals/actions/runs/15775635670/job/44469350175 --- tests/frame/group_by_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/frame/group_by_test.py b/tests/frame/group_by_test.py index 5c880cc434..b32753fcb9 100644 --- a/tests/frame/group_by_test.py +++ b/tests/frame/group_by_test.py @@ -540,8 +540,9 @@ def test_renaming_edge_case(constructor: Constructor) -> None: # NOTE: Resolving this in the current `PandasLikeGroupBy` might be too complex XFAIL_PANDAS_SKIPNA = pytest.mark.xfail( + 
PANDAS_VERSION >= (1, 1, 5), reason="Requires `skipna=False`, which was introduced in `2.2.1`.\n" - "https://github.com/pandas-dev/pandas/issues/57019\n" + "https://github.com/pandas-dev/pandas/issues/57019\n", ) From e72b11548594390a082a2d533d21cbbc275e0a8a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 29 Jun 2025 14:15:08 +0100 Subject: [PATCH 049/101] fix: Upgrade `DuckDBExpr.first` again --- narwhals/_duckdb/expr.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index 2e868c82a1..8b442e77be 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -452,11 +452,10 @@ def _clip_both( @requires.backend_version((1, 3)) def first(self) -> Self: def fn(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> Sequence[Expression]: - order_by = generate_order_by_sql(*inputs.order_by, ascending=True) - partition_by = generate_partition_by_sql(*inputs.partition_by) - window = f"({partition_by} {order_by})" - sql = f"first({{expr}}) over {window}" - return [SQLExpression(sql.format(expr=expr)) for expr in self(df)] + return [ + window_expression(F("first", expr), inputs.partition_by, inputs.order_by) + for expr in self(df) + ] return self._with_window_function(fn) From cb363be37595dd3e73bbb8c21b7bf1b48a8a5479 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 19 Jul 2025 21:27:27 +0000 Subject: [PATCH 050/101] test(DRAFT): Let's start trying to fix pandas Need CI to weed out all the failure cases --- tests/frame/group_by_test.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/frame/group_by_test.py b/tests/frame/group_by_test.py index 83376a2764..7d87f7cd59 100644 --- a/tests/frame/group_by_test.py +++ b/tests/frame/group_by_test.py @@ -641,13 +641,7 @@ def test_group_by_no_preserve_dtype( {"descending": True}, None, ), - ( - ["a"], - ["c"], - 
{"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, - None, - [XFAIL_PANDAS_SKIPNA], - ), + (["a"], ["c"], {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, None, None), ( ["a"], ["c"], From bc80a5f12e26874ce07079b48eb7b14ebdc617a7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 19 Jul 2025 21:40:42 +0000 Subject: [PATCH 051/101] try `pandas>=2.2.1` path https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2085080744 --- narwhals/_pandas_like/group_by.py | 9 ++++++++- tests/frame/group_by_test.py | 20 ++++---------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 6d5c66747f..e6d7532d3c 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -8,7 +8,7 @@ from narwhals._compliant import EagerGroupBy from narwhals._expression_parsing import evaluate_output_names_and_aliases -from narwhals._utils import find_stacklevel +from narwhals._utils import Implementation, find_stacklevel from narwhals.dependencies import is_pandas_like_dataframe if TYPE_CHECKING: @@ -62,6 +62,13 @@ def _native_agg(name: NativeAggregation, /, **kwds: Unpack[ScalarKwargs]) -> _NativeAgg: if name == "nunique": return methodcaller(name, dropna=False) + if name == "first": + pd_version = Implementation.PANDAS._backend_version() + if pd_version >= (2, 2, 1): + return methodcaller(name, skipna=False) + else: + msg = f"Unsupported pandas version {pd_version!r}" + raise NotImplementedError(msg) if not kwds or kwds.get("ddof") == 1: return methodcaller(name) return methodcaller(name, **kwds) diff --git a/tests/frame/group_by_test.py b/tests/frame/group_by_test.py index 7d87f7cd59..95bd7f5cbb 100644 --- a/tests/frame/group_by_test.py +++ b/tests/frame/group_by_test.py @@ -19,7 +19,6 @@ Constructor, ConstructorEager, assert_equal_data, - is_pandas_like, ) if TYPE_CHECKING: @@ -631,23 +630,16 @@ def 
test_group_by_no_preserve_dtype( @pytest.mark.parametrize( - ("keys", "aggs", "expected", "pre_sort", "marks"), + ("keys", "aggs", "expected", "pre_sort"), [ - (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 2, 4, 6]}, None, None), - ( - ["a"], - ["b"], - {"a": [1, 2, 3, 4], "b": [1, 3, 5, 6]}, - {"descending": True}, - None, - ), - (["a"], ["c"], {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, None, None), + (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 2, 4, 6]}, None), + (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 3, 5, 6]}, {"descending": True}), + (["a"], ["c"], {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, None), ( ["a"], ["c"], {"a": [1, 2, 3, 4], "c": [None, "A", "B", "B"]}, {"nulls_last": True}, - None, ), ], ) @@ -657,12 +649,8 @@ def test_group_by_agg_first( aggs: Sequence[str], expected: Mapping[str, Any], pre_sort: Mapping[str, Any] | None, - marks: Sequence[pytest.MarkDecorator] | None, request: pytest.FixtureRequest, ) -> None: - if is_pandas_like(constructor_eager) and marks is not None: - for mark in marks: - request.applymarker(mark) request.applymarker( pytest.mark.xfail( "pyarrow_table" in str(constructor_eager) and (PYARROW_VERSION < (14, 0)), From 14051faa83d53cfa5e9321f363234b68110cc6f3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 19 Jul 2025 21:43:06 +0000 Subject: [PATCH 052/101] allow very old pandas that worked? 
--- narwhals/_pandas_like/group_by.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index e6d7532d3c..2ce3d2b5e5 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -66,6 +66,8 @@ def _native_agg(name: NativeAggregation, /, **kwds: Unpack[ScalarKwargs]) -> _Na pd_version = Implementation.PANDAS._backend_version() if pd_version >= (2, 2, 1): return methodcaller(name, skipna=False) + elif pd_version < (1, 1, 5): + return methodcaller(name) else: msg = f"Unsupported pandas version {pd_version!r}" raise NotImplementedError(msg) From 3d42dcf7e0051aae363c6dff57bbfe845a4ca686 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 19 Jul 2025 22:21:53 +0000 Subject: [PATCH 053/101] test: xfail `pandas[pyarrow]`, `modin[pyarrow]` --- tests/frame/group_by_test.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/frame/group_by_test.py b/tests/frame/group_by_test.py index 95bd7f5cbb..919a2c2f00 100644 --- a/tests/frame/group_by_test.py +++ b/tests/frame/group_by_test.py @@ -634,7 +634,13 @@ def test_group_by_no_preserve_dtype( [ (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 2, 4, 6]}, None), (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 3, 5, 6]}, {"descending": True}), - (["a"], ["c"], {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, None), + pytest.param( + ["a"], + ["c"], + {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, + None, + id="pandas-pyarrow-na-order", + ), ( ["a"], ["c"], @@ -658,6 +664,13 @@ def test_group_by_agg_first( raises=NotImplementedError, ) ) + request.applymarker( + pytest.mark.xfail( + "[pyarrow]" in request.node.name + and "pandas-pyarrow-na-order" in request.node.name, + reason=" marker seems to not respect ordering", + ) + ) data = { "a": [1, 2, 2, 3, 3, 4], "b": [1, 2, 3, 4, 5, 6], From 934d09ee7d28c2c4d9a45620479134b3a46a7813 Mon Sep 17 00:00:00 2001 
From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 20 Jul 2025 12:55:52 +0000 Subject: [PATCH 054/101] Apply suggestion narwhals/_polars/series.py Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- narwhals/_polars/series.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 031b9076b6..07ea1e862c 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -562,11 +562,9 @@ def to_polars(self) -> pl.Series: return self.native def first(self) -> PythonLiteral: - if self._backend_version >= (1, 10): - return self.native.first() - elif len(self): # pragma: no cover - return self.native.item(0) # type: ignore[no-any-return] - return None # pragma: no cover + if self._backend_version < (1, 10): # pragma: no cover + return self.native.item(0) if len(self) else None + return self.native.first() @property def dt(self) -> PolarsSeriesDateTimeNamespace: From 801a7a807000b31145dba9c8f606a72c9a3627bc Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 20 Jul 2025 17:29:12 +0000 Subject: [PATCH 055/101] docs: Be more explicit on WIP `pandas` (https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217722493), (https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217888841) --- narwhals/_pandas_like/group_by.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index f699a63d30..7bc0a1b2f1 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -63,13 +63,18 @@ def _native_agg(name: NativeAggregation, /, **kwds: Unpack[ScalarKwargs]) -> _Na if name == "nunique": return methodcaller(name, dropna=False) if name == "first": + # TODO @dangotbanned: `modin` support + # TODO @dangotbanned: `cuDF` support + # 
https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217722493 pd_version = Implementation.PANDAS._backend_version() if pd_version >= (2, 2, 1): return methodcaller(name, skipna=False) elif pd_version < (1, 1, 5): return methodcaller(name) else: - msg = f"Unsupported pandas version {pd_version!r}" + # TODO @dangotbanned: Figure out if there is anything we can do for `pandas>=1.1.5;<2.2.1` + # https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2085080744 + msg = f"TODO: Unhandled pandas version {pd_version!r}" raise NotImplementedError(msg) if not kwds or kwds.get("ddof") == 1: return methodcaller(name) From 47bfaba9201f8ea53e92bbb9c459192d7200b74c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 20 Jul 2025 17:32:32 +0000 Subject: [PATCH 056/101] docs: Link to long explanation See https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217791263 --- narwhals/_ibis/expr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/narwhals/_ibis/expr.py b/narwhals/_ibis/expr.py index 977fd35bd9..e0625e8cc2 100644 --- a/narwhals/_ibis/expr.py +++ b/narwhals/_ibis/expr.py @@ -41,8 +41,9 @@ IbisWindowFunction = WindowFunction[IbisLazyFrame, ir.Value] IbisWindowInputs = WindowInputs[ir.Value] - # NOTE: Needed to avoid `ibis` metaclass shenanigans breaking `__or__` + # NOTE: Needed to avoid `ibis` metaclass shenanigans breaking `ir.bl.LegacyWindowBuilder.__or__` # > Expected class but received "UnionType` Pylance(reportGeneralTypeIssues) + # See https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217791263 LegacyWindow: TypeAlias = Union[ir.bl.LegacyWindowBuilder, None] From 4618d016c4003265820423235d14710d193fd7f5 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 20 Jul 2025 18:33:45 +0000 Subject: [PATCH 057/101] revert: remove lazy support See 
https://github.com/narwhals-dev/narwhals/issues/2526#issuecomment-3019303816 --- narwhals/_compliant/expr.py | 1 + narwhals/_dask/expr.py | 1 - narwhals/_duckdb/expr.py | 10 ------ narwhals/_ibis/expr.py | 27 +--------------- narwhals/_spark_like/expr.py | 10 ------ tests/expr_and_series/first_test.py | 49 +++++++---------------------- 6 files changed, 14 insertions(+), 84 deletions(-) diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py index bb9dc1fe4b..2b48b6f33e 100644 --- a/narwhals/_compliant/expr.py +++ b/narwhals/_compliant/expr.py @@ -923,6 +923,7 @@ class LazyExpr( gather_every: not_implemented = not_implemented() replace_strict: not_implemented = not_implemented() cat: not_implemented = not_implemented() # type: ignore[assignment] + first: not_implemented = not_implemented() @property def _backend_version(self) -> tuple[int, ...]: diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index d0120004cc..ffeac3f404 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -671,4 +671,3 @@ def dt(self) -> DaskExprDateTimeNamespace: window_function = not_implemented() # pyright: ignore[reportAssignmentType] _from_elementwise_horizontal_op = not_implemented() _with_binary = not_implemented() - first = not_implemented() diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index a924d0884b..a9209a5b6c 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -436,16 +436,6 @@ def _clip_both( _clip_both, lower_bound=lower_bound, upper_bound=upper_bound ) - @requires.backend_version((1, 3)) - def first(self) -> Self: - def fn(df: DuckDBLazyFrame, inputs: DuckDBWindowInputs) -> Sequence[Expression]: - return [ - window_expression(F("first", expr), inputs.partition_by, inputs.order_by) - for expr in self(df) - ] - - return self._with_window_function(fn) - def sum(self) -> Self: def f(expr: Expression) -> Expression: return CoalesceOperator(F("sum", expr), lit(0)) diff --git a/narwhals/_ibis/expr.py 
b/narwhals/_ibis/expr.py index e0625e8cc2..936dc7e6b1 100644 --- a/narwhals/_ibis/expr.py +++ b/narwhals/_ibis/expr.py @@ -20,10 +20,9 @@ if TYPE_CHECKING: from collections.abc import Iterable, Iterator, Sequence - from typing import Union import ibis.expr.types as ir - from typing_extensions import Self, TypeAlias + from typing_extensions import Self from narwhals._compliant.typing import ( AliasNames, @@ -41,11 +40,6 @@ IbisWindowFunction = WindowFunction[IbisLazyFrame, ir.Value] IbisWindowInputs = WindowInputs[ir.Value] - # NOTE: Needed to avoid `ibis` metaclass shenanigans breaking `ir.bl.LegacyWindowBuilder.__or__` - # > Expected class but received "UnionType` Pylance(reportGeneralTypeIssues) - # See https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217791263 - LegacyWindow: TypeAlias = Union[ir.bl.LegacyWindowBuilder, None] - class IbisExpr(LazyExpr["IbisLazyFrame", "ir.Column"]): _implementation = Implementation.IBIS @@ -324,25 +318,6 @@ def _clip( def sum(self) -> Self: return self._with_callable(lambda expr: expr.sum().fill_null(lit(0))) - def first(self) -> Self: - def window( - expr: ir.Value, order_by: Sequence[ir.Column], legacy: LegacyWindow - ) -> ir.Value: - expr = cast("ir.Column", expr) - if legacy: # pragma: no cover - return expr.first(include_null=True).over(legacy) - else: - return expr.first(order_by=order_by, include_null=True) - - def fn(df: IbisLazyFrame, inputs: IbisWindowInputs) -> Sequence[ir.Value]: - legacy: LegacyWindow = None - order_by = list(self._sort(*inputs.order_by)) - if group_by := inputs.partition_by: # pragma: no cover - legacy = ibis.window(group_by=list(group_by), order_by=order_by) - return [window(expr, order_by, legacy) for expr in self(df)] - - return self._with_window_function(fn) - def n_unique(self) -> Self: return self._with_callable( lambda expr: expr.nunique() + expr.isnull().any().cast("int8") diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 4078f30ed4..a358a7d6d5 
100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -590,16 +590,6 @@ def _clip_both(expr: Column, lower_bound: Column, upper_bound: Column) -> Column _clip_both, lower_bound=lower_bound, upper_bound=upper_bound ) - def first(self) -> Self: - def fn(df: SparkLikeLazyFrame, inputs: SparkWindowInputs) -> Sequence[Column]: - first = self._F.first - window = self.partition_by(*inputs.partition_by).orderBy( - *self._sort(*inputs.order_by) - ) - return [first(expr, ignorenulls=False).over(window) for expr in self(df)] - - return self._with_window_function(fn) - def is_finite(self) -> Self: def _is_finite(expr: Column) -> Column: # A value is finite if it's not NaN, and not infinite, while NULLs should be diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index 87ac5d692e..58d427388a 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -5,13 +5,13 @@ import pytest import narwhals as nw -from tests.utils import DUCKDB_VERSION, POLARS_VERSION, assert_equal_data +from tests.utils import POLARS_VERSION, assert_equal_data if TYPE_CHECKING: from collections.abc import Mapping, Sequence from narwhals.typing import PythonLiteral - from tests.utils import Constructor, ConstructorEager + from tests.utils import ConstructorEager data: dict[str, list[PythonLiteral]] = { "a": [8, 2, 1, None], @@ -39,7 +39,7 @@ def test_first_series_empty(constructor_eager: ConstructorEager) -> None: @pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) -def test_first_expr_eager( +def test_first_expr_select( constructor_eager: ConstructorEager, col: str, expected: PythonLiteral ) -> None: df = nw.from_native(constructor_eager(data)) @@ -48,27 +48,16 @@ def test_first_expr_eager( assert_equal_data(result, {col: [expected]}) -@pytest.mark.skip( - reason="https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083557149" -) -@pytest.mark.parametrize(("col", 
"expected"), [("a", 8), ("b", 58), ("c", 2.5)]) -def test_first_expr_lazy_select( - constructor: Constructor, col: str, expected: PythonLiteral -) -> None: # pragma: no cover - frame = nw.from_native(constructor(data)) - expr = nw.col(col).first().over(order_by="idx") - result = frame.select(expr) - assert_equal_data(result, {col: [expected]}) - - @pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) -def test_first_expr_lazy_with_columns( - constructor: Constructor, +def test_first_expr_with_columns( + constructor_eager: ConstructorEager, col: str, expected: PythonLiteral, request: pytest.FixtureRequest, ) -> None: - if any(x in str(constructor) for x in ("pyarrow_table", "pandas", "modin", "cudf")): + if any( + x in str(constructor_eager) for x in ("pyarrow_table", "pandas", "modin", "cudf") + ): request.applymarker( pytest.mark.xfail( reason="Some kind of index error, see https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083582828" @@ -77,27 +66,13 @@ def test_first_expr_lazy_with_columns( request.applymarker( pytest.mark.xfail( - ("duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3)), - reason="Needs `SQLExpression`", - raises=NotImplementedError, - ) - ) - request.applymarker( - pytest.mark.xfail( - ("polars" in str(constructor) and POLARS_VERSION < (1, 10)), + ("polars" in str(constructor_eager) and POLARS_VERSION < (1, 10)), reason="Needs `order_by`", raises=NotImplementedError, ) ) - request.applymarker( - pytest.mark.xfail( - ("dask" in str(constructor)), - reason="Abandoned https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083585895", - raises=NotImplementedError, - ) - ) - frame = nw.from_native(constructor(data)) + frame = nw.from_native(constructor_eager(data)) expr = nw.col(col).first().over(order_by="idx").alias("result") result = frame.with_columns(expr).select("result") expected_broadcast = len(data[col]) * [expected] @@ -108,7 +83,7 @@ def test_first_expr_lazy_with_columns( "expected", 
[{"a": [8], "c": [2.5]}, {"d": [2], "b": [58]}, {"c": [2.5], "a": [8], "d": [2]}], ) -def test_first_expr_eager_expand( +def test_first_expr_expand( constructor_eager: ConstructorEager, expected: Mapping[str, Sequence[PythonLiteral]] ) -> None: df = nw.from_native(constructor_eager(data)) @@ -117,7 +92,7 @@ def test_first_expr_eager_expand( assert_equal_data(result, expected) -def test_first_expr_eager_expand_sort(constructor_eager: ConstructorEager) -> None: +def test_first_expr_expand_sort(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data)) expr = nw.col("d", "a", "b", "c").first() result = df.sort("d").select(expr) From b77d2b3687c44ba83dfc7e36a4e1b2c9f78e1508 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 21 Jul 2025 16:54:59 +0000 Subject: [PATCH 058/101] try `nth` for `>=1.1.5; <2.0.0` Would cover most of the versions, appears to have been this regressing that lead to `first` getting `skipna` https://pandas.pydata.org/pandas-docs/stable/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations --- narwhals/_pandas_like/group_by.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 7bc0a1b2f1..8cff28c46f 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -71,6 +71,10 @@ def _native_agg(name: NativeAggregation, /, **kwds: Unpack[ScalarKwargs]) -> _Na return methodcaller(name, skipna=False) elif pd_version < (1, 1, 5): return methodcaller(name) + elif pd_version < (2, 0, 0): + # NOTE: https://pandas.pydata.org/pandas-docs/stable/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations + # https://github.com/pandas-dev/pandas/issues/57019#issuecomment-1905038446 + return methodcaller("nth", n=0) else: # TODO @dangotbanned: Figure out if there is anything we can do for 
`pandas>=1.1.5;<2.2.1` # https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2085080744 From 2b0bc16361129c9e0ecc10863402b7246440ef04 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 21 Jul 2025 18:15:16 +0000 Subject: [PATCH 059/101] Is this fixed? --- narwhals/_pandas_like/group_by.py | 55 +++++++++++++++++++++++-------- tests/frame/group_by_test.py | 15 --------- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 8cff28c46f..ff539d7ac4 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -15,6 +15,7 @@ from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence import pandas as pd + from pandas.api.extensions import ExtensionDtype from pandas.api.typing import DataFrameGroupBy as _NativeGroupBy from typing_extensions import TypeAlias, Unpack @@ -59,7 +60,13 @@ @lru_cache(maxsize=32) -def _native_agg(name: NativeAggregation, /, **kwds: Unpack[ScalarKwargs]) -> _NativeAgg: +def _native_agg( + name: NativeAggregation, + /, + *, + has_pyarrow_string: bool | None = None, + **kwds: Unpack[ScalarKwargs], +) -> _NativeAgg: if name == "nunique": return methodcaller(name, dropna=False) if name == "first": @@ -67,24 +74,34 @@ def _native_agg(name: NativeAggregation, /, **kwds: Unpack[ScalarKwargs]) -> _Na # TODO @dangotbanned: `cuDF` support # https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217722493 pd_version = Implementation.PANDAS._backend_version() + if has_pyarrow_string or (pd_version >= (2, 0, 0) and pd_version < (2, 2, 1)): + return first_compat if pd_version >= (2, 2, 1): return methodcaller(name, skipna=False) - elif pd_version < (1, 1, 5): - return methodcaller(name) - elif pd_version < (2, 0, 0): + # NOTE: Before 1.1.5, `first` worked without arguments + if pd_version >= (1, 1, 5): # NOTE: 
https://pandas.pydata.org/pandas-docs/stable/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations # https://github.com/pandas-dev/pandas/issues/57019#issuecomment-1905038446 return methodcaller("nth", n=0) - else: - # TODO @dangotbanned: Figure out if there is anything we can do for `pandas>=1.1.5;<2.2.1` - # https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2085080744 - msg = f"TODO: Unhandled pandas version {pd_version!r}" - raise NotImplementedError(msg) if not kwds or kwds.get("ddof") == 1: return methodcaller(name) return methodcaller(name, **kwds) +def first_compat(dgb: NativeGroupBy, /) -> pd.DataFrame: + """Taken from https://github.com/pandas-dev/pandas/issues/57019#issue-2094896421. + + - Theoretically could be slow + - but works in all cases for `string[pyarrow]` + - and `>=2.0.0; <2.2.1` + """ + return dgb.apply(lambda group: group.iloc[0]) + + +def _is_pyarrow_string(dtype: ExtensionDtype) -> bool: + return dtype.name == "string[pyarrow]" + + class AggExpr: """Wrapper storing the intermediate state per-`PandasLikeExpr`. 
@@ -128,7 +145,7 @@ def _getitem_aggs( result = group_by._grouped.size() else: select = names[0] if len(names) == 1 else list(names) - result = self.native_agg()(group_by._grouped[select]) + result = self.native_agg(group_by)(group_by._grouped[select]) if is_pandas_like_dataframe(result): result.columns = list(self.aliases) else: @@ -141,6 +158,10 @@ def is_len(self) -> bool: def is_anonymous(self) -> bool: return self.expr._depth == 0 + def is_ordered(self) -> bool: + """`string[pyarrow]` needs special treatment with nulls.""" + return self.leaf_name in {"first", "last"} + @property def kwargs(self) -> ScalarKwargs: return self.expr._scalar_kwargs @@ -152,11 +173,17 @@ def leaf_name(self) -> NarwhalsAggregation | Any: self._leaf_name = PandasLikeGroupBy._leaf_name(self.expr) return self._leaf_name - def native_agg(self) -> _NativeAgg: + def native_agg(self, group_by: PandasLikeGroupBy) -> _NativeAgg: """Return a partial `DataFrameGroupBy` method, missing only `self`.""" - return _native_agg( - PandasLikeGroupBy._remap_expr_name(self.leaf_name), **self.kwargs - ) + native_name = PandasLikeGroupBy._remap_expr_name(self.leaf_name) + if self.is_ordered(): + dtypes: pd.Series = group_by.compliant.native.dtypes + targets = dtypes[list(self.output_names)] + has_pyarrow_string = targets.transform(_is_pyarrow_string).any().item() + return _native_agg( + native_name, has_pyarrow_string=has_pyarrow_string, **self.kwargs + ) + return _native_agg(native_name, **self.kwargs) class PandasLikeGroupBy( diff --git a/tests/frame/group_by_test.py b/tests/frame/group_by_test.py index 898f697d4f..4d6bae78fa 100644 --- a/tests/frame/group_by_test.py +++ b/tests/frame/group_by_test.py @@ -660,14 +660,6 @@ def test_group_by_no_preserve_dtype( assert_equal_data(result, expected) -# NOTE: Resolving this in the current `PandasLikeGroupBy` might be too complex -XFAIL_PANDAS_SKIPNA = pytest.mark.xfail( - PANDAS_VERSION >= (1, 1, 5), - reason="Requires `skipna=False`, which was introduced in 
`2.2.1`.\n" - "https://github.com/pandas-dev/pandas/issues/57019\n", -) - - @pytest.mark.parametrize( ("keys", "aggs", "expected", "pre_sort"), [ @@ -703,13 +695,6 @@ def test_group_by_agg_first( raises=NotImplementedError, ) ) - request.applymarker( - pytest.mark.xfail( - "[pyarrow]" in request.node.name - and "pandas-pyarrow-na-order" in request.node.name, - reason=" marker seems to not respect ordering", - ) - ) data = { "a": [1, 2, 2, 3, 3, 4], "b": [1, 2, 3, 4, 5, 6], From abbb4b71f0231e8b1bfff445634a0592e8dfe85a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 21 Jul 2025 18:21:32 +0000 Subject: [PATCH 060/101] cov https://github.com/narwhals-dev/narwhals/actions/runs/16424779850/job/46412226822?pr=2528 --- narwhals/_pandas_like/group_by.py | 2 +- tests/utils.py | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index ff539d7ac4..63f8d200d4 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -79,7 +79,7 @@ def _native_agg( if pd_version >= (2, 2, 1): return methodcaller(name, skipna=False) # NOTE: Before 1.1.5, `first` worked without arguments - if pd_version >= (1, 1, 5): + if pd_version >= (1, 1, 5): # pragma: no cover # NOTE: https://pandas.pydata.org/pandas-docs/stable/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations # https://github.com/pandas-dev/pandas/issues/57019#issuecomment-1905038446 return methodcaller("nth", n=0) diff --git a/tests/utils.py b/tests/utils.py index 7199155aa3..482e4362e1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -190,8 +190,3 @@ def maybe_collect(df: Frame) -> Frame: if isinstance(df, nw.LazyFrame): return df.collect() return df # pragma: no cover - - -def is_pandas_like(constructor: Constructor) -> bool: - """Return `True` when `constructor` produces a native pandas-like object.""" - return any(x in 
str(constructor) for x in ("pandas", "modin", "cudf")) From ccfe5324a631bd4e85bf6988cf1368a3fab3c2ad Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 21 Jul 2025 21:59:14 +0000 Subject: [PATCH 061/101] feat: Add `(Expr|Series).last` Still need new tests and `Expr` docs for both --- docs/api-reference/expr.md | 1 + docs/api-reference/series.md | 1 + narwhals/_arrow/group_by.py | 5 ++++- narwhals/_arrow/series.py | 5 +++++ narwhals/_compliant/expr.py | 6 ++++++ narwhals/_compliant/series.py | 1 + narwhals/_pandas_like/group_by.py | 33 +++++++++++++++++++++++-------- narwhals/_pandas_like/series.py | 3 +++ narwhals/_polars/expr.py | 1 + narwhals/_polars/series.py | 5 +++++ narwhals/expr.py | 10 ++++++++++ narwhals/series.py | 19 ++++++++++++++++++ 12 files changed, 81 insertions(+), 9 deletions(-) diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md index 9b3a1808ea..3ddfbad887 100644 --- a/docs/api-reference/expr.md +++ b/docs/api-reference/expr.md @@ -25,6 +25,7 @@ - fill_null - filter - first + - last - gather_every - head - clip diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md index 38193d5d00..8eb9cab3f9 100644 --- a/docs/api-reference/series.md +++ b/docs/api-reference/series.md @@ -30,6 +30,7 @@ - fill_null - filter - first + - last - gather_every - head - hist diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index 4e28830f77..47bd15a37d 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -40,6 +40,7 @@ class ArrowGroupBy(EagerGroupBy["ArrowDataFrame", "ArrowExpr", "Aggregation"]): "all": "all", "any": "any", "first": "first", + "last": "last", } _REMAP_UNIQUE: ClassVar[Mapping[UniqueKeepStrategy, Aggregation]] = { "any": "min", @@ -50,7 +51,9 @@ class ArrowGroupBy(EagerGroupBy["ArrowDataFrame", "ArrowExpr", "Aggregation"]): ("len", "n_unique") ) _OPTION_COUNT_VALID: ClassVar[frozenset[NarwhalsAggregation]] = 
frozenset(("count",)) - _OPTION_ORDERED: ClassVar[frozenset[NarwhalsAggregation]] = frozenset(("first",)) + _OPTION_ORDERED: ClassVar[frozenset[NarwhalsAggregation]] = frozenset( + ("first", "last") + ) _OPTION_VARIANCE: ClassVar[frozenset[NarwhalsAggregation]] = frozenset(("std", "var")) _OPTION_SCALAR: ClassVar[frozenset[NarwhalsAggregation]] = frozenset(("any", "all")) diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 95b4c4ffed..b3a7ae35ed 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -320,6 +320,11 @@ def first(self, *, _return_py_scalar: bool = True) -> PythonLiteral: result = self.native[0] if len(self.native) else None return maybe_extract_py_scalar(result, _return_py_scalar) + def last(self, *, _return_py_scalar: bool = True) -> PythonLiteral: + ca = self.native + result = ca[height - 1] if (height := len(ca)) else None + return maybe_extract_py_scalar(result, _return_py_scalar) + def mean(self, *, _return_py_scalar: bool = True) -> float: return maybe_extract_py_scalar(pc.mean(self.native), _return_py_scalar) diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py index 722598cd2a..16f6d0c29c 100644 --- a/narwhals/_compliant/expr.py +++ b/narwhals/_compliant/expr.py @@ -150,6 +150,7 @@ def cum_prod(self, *, reverse: bool) -> Self: ... def is_in(self, other: Any) -> Self: ... def sort(self, *, descending: bool, nulls_last: bool) -> Self: ... def first(self) -> Self: ... + def last(self) -> Self: ... def rank(self, method: RankMethod, *, descending: bool) -> Self: ... 
def replace_strict( self, @@ -875,6 +876,9 @@ def sqrt(self) -> Self: def first(self) -> Self: return self._reuse_series("first", returns_scalar=True) + def last(self) -> Self: + return self._reuse_series("last", returns_scalar=True) + @property def cat(self) -> EagerExprCatNamespace[Self]: return EagerExprCatNamespace(self) @@ -905,7 +909,9 @@ class LazyExpr( # type: ignore[misc] CompliantExpr[CompliantLazyFrameT, NativeExprT], Protocol38[CompliantLazyFrameT, NativeExprT], ): + # NOTE: See https://github.com/narwhals-dev/narwhals/issues/2526#issuecomment-3019303816 first: not_implemented = not_implemented() + last: not_implemented = not_implemented() def _with_alias_output_names(self, func: AliasNames | None, /) -> Self: ... def alias(self, name: str) -> Self: diff --git a/narwhals/_compliant/series.py b/narwhals/_compliant/series.py index bb5bdeac5e..46e3339a0c 100644 --- a/narwhals/_compliant/series.py +++ b/narwhals/_compliant/series.py @@ -186,6 +186,7 @@ def fill_null( ) -> Self: ... def filter(self, predicate: Any) -> Self: ... def first(self) -> PythonLiteral: ... + def last(self) -> PythonLiteral: ... def gather_every(self, n: int, offset: int) -> Self: ... 
@unstable def hist( diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 63f8d200d4..f591601084 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -17,7 +17,7 @@ import pandas as pd from pandas.api.extensions import ExtensionDtype from pandas.api.typing import DataFrameGroupBy as _NativeGroupBy - from typing_extensions import TypeAlias, Unpack + from typing_extensions import TypeAlias, TypeIs, Unpack from narwhals._compliant.typing import NarwhalsAggregation, ScalarKwargs from narwhals._pandas_like.dataframe import PandasLikeDataFrame @@ -27,14 +27,13 @@ NativeApply: TypeAlias = "Callable[[pd.DataFrame], pd.Series[Any]]" InefficientNativeAggregation: TypeAlias = Literal["cov", "skew"] +OrderedAggregation: TypeAlias = Literal["first", "last"] NativeAggregation: TypeAlias = Literal[ "any", "all", "count", - "first", "idxmax", "idxmin", - "last", "max", "mean", "median", @@ -47,6 +46,7 @@ "std", "sum", "var", + OrderedAggregation, InefficientNativeAggregation, ] """https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#built-in-aggregation-methods""" @@ -58,6 +58,16 @@ NonStrHashable: TypeAlias = Any """Because `pandas` allows *"names"* like that 😭""" +_REMAP_ORDERED_INDEX: Mapping[OrderedAggregation, Literal[0, -1]] = { + "first": 0, + "last": -1, +} +_ORDERED_AGG = frozenset[OrderedAggregation](("first", "last")) + + +def _is_ordered_agg(obj: Any) -> TypeIs[OrderedAggregation]: + return obj in _ORDERED_AGG + @lru_cache(maxsize=32) def _native_agg( @@ -69,33 +79,40 @@ def _native_agg( ) -> _NativeAgg: if name == "nunique": return methodcaller(name, dropna=False) - if name == "first": + if _is_ordered_agg(name): # TODO @dangotbanned: `modin` support # TODO @dangotbanned: `cuDF` support # https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217722493 pd_version = Implementation.PANDAS._backend_version() if has_pyarrow_string or (pd_version >= (2, 0, 0) and pd_version < 
(2, 2, 1)): - return first_compat + return _ordered_compat(name) if pd_version >= (2, 2, 1): return methodcaller(name, skipna=False) # NOTE: Before 1.1.5, `first` worked without arguments if pd_version >= (1, 1, 5): # pragma: no cover # NOTE: https://pandas.pydata.org/pandas-docs/stable/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations # https://github.com/pandas-dev/pandas/issues/57019#issuecomment-1905038446 - return methodcaller("nth", n=0) + return methodcaller("nth", n=_REMAP_ORDERED_INDEX[name]) if not kwds or kwds.get("ddof") == 1: return methodcaller(name) return methodcaller(name, **kwds) -def first_compat(dgb: NativeGroupBy, /) -> pd.DataFrame: +def _ordered_compat( + name: OrderedAggregation, / +) -> Callable[[NativeGroupBy], pd.DataFrame]: """Taken from https://github.com/pandas-dev/pandas/issues/57019#issue-2094896421. - Theoretically could be slow - but works in all cases for `string[pyarrow]` - and `>=2.0.0; <2.2.1` """ - return dgb.apply(lambda group: group.iloc[0]) + index = _REMAP_ORDERED_INDEX[name] + + def fn(dgb: NativeGroupBy, /) -> pd.DataFrame: + return dgb.apply(lambda group: group.iloc[index]) + + return fn def _is_pyarrow_string(dtype: ExtensionDtype) -> bool: diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index 379a8b070f..dbef8af9d2 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -381,6 +381,9 @@ def filter(self, predicate: Any) -> Self: def first(self) -> PythonLiteral: return self.native.iloc[0] if len(self.native) else None + def last(self) -> PythonLiteral: + return self.native.iloc[-1] if len(self.native) else None + def __eq__(self, other: object) -> Self: # type: ignore[override] ser, other = align_and_extract_native(self, other) return self._with_native(ser == other).alias(self.name) diff --git a/narwhals/_polars/expr.py b/narwhals/_polars/expr.py index 7f137a2d4c..7e72d1f0e3 100644 --- a/narwhals/_polars/expr.py +++ 
b/narwhals/_polars/expr.py @@ -279,6 +279,7 @@ def struct(self) -> PolarsExprStructNamespace: exp: Method[Self] fill_null: Method[Self] first: Method[Self] + last: Method[Self] gather_every: Method[Self] head: Method[Self] is_finite: Method[Self] diff --git a/narwhals/_polars/series.py b/narwhals/_polars/series.py index 07ea1e862c..0cd055dde1 100644 --- a/narwhals/_polars/series.py +++ b/narwhals/_polars/series.py @@ -566,6 +566,11 @@ def first(self) -> PythonLiteral: return self.native.item(0) if len(self) else None return self.native.first() + def last(self) -> PythonLiteral: + if self._backend_version < (1, 10): # pragma: no cover + return self.native.item(-1) if len(self) else None + return self.native.last() + @property def dt(self) -> PolarsSeriesDateTimeNamespace: return PolarsSeriesDateTimeNamespace(self) diff --git a/narwhals/expr.py b/narwhals/expr.py index 3294b09806..4e947787cd 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1984,6 +1984,16 @@ def first(self) -> Self: lambda plx: self._to_compliant_expr(plx).first() ) + def last(self) -> Self: + """Get the last value. + + Returns: + A new expression. + """ + return self._with_orderable_aggregation( + lambda plx: self._to_compliant_expr(plx).last() + ) + def mode(self) -> Self: r"""Compute the most occurring value(s). diff --git a/narwhals/series.py b/narwhals/series.py index dbeba00a8b..5d0e8a36b3 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -834,6 +834,25 @@ def first(self) -> PythonLiteral: """ return self._compliant_series.first() + def last(self) -> PythonLiteral: + """Get the last element of the Series. + + Returns: + A scalar value or `None` if the Series is empty. 
+ + Examples: + >>> import pyarrow as pa + >>> import narwhals as nw + >>> + >>> s_native = pa.chunked_array([[1, 2, 3]]) + >>> s_nw = nw.from_native(s_native, series_only=True) + >>> s_nw.last() + 3 + >>> s_nw.filter(s_nw > 5).last() is None + True + """ + return self._compliant_series.last() + def is_in(self, other: Any) -> Self: """Check if the elements of this Series are in the other sequence. From dd1f89e6dbd639adfa463feb2be5b8e7bddb2b4d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 22 Jul 2025 12:52:03 +0000 Subject: [PATCH 062/101] test: Add `last_test.py` --- tests/expr_and_series/last_test.py | 108 +++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 tests/expr_and_series/last_test.py diff --git a/tests/expr_and_series/last_test.py b/tests/expr_and_series/last_test.py new file mode 100644 index 0000000000..7df67ee171 --- /dev/null +++ b/tests/expr_and_series/last_test.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +import narwhals as nw +from tests.utils import POLARS_VERSION, assert_equal_data + +if TYPE_CHECKING: + from collections.abc import Mapping, Sequence + + from narwhals.typing import PythonLiteral + from tests.utils import ConstructorEager + +data: dict[str, list[PythonLiteral]] = { + "a": [8, 2, 1, None], + "b": [58, 5, 6, 12], + "c": [2.5, 1.0, 3.0, 0.9], + "d": [2, 1, 4, 3], + "idx": [0, 1, 2, 3], +} + +single_cases = pytest.mark.parametrize( + ("col", "expected"), [("a", None), ("b", 12), ("c", 0.9)] +) + + +@single_cases +def test_last_series( + constructor_eager: ConstructorEager, col: str, expected: PythonLiteral +) -> None: + series = nw.from_native(constructor_eager(data), eager_only=True)[col] + result = series.last() + assert_equal_data({col: [result]}, {col: [expected]}) + + +def test_last_series_empty(constructor_eager: ConstructorEager) -> None: + series = 
nw.from_native(constructor_eager(data), eager_only=True)["b"] + series = series.filter(series > 60) + result = series.last() + assert result is None + + +@single_cases +def test_last_expr_select( + constructor_eager: ConstructorEager, col: str, expected: PythonLiteral +) -> None: + df = nw.from_native(constructor_eager(data)) + expr = nw.col(col).last() + result = df.select(expr) + assert_equal_data(result, {col: [expected]}) + + +@single_cases +def test_last_expr_with_columns( + constructor_eager: ConstructorEager, + col: str, + expected: PythonLiteral, + request: pytest.FixtureRequest, +) -> None: + if any( + x in str(constructor_eager) for x in ("pyarrow_table", "pandas", "modin", "cudf") + ): + request.applymarker( + pytest.mark.xfail( + reason="Some kind of index error, see https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083582828" + ) + ) + + request.applymarker( + pytest.mark.xfail( + ("polars" in str(constructor_eager) and POLARS_VERSION < (1, 10)), + reason="Needs `order_by`", + raises=NotImplementedError, + ) + ) + + frame = nw.from_native(constructor_eager(data)) + expr = nw.col(col).last().over(order_by="idx").alias("result") + result = frame.with_columns(expr).select("result") + expected_broadcast = len(data[col]) * [expected] + assert_equal_data(result, {"result": expected_broadcast}) + + +@pytest.mark.parametrize( + "expected", + [ + {"a": [None], "c": [0.9]}, + {"d": [3], "b": [12]}, + {"c": [0.9], "a": [None], "d": [3]}, + ], +) +def test_last_expr_expand( + constructor_eager: ConstructorEager, expected: Mapping[str, Sequence[PythonLiteral]] +) -> None: + df = nw.from_native(constructor_eager(data)) + expr = nw.col(expected).last() + result = df.select(expr) + assert_equal_data(result, expected) + + +def test_last_expr_expand_sort(constructor_eager: ConstructorEager) -> None: + df = nw.from_native(constructor_eager(data)) + expr = nw.col("d", "a", "b", "c").last() + result = df.sort("d").select(expr) + expected = {"d": [4], "a": 
[1], "b": [6], "c": [3.0]} + assert_equal_data(result, expected) From 54b3188b5b891ef899a5c319ae62177a0ae1d2d1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 22 Jul 2025 13:03:21 +0000 Subject: [PATCH 063/101] test: Add `test_group_by_agg_last` --- tests/frame/group_by_test.py | 47 ++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/frame/group_by_test.py b/tests/frame/group_by_test.py index 4d6bae78fa..aa067320a3 100644 --- a/tests/frame/group_by_test.py +++ b/tests/frame/group_by_test.py @@ -705,3 +705,50 @@ def test_group_by_agg_first( df = df.sort(aggs, **pre_sort) result = df.group_by(keys).agg(nw.col(aggs).first()).sort(keys) assert_equal_data(result, expected) + + +@pytest.mark.parametrize( + ("keys", "aggs", "expected", "pre_sort"), + [ + (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 3, 5, 6]}, None), + (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 2, 4, 6]}, {"descending": True}), + pytest.param( + ["a"], + ["c"], + {"a": [1, 2, 3, 4], "c": [None, "A", "B", "B"]}, + None, + id="pandas-pyarrow-na-order", + ), + ( + ["a"], + ["c"], + {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, + {"nulls_last": True}, + ), + ], +) +def test_group_by_agg_last( + constructor_eager: ConstructorEager, + keys: Sequence[str], + aggs: Sequence[str], + expected: Mapping[str, Any], + pre_sort: Mapping[str, Any] | None, + request: pytest.FixtureRequest, +) -> None: + request.applymarker( + pytest.mark.xfail( + "pyarrow_table" in str(constructor_eager) and (PYARROW_VERSION < (14, 0)), + reason="https://github.com/apache/arrow/issues/36709", + raises=NotImplementedError, + ) + ) + data = { + "a": [1, 2, 2, 3, 3, 4], + "b": [1, 2, 3, 4, 5, 6], + "c": [None, "A", "A", None, "B", "B"], + } + df = nw.from_native(constructor_eager(data)) + if pre_sort: + df = df.sort(aggs, **pre_sort) + result = df.group_by(keys).agg(nw.col(aggs).last()).sort(keys) + assert_equal_data(result, expected) From 
5f9ff6fd625d51da673aba91cd3becd8b6640d6f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 22 Jul 2025 13:03:59 +0000 Subject: [PATCH 064/101] fix: Add missing `PandasLikeGroupBy._REMAP_AGGS` entry --- narwhals/_pandas_like/group_by.py | 1 + 1 file changed, 1 insertion(+) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index f591601084..e056ed4c9c 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -221,6 +221,7 @@ class PandasLikeGroupBy( "all": "all", "any": "any", "first": "first", + "last": "last", } _original_columns: tuple[str, ...] """Column names *prior* to any aliasing in `ParseKeysGroupBy`.""" From 4000b25c3f38989e881e394c6d3fe51b66052f5e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 22 Jul 2025 13:06:05 +0000 Subject: [PATCH 065/101] test: Repeat `@single_cases` pattern for `first` --- tests/expr_and_series/first_test.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index 58d427388a..8f4d24f3c5 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -21,8 +21,12 @@ "idx": [0, 1, 2, 3], } +single_cases = pytest.mark.parametrize( + ("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)] +) + -@pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) +@single_cases def test_first_series( constructor_eager: ConstructorEager, col: str, expected: PythonLiteral ) -> None: @@ -38,7 +42,7 @@ def test_first_series_empty(constructor_eager: ConstructorEager) -> None: assert result is None -@pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) +@single_cases def test_first_expr_select( constructor_eager: ConstructorEager, col: str, expected: PythonLiteral ) -> None: @@ -48,7 +52,7 @@ def 
test_first_expr_select( assert_equal_data(result, {col: [expected]}) -@pytest.mark.parametrize(("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)]) +@single_cases def test_first_expr_with_columns( constructor_eager: ConstructorEager, col: str, From 1c62ce20dc8c26138ac732d9b97d7c365182f227 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 22 Jul 2025 13:33:58 +0000 Subject: [PATCH 066/101] docs: Examples for `Expr.(first|last)` Based on https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217706019 --- narwhals/expr.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/narwhals/expr.py b/narwhals/expr.py index 4e947787cd..9246ceaf93 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1979,6 +1979,29 @@ def first(self) -> Self: Returns: A new expression. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> data = {"a": [1, 1, 2, 2], "b": ["foo", None, None, "baz"]} + >>> df_native = pd.DataFrame(data) + >>> df = nw.from_native(df_native) + >>> df.select(nw.all().first()) + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | a b | + | 0 1 foo | + └──────────────────┘ + + >>> df.group_by("a").agg(nw.col("b").first()) + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | a b | + | 0 1 foo | + | 1 2 None | + └──────────────────┘ """ return self._with_orderable_aggregation( lambda plx: self._to_compliant_expr(plx).first() @@ -1989,6 +2012,36 @@ def last(self) -> Self: Returns: A new expression. 
+ + Examples: + >>> import pyarrow as pa + >>> import narwhals as nw + >>> data = {"a": [1, 1, 2, 2], "b": ["foo", None, None, "baz"]} + >>> df_native = pa.table(data) + >>> df = nw.from_native(df_native) + >>> df.select(nw.all().last()) + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | pyarrow.Table | + | a: int64 | + | b: string | + | ---- | + | a: [[2]] | + | b: [["baz"]] | + └──────────────────┘ + + >>> df.group_by("a").agg(nw.col("b").last()) + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + |pyarrow.Table | + |a: int64 | + |b: string | + |---- | + |a: [[1,2]] | + |b: [[null,"baz"]] | + └──────────────────┘ """ return self._with_orderable_aggregation( lambda plx: self._to_compliant_expr(plx).last() From 063e5d036d7f0e5922e8a9250c3bd4e024f3d842 Mon Sep 17 00:00:00 2001 From: Dan Redding <125183946+dangotbanned@users.noreply.github.com> Date: Tue, 22 Jul 2025 13:49:26 +0000 Subject: [PATCH 067/101] Remove `modin` todo --- narwhals/_pandas_like/group_by.py | 1 - 1 file changed, 1 deletion(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index e056ed4c9c..6a2b98816e 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -80,7 +80,6 @@ def _native_agg( if name == "nunique": return methodcaller(name, dropna=False) if _is_ordered_agg(name): - # TODO @dangotbanned: `modin` support # TODO @dangotbanned: `cuDF` support # https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217722493 pd_version = Implementation.PANDAS._backend_version() From 65e6804f2307193961889c277c7177180e2d5d65 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 23 Jul 2025 10:56:00 +0000 Subject: [PATCH 068/101] clean up and doc `pandas` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit truly wild how many issues I've needed to reference 😩 --- narwhals/_pandas_like/group_by.py | 75 
+++++++++++++++++-------------- 1 file changed, 41 insertions(+), 34 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 6a2b98816e..b60b8b8272 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -28,7 +28,7 @@ NativeApply: TypeAlias = "Callable[[pd.DataFrame], pd.Series[Any]]" InefficientNativeAggregation: TypeAlias = Literal["cov", "skew"] OrderedAggregation: TypeAlias = Literal["first", "last"] -NativeAggregation: TypeAlias = Literal[ +UnorderedAggregation: TypeAlias = Literal[ "any", "all", "count", @@ -46,9 +46,9 @@ "std", "sum", "var", - OrderedAggregation, InefficientNativeAggregation, ] +NativeAggregation: TypeAlias = Literal[UnorderedAggregation, OrderedAggregation] """https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#built-in-aggregation-methods""" _NativeAgg: TypeAlias = "Callable[[Any], pd.DataFrame | pd.Series[Any]]" @@ -62,42 +62,50 @@ "first": 0, "last": -1, } -_ORDERED_AGG = frozenset[OrderedAggregation](("first", "last")) - - -def _is_ordered_agg(obj: Any) -> TypeIs[OrderedAggregation]: - return obj in _ORDERED_AGG @lru_cache(maxsize=32) def _native_agg( - name: NativeAggregation, - /, - *, - has_pyarrow_string: bool | None = None, - **kwds: Unpack[ScalarKwargs], + name: UnorderedAggregation, /, **kwds: Unpack[ScalarKwargs] ) -> _NativeAgg: if name == "nunique": return methodcaller(name, dropna=False) - if _is_ordered_agg(name): - # TODO @dangotbanned: `cuDF` support - # https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217722493 - pd_version = Implementation.PANDAS._backend_version() - if has_pyarrow_string or (pd_version >= (2, 0, 0) and pd_version < (2, 2, 1)): - return _ordered_compat(name) - if pd_version >= (2, 2, 1): - return methodcaller(name, skipna=False) - # NOTE: Before 1.1.5, `first` worked without arguments - if pd_version >= (1, 1, 5): # pragma: no cover - # NOTE: 
https://pandas.pydata.org/pandas-docs/stable/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations - # https://github.com/pandas-dev/pandas/issues/57019#issuecomment-1905038446 - return methodcaller("nth", n=_REMAP_ORDERED_INDEX[name]) if not kwds or kwds.get("ddof") == 1: return methodcaller(name) return methodcaller(name, **kwds) -def _ordered_compat( +@lru_cache(maxsize=8) +def _native_ordered_agg( + name: OrderedAggregation, *, has_pyarrow_string: bool +) -> _NativeAgg: + """Best effort alignment of `first`, `last` across versions. + + The way `polars` works **by default**, is a constantly moving target in `pandas` 😫 + """ + # TODO @dangotbanned: `cuDF` support + # https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217722493 + pd_version = Implementation.PANDAS._backend_version() + if has_pyarrow_string or (pd_version >= (2, 0, 0) and pd_version < (2, 2, 1)): + # NOTE: `>=2.0.0; <2.2.1` breaks the `nth` workaround + # ATOW (`2.3.1`), `string[pyarrow]` has always required `apply` + # https://github.com/pandas-dev/pandas/issues/13666 + # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations + return _apply_ordered_agg(name) + if pd_version >= (2, 2, 1): + # NOTE: Introduces option to disable default null skipping + # https://github.com/pandas-dev/pandas/pull/57102 + return methodcaller(name, skipna=False) + if pd_version >= (1, 1, 5): # pragma: no cover + # NOTE: `>=1.1.5; <2.0.0`, `nth` could be used as a non-skipping aggregation + # https://github.com/pandas-dev/pandas/issues/57019#issuecomment-1905038446 + return methodcaller("nth", n=_REMAP_ORDERED_INDEX[name]) + # NOTE: `<1.1.5`, nulls weren't skipped by default + # https://github.com/pandas-dev/pandas/issues/38286 + return methodcaller(name) # pragma: no cover + + +def _apply_ordered_agg( name: OrderedAggregation, / ) -> Callable[[NativeGroupBy], pd.DataFrame]: """Taken from 
https://github.com/pandas-dev/pandas/issues/57019#issue-2094896421. @@ -114,6 +122,11 @@ def fn(dgb: NativeGroupBy, /) -> pd.DataFrame: return fn +def _is_ordered_agg(obj: Any) -> TypeIs[OrderedAggregation]: + """`string[pyarrow]` needs special treatment with nulls in `first`, `last`.""" + return obj in _REMAP_ORDERED_INDEX + + def _is_pyarrow_string(dtype: ExtensionDtype) -> bool: return dtype.name == "string[pyarrow]" @@ -174,10 +187,6 @@ def is_len(self) -> bool: def is_anonymous(self) -> bool: return self.expr._depth == 0 - def is_ordered(self) -> bool: - """`string[pyarrow]` needs special treatment with nulls.""" - return self.leaf_name in {"first", "last"} - @property def kwargs(self) -> ScalarKwargs: return self.expr._scalar_kwargs @@ -192,13 +201,11 @@ def leaf_name(self) -> NarwhalsAggregation | Any: def native_agg(self, group_by: PandasLikeGroupBy) -> _NativeAgg: """Return a partial `DataFrameGroupBy` method, missing only `self`.""" native_name = PandasLikeGroupBy._remap_expr_name(self.leaf_name) - if self.is_ordered(): + if _is_ordered_agg(native_name): dtypes: pd.Series = group_by.compliant.native.dtypes targets = dtypes[list(self.output_names)] has_pyarrow_string = targets.transform(_is_pyarrow_string).any().item() - return _native_agg( - native_name, has_pyarrow_string=has_pyarrow_string, **self.kwargs - ) + return _native_ordered_agg(native_name, has_pyarrow_string=has_pyarrow_string) return _native_agg(native_name, **self.kwargs) From 22fae20c62245a3000f91fd882eda27e1714de0d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 23 Jul 2025 13:20:52 +0000 Subject: [PATCH 069/101] feat: Warn on new pandas apply path Resolves https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2220010556 --- narwhals/_pandas_like/group_by.py | 43 ++++++++++++++++++++++++++----- tests/frame/group_by_test.py | 38 +++++++++++++++------------ 2 files changed, 58 insertions(+), 23 deletions(-) diff --git 
a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index b60b8b8272..6e6068573b 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -8,7 +8,7 @@ from narwhals._compliant import EagerGroupBy from narwhals._expression_parsing import evaluate_output_names_and_aliases -from narwhals._utils import Implementation, find_stacklevel +from narwhals._utils import Implementation, find_stacklevel, requires from narwhals.dependencies import is_pandas_like_dataframe if TYPE_CHECKING: @@ -62,6 +62,8 @@ "first": 0, "last": -1, } +_PYARROW_STRING_NAME = "string[pyarrow]" +_MINIMUM_SKIPNA = (2, 2, 1) @lru_cache(maxsize=32) @@ -75,7 +77,10 @@ def _native_agg( return methodcaller(name, **kwds) -@lru_cache(maxsize=8) +# NOTE: Caching disabled to avoid using an `apply` warning either: +# - Once per unique args +# - Once per column in a multi-output expression +# *Least bad* option is to warn once per expression def _native_ordered_agg( name: OrderedAggregation, *, has_pyarrow_string: bool ) -> _NativeAgg: @@ -86,13 +91,13 @@ def _native_ordered_agg( # TODO @dangotbanned: `cuDF` support # https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217722493 pd_version = Implementation.PANDAS._backend_version() - if has_pyarrow_string or (pd_version >= (2, 0, 0) and pd_version < (2, 2, 1)): + if has_pyarrow_string or (pd_version >= (2, 0, 0) and pd_version < _MINIMUM_SKIPNA): # NOTE: `>=2.0.0; <2.2.1` breaks the `nth` workaround # ATOW (`2.3.1`), `string[pyarrow]` has always required `apply` # https://github.com/pandas-dev/pandas/issues/13666 # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations - return _apply_ordered_agg(name) - if pd_version >= (2, 2, 1): + return _apply_ordered_agg(name, has_pyarrow_string=has_pyarrow_string) + if pd_version >= _MINIMUM_SKIPNA: # NOTE: Introduces option to disable default null skipping # 
https://github.com/pandas-dev/pandas/pull/57102 return methodcaller(name, skipna=False) @@ -106,7 +111,7 @@ def _native_ordered_agg( def _apply_ordered_agg( - name: OrderedAggregation, / + name: OrderedAggregation, /, *, has_pyarrow_string: bool ) -> Callable[[NativeGroupBy], pd.DataFrame]: """Taken from https://github.com/pandas-dev/pandas/issues/57019#issue-2094896421. @@ -114,6 +119,7 @@ def _apply_ordered_agg( - but works in all cases for `string[pyarrow]` - and `>=2.0.0; <2.2.1` """ + warn_ordered_apply(name, has_pyarrow_string=has_pyarrow_string) index = _REMAP_ORDERED_INDEX[name] def fn(dgb: NativeGroupBy, /) -> pd.DataFrame: @@ -128,7 +134,7 @@ def _is_ordered_agg(obj: Any) -> TypeIs[OrderedAggregation]: def _is_pyarrow_string(dtype: ExtensionDtype) -> bool: - return dtype.name == "string[pyarrow]" + return dtype.name == _PYARROW_STRING_NAME class AggExpr: @@ -389,3 +395,26 @@ def warn_complex_group_by() -> None: UserWarning, stacklevel=find_stacklevel(), ) + + +def warn_ordered_apply(name: OrderedAggregation, /, *, has_pyarrow_string: bool) -> None: + if has_pyarrow_string: + msg = ( + f"{_PYARROW_STRING_NAME!r} has different ordering semantics than other pandas dtypes.\n\n" + "Please see: " + "https://pandas.pydata.org/pdeps/0014-string-dtype.html" + ) + else: + found = requires._unparse_version(Implementation.PANDAS._backend_version()) + minimum = requires._unparse_version(_MINIMUM_SKIPNA) + msg = ( + f"If you can, please upgrade to 'pandas>={minimum}', found version {found!r}.\n\n" + "Please see: " + "https://github.com/pandas-dev/pandas/issues/57019" + ) + warnings.warn( + f"Found ordered group-by aggregation `{name}()`, which can't be expressed both efficiently and " + f"safely with the pandas API.\n{msg}", + UserWarning, + stacklevel=find_stacklevel(), + ) diff --git a/tests/frame/group_by_test.py b/tests/frame/group_by_test.py index aa067320a3..ffc879c431 100644 --- a/tests/frame/group_by_test.py +++ b/tests/frame/group_by_test.py @@ -3,6 +3,7 @@ 
import datetime as dt import os import re +from contextlib import nullcontext from decimal import Decimal from typing import TYPE_CHECKING, Any @@ -23,6 +24,7 @@ if TYPE_CHECKING: from collections.abc import Mapping, Sequence + from contextlib import AbstractContextManager from narwhals.typing import NonNestedLiteral @@ -660,18 +662,24 @@ def test_group_by_no_preserve_dtype( assert_equal_data(result, expected) +def _warns_context(request: pytest.FixtureRequest) -> AbstractContextManager[Any]: + if "[pyarrow]" in request.node.name and "NA-order" in request.node.name: + pattern = r"ordered.+safely.+string\[pyarrow\]" + elif any(x in str(request.node.name) for x in ("pandas", "modin", "cudf")) and ( + PANDAS_VERSION >= (2, 0, 0) and PANDAS_VERSION < (2, 2, 1) + ): # pragma: no cover + pattern = r"ordered.+safely.+please upgrade.+2.2.1" + else: + return nullcontext() + return pytest.warns(UserWarning, match=re.compile(pattern, re.DOTALL | re.IGNORECASE)) + + @pytest.mark.parametrize( ("keys", "aggs", "expected", "pre_sort"), [ (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 2, 4, 6]}, None), (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 3, 5, 6]}, {"descending": True}), - pytest.param( - ["a"], - ["c"], - {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, - None, - id="pandas-pyarrow-na-order", - ), + (["a"], ["c"], {"a": [1, 2, 3, 4], "c": [None, "A", None, "B"]}, None), ( ["a"], ["c"], @@ -679,6 +687,7 @@ def test_group_by_no_preserve_dtype( {"nulls_last": True}, ), ], + ids=["no-sort", "sort-descending", "NA-order-nulls-first", "NA-order-nulls-last"], ) def test_group_by_agg_first( constructor_eager: ConstructorEager, @@ -703,7 +712,8 @@ def test_group_by_agg_first( df = nw.from_native(constructor_eager(data)) if pre_sort: df = df.sort(aggs, **pre_sort) - result = df.group_by(keys).agg(nw.col(aggs).first()).sort(keys) + with _warns_context(request): + result = df.group_by(keys).agg(nw.col(aggs).first()).sort(keys) assert_equal_data(result, expected) @@ -712,13 +722,7 @@ 
def test_group_by_agg_first( [ (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 3, 5, 6]}, None), (["a"], ["b"], {"a": [1, 2, 3, 4], "b": [1, 2, 4, 6]}, {"descending": True}), - pytest.param( - ["a"], - ["c"], - {"a": [1, 2, 3, 4], "c": [None, "A", "B", "B"]}, - None, - id="pandas-pyarrow-na-order", - ), + (["a"], ["c"], {"a": [1, 2, 3, 4], "c": [None, "A", "B", "B"]}, None), ( ["a"], ["c"], @@ -726,6 +730,7 @@ def test_group_by_agg_first( {"nulls_last": True}, ), ], + ids=["no-sort", "sort-descending", "NA-order-nulls-first", "NA-order-nulls-last"], ) def test_group_by_agg_last( constructor_eager: ConstructorEager, @@ -750,5 +755,6 @@ def test_group_by_agg_last( df = nw.from_native(constructor_eager(data)) if pre_sort: df = df.sort(aggs, **pre_sort) - result = df.group_by(keys).agg(nw.col(aggs).last()).sort(keys) + with _warns_context(request): + result = df.group_by(keys).agg(nw.col(aggs).last()).sort(keys) assert_equal_data(result, expected) From d66fddce40ad7b9ed9379941016a3f9dfc12a308 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Wed, 23 Jul 2025 13:27:58 +0000 Subject: [PATCH 070/101] cov https://github.com/narwhals-dev/narwhals/actions/runs/16471856154/job/46562856822?pr=2528 --- narwhals/_pandas_like/group_by.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 6e6068573b..bcbfa6e899 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -404,7 +404,7 @@ def warn_ordered_apply(name: OrderedAggregation, /, *, has_pyarrow_string: bool) "Please see: " "https://pandas.pydata.org/pdeps/0014-string-dtype.html" ) - else: + else: # pragma: no cover found = requires._unparse_version(Implementation.PANDAS._backend_version()) minimum = requires._unparse_version(_MINIMUM_SKIPNA) msg = ( From 5e444a5f553fa9316b55d3cfaea0dd806ff6a8df Mon Sep 17 00:00:00 2001 From: dangotbanned 
<125183946+dangotbanned@users.noreply.github.com> Date: Thu, 24 Jul 2025 17:49:50 +0000 Subject: [PATCH 071/101] =?UTF-8?q?always=20use=20`apply`=20for=20`cudf`?= =?UTF-8?q?=20=F0=9F=98=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2226673738 - https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217722493 --- narwhals/_pandas_like/group_by.py | 40 +++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index bcbfa6e899..9364c39429 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -82,21 +82,25 @@ def _native_agg( # - Once per column in a multi-output expression # *Least bad* option is to warn once per expression def _native_ordered_agg( - name: OrderedAggregation, *, has_pyarrow_string: bool + name: OrderedAggregation, *, has_pyarrow_string: bool, is_cudf: bool ) -> _NativeAgg: """Best effort alignment of `first`, `last` across versions. 
The way `polars` works **by default**, is a constantly moving target in `pandas` 😫 """ - # TODO @dangotbanned: `cuDF` support - # https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2217722493 pd_version = Implementation.PANDAS._backend_version() - if has_pyarrow_string or (pd_version >= (2, 0, 0) and pd_version < _MINIMUM_SKIPNA): + if ( + is_cudf + or has_pyarrow_string + or (pd_version >= (2, 0, 0) and pd_version < _MINIMUM_SKIPNA) + ): # NOTE: `>=2.0.0; <2.2.1` breaks the `nth` workaround # ATOW (`2.3.1`), `string[pyarrow]` has always required `apply` # https://github.com/pandas-dev/pandas/issues/13666 # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations - return _apply_ordered_agg(name, has_pyarrow_string=has_pyarrow_string) + return _apply_ordered_agg( + name, has_pyarrow_string=has_pyarrow_string, is_cudf=is_cudf + ) if pd_version >= _MINIMUM_SKIPNA: # NOTE: Introduces option to disable default null skipping # https://github.com/pandas-dev/pandas/pull/57102 @@ -111,7 +115,7 @@ def _native_ordered_agg( def _apply_ordered_agg( - name: OrderedAggregation, /, *, has_pyarrow_string: bool + name: OrderedAggregation, /, *, has_pyarrow_string: bool, is_cudf: bool ) -> Callable[[NativeGroupBy], pd.DataFrame]: """Taken from https://github.com/pandas-dev/pandas/issues/57019#issue-2094896421. 
@@ -119,7 +123,7 @@ def _apply_ordered_agg( - but works in all cases for `string[pyarrow]` - and `>=2.0.0; <2.2.1` """ - warn_ordered_apply(name, has_pyarrow_string=has_pyarrow_string) + warn_ordered_apply(name, has_pyarrow_string=has_pyarrow_string, is_cudf=is_cudf) index = _REMAP_ORDERED_INDEX[name] def fn(dgb: NativeGroupBy, /) -> pd.DataFrame: @@ -204,6 +208,10 @@ def leaf_name(self) -> NarwhalsAggregation | Any: self._leaf_name = PandasLikeGroupBy._leaf_name(self.expr) return self._leaf_name + @property + def implementation(self) -> Implementation: + return self.expr._implementation + def native_agg(self, group_by: PandasLikeGroupBy) -> _NativeAgg: """Return a partial `DataFrameGroupBy` method, missing only `self`.""" native_name = PandasLikeGroupBy._remap_expr_name(self.leaf_name) @@ -211,7 +219,11 @@ def native_agg(self, group_by: PandasLikeGroupBy) -> _NativeAgg: dtypes: pd.Series = group_by.compliant.native.dtypes targets = dtypes[list(self.output_names)] has_pyarrow_string = targets.transform(_is_pyarrow_string).any().item() - return _native_ordered_agg(native_name, has_pyarrow_string=has_pyarrow_string) + return _native_ordered_agg( + native_name, + has_pyarrow_string=has_pyarrow_string, + is_cudf=self.implementation.is_cudf(), + ) return _native_agg(native_name, **self.kwargs) @@ -397,8 +409,16 @@ def warn_complex_group_by() -> None: ) -def warn_ordered_apply(name: OrderedAggregation, /, *, has_pyarrow_string: bool) -> None: - if has_pyarrow_string: +def warn_ordered_apply( + name: OrderedAggregation, /, *, has_pyarrow_string: bool, is_cudf: bool +) -> None: + if is_cudf: # pragma: no cover + msg = ( + f"cuDF does not support selecting the {name} value without skipping NA.\n\n" + "Please see: " + "https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/groupby/" + ) + elif has_pyarrow_string: msg = ( f"{_PYARROW_STRING_NAME!r} has different ordering semantics than other pandas dtypes.\n\n" "Please see: " From 2ae42458cae91f4473e01270919815fcd7cb9667 
Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 29 Jul 2025 14:20:41 +0200 Subject: [PATCH 072/101] special path for orderable_aggregation in over --- narwhals/_arrow/expr.py | 11 +++++++- narwhals/_pandas_like/expr.py | 14 ++++++++++- narwhals/_pandas_like/group_by.py | 39 +++-------------------------- narwhals/_pandas_like/series.py | 2 +- tests/expr_and_series/first_test.py | 9 ------- tests/expr_and_series/last_test.py | 9 ------- 6 files changed, 27 insertions(+), 57 deletions(-) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 324a160824..ff41c745b1 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -2,11 +2,12 @@ from typing import TYPE_CHECKING, Any +import pyarrow as pa import pyarrow.compute as pc from narwhals._arrow.series import ArrowSeries from narwhals._compliant import EagerExpr -from narwhals._expression_parsing import evaluate_output_names_and_aliases +from narwhals._expression_parsing import ExprKind, evaluate_output_names_and_aliases from narwhals._utils import ( Implementation, generate_temporary_column_name, @@ -132,6 +133,14 @@ def func(df: ArrowDataFrame) -> Sequence[ArrowSeries]: *order_by, descending=False, nulls_last=False ) result = self(df.drop([token], strict=True)) + if ( + meta := self._metadata + ) is not None and meta.last_node is ExprKind.ORDERABLE_AGGREGATION: + # Orderable aggregation result in a scalar, yet require order_by. + # Therefore we need to broadcast the result to the original size + size = len(df) + return [s._with_native(pa.repeat(s.item(), size)) for s in result] + # TODO(marco): is there a way to do this efficiently without # doing 2 sorts? Here we're sorting the dataframe and then # again calling `sort_indices`. `ArrowSeries.scatter` would also sort. 
diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 5705eedc8c..1c611b3f29 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING from narwhals._compliant import EagerExpr -from narwhals._expression_parsing import evaluate_output_names_and_aliases +from narwhals._expression_parsing import ExprKind, evaluate_output_names_and_aliases from narwhals._pandas_like.group_by import PandasLikeGroupBy from narwhals._pandas_like.series import PandasLikeSeries from narwhals._utils import generate_temporary_column_name @@ -227,6 +227,18 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: *order_by, descending=False, nulls_last=False ) results = self(df.drop([token], strict=True)) + if ( + meta := self._metadata + ) is not None and meta.last_node is ExprKind.ORDERABLE_AGGREGATION: + # Orderable aggregation result in a scalar, yet require order_by. + # Therefore we need to broadcast the result to the original size + index = df.native.index + ns = self._implementation.to_native_namespace() + return [ + s._with_native(ns.Series(s.item(), index=index, name=s.name)) + for s in results + ] + sorting_indices = df.get_column(token) for s in results: s._scatter_in_place(sorting_indices, s) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 7dc4e58ba3..6ad26a982f 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -409,7 +409,6 @@ def warn_complex_group_by() -> None: ) - def warn_ordered_apply( name: OrderedAggregation, /, *, has_pyarrow_string: bool, is_cudf: bool ) -> None: @@ -433,40 +432,8 @@ def warn_ordered_apply( "Please see: " "https://github.com/pandas-dev/pandas/issues/57019" ) - warnings.warn( - f"Found ordered group-by aggregation `{name}()`, which can't be expressed both efficiently and " - f"safely with the pandas API.\n{msg}", - UserWarning, - stacklevel=find_stacklevel(), - ) - - 
-def warn_ordered_apply( - name: OrderedAggregation, /, *, has_pyarrow_string: bool, is_cudf: bool -) -> None: - if is_cudf: # pragma: no cover - msg = ( - f"cuDF does not support selecting the {name} value without skipping NA.\n\n" - "Please see: " - "https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/groupby/" - ) - elif has_pyarrow_string: - msg = ( - f"{_PYARROW_STRING_NAME!r} has different ordering semantics than other pandas dtypes.\n\n" - "Please see: " - "https://pandas.pydata.org/pdeps/0014-string-dtype.html" - ) - else: # pragma: no cover - found = requires._unparse_version(Implementation.PANDAS._backend_version()) - minimum = requires._unparse_version(_MINIMUM_SKIPNA) - msg = ( - f"If you can, please upgrade to 'pandas>={minimum}', found version {found!r}.\n\n" - "Please see: " - "https://github.com/pandas-dev/pandas/issues/57019" - ) - warnings.warn( + msg = ( f"Found ordered group-by aggregation `{name}()`, which can't be expressed both efficiently and " - f"safely with the pandas API.\n{msg}", - UserWarning, - stacklevel=find_stacklevel(), + f"safely with the pandas API.\n{msg}" ) + issue_warning(msg, UserWarning) diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index fa81771621..a6845bd569 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -312,7 +312,7 @@ def cast(self, dtype: IntoDType) -> Self: ) return self._with_native(self.native.astype(pd_dtype), preserve_broadcast=True) - def item(self, index: int | None) -> Any: + def item(self, index: int | None = None) -> Any: # cuDF doesn't have Series.item(). 
if index is None: if len(self) != 1: diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index 8f4d24f3c5..7551398a3a 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -59,15 +59,6 @@ def test_first_expr_with_columns( expected: PythonLiteral, request: pytest.FixtureRequest, ) -> None: - if any( - x in str(constructor_eager) for x in ("pyarrow_table", "pandas", "modin", "cudf") - ): - request.applymarker( - pytest.mark.xfail( - reason="Some kind of index error, see https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083582828" - ) - ) - request.applymarker( pytest.mark.xfail( ("polars" in str(constructor_eager) and POLARS_VERSION < (1, 10)), diff --git a/tests/expr_and_series/last_test.py b/tests/expr_and_series/last_test.py index 7df67ee171..1e640642a5 100644 --- a/tests/expr_and_series/last_test.py +++ b/tests/expr_and_series/last_test.py @@ -59,15 +59,6 @@ def test_last_expr_with_columns( expected: PythonLiteral, request: pytest.FixtureRequest, ) -> None: - if any( - x in str(constructor_eager) for x in ("pyarrow_table", "pandas", "modin", "cudf") - ): - request.applymarker( - pytest.mark.xfail( - reason="Some kind of index error, see https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083582828" - ) - ) - request.applymarker( pytest.mark.xfail( ("polars" in str(constructor_eager) and POLARS_VERSION < (1, 10)), From b8066c4c57d4b0b6c38d58a0f5de05eefc2cae70 Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 29 Jul 2025 14:52:32 +0200 Subject: [PATCH 073/101] expand on comments --- narwhals/_arrow/expr.py | 6 ++++-- narwhals/_pandas_like/expr.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index ff41c745b1..699f81252d 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -136,8 +136,10 @@ def func(df: ArrowDataFrame) -> Sequence[ArrowSeries]: if ( meta := self._metadata 
) is not None and meta.last_node is ExprKind.ORDERABLE_AGGREGATION: - # Orderable aggregation result in a scalar, yet require order_by. - # Therefore we need to broadcast the result to the original size + # Orderable aggregations require `order_by` columns and result in a + # scalar output (well actually in a length 1 series). + # Therefore we need to broadcast the result to the original size, since + # `over` is not a length changing operation. size = len(df) return [s._with_native(pa.repeat(s.item(), size)) for s in result] diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 1c611b3f29..3462838f4c 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -230,8 +230,10 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: if ( meta := self._metadata ) is not None and meta.last_node is ExprKind.ORDERABLE_AGGREGATION: - # Orderable aggregation result in a scalar, yet require order_by. - # Therefore we need to broadcast the result to the original size + # Orderable aggregations require `order_by` columns and result in a + # scalar output (well actually in a length 1 series). + # Therefore we need to broadcast the result to the original size, since + # `over` is not a length changing operation. 
index = df.native.index ns = self._implementation.to_native_namespace() return [ From 2dae6ef4afa187f5f6ef8df7ba3304c0e4f4707b Mon Sep 17 00:00:00 2001 From: FBruzzesi Date: Tue, 29 Jul 2025 16:36:42 +0200 Subject: [PATCH 074/101] assign metadata in arrow --- narwhals/_arrow/expr.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 699f81252d..d8f4cefa64 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -114,11 +114,8 @@ def _reuse_series_extra_kwargs( return {"_return_py_scalar": False} if returns_scalar else {} def over(self, partition_by: Sequence[str], order_by: Sequence[str]) -> Self: - if ( - partition_by - and self._metadata is not None - and not self._metadata.is_scalar_like - ): + meta = self._metadata + if partition_by and meta is not None and not meta.is_scalar_like: msg = "Only aggregation or literal operations are supported in grouped `over` context for PyArrow." raise NotImplementedError(msg) @@ -132,22 +129,20 @@ def func(df: ArrowDataFrame) -> Sequence[ArrowSeries]: df = df.with_row_index(token, order_by=None).sort( *order_by, descending=False, nulls_last=False ) - result = self(df.drop([token], strict=True)) - if ( - meta := self._metadata - ) is not None and meta.last_node is ExprKind.ORDERABLE_AGGREGATION: - # Orderable aggregations require `order_by` columns and result in a + results = self(df.drop([token], strict=True)) + if meta is not None and meta.last_node is ExprKind.ORDERABLE_AGGREGATION: + # Orderable aggregations require `order_by` columns and results in a # scalar output (well actually in a length 1 series). - # Therefore we need to broadcast the result to the original size, since + # Therefore we need to broadcast the results to the original size, since # `over` is not a length changing operation. 
size = len(df) - return [s._with_native(pa.repeat(s.item(), size)) for s in result] + return [s._with_native(pa.repeat(s.item(), size)) for s in results] # TODO(marco): is there a way to do this efficiently without # doing 2 sorts? Here we're sorting the dataframe and then # again calling `sort_indices`. `ArrowSeries.scatter` would also sort. sorting_indices = pc.sort_indices(df.get_column(token).native) - return [s._with_native(s.native.take(sorting_indices)) for s in result] + return [s._with_native(s.native.take(sorting_indices)) for s in results] else: def func(df: ArrowDataFrame) -> Sequence[ArrowSeries]: From f47ef1417418c0b5e6fdcb89f7e91fab2fd8c738 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 23 Aug 2025 12:46:10 +0000 Subject: [PATCH 075/101] docs: Remove *Returns* from `Expr` version Still makes sense in `Series` https://github.com/narwhals-dev/narwhals/pull/2528#discussion_r2083299578 --- narwhals/expr.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index 247fd919b0..bef2e73c8a 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1563,9 +1563,6 @@ def clip( def first(self) -> Self: """Get the first value. - Returns: - A new expression. - Examples: >>> import pandas as pd >>> import narwhals as nw @@ -1596,9 +1593,6 @@ def first(self) -> Self: def last(self) -> Self: """Get the last value. - Returns: - A new expression. 
- Examples: >>> import pyarrow as pa >>> import narwhals as nw From 0fb045536f5b56b978f354f8178b292301e9598c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 13 Sep 2025 10:24:02 +0000 Subject: [PATCH 076/101] chore(typing): fix incompatible override - https://github.com/narwhals-dev/narwhals/pull/3096#discussion_r2328717976 - https://github.com/narwhals-dev/narwhals/pull/2895/commits/0de173e20102203ffcc5b9abb29e1102b3ed2233 --- narwhals/_compliant/expr.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py index 8d18642480..562f79d021 100644 --- a/narwhals/_compliant/expr.py +++ b/narwhals/_compliant/expr.py @@ -904,10 +904,6 @@ def struct(self) -> EagerExprStructNamespace[Self]: class LazyExpr( # type: ignore[misc] ImplExpr[CompliantLazyFrameT, NativeExprT], Protocol[CompliantLazyFrameT, NativeExprT] ): - # NOTE: See https://github.com/narwhals-dev/narwhals/issues/2526#issuecomment-3019303816 - first: not_implemented = not_implemented() - last: not_implemented = not_implemented() - def _with_alias_output_names(self, func: AliasNames | None, /) -> Self: ... 
def alias(self, name: str) -> Self: def fn(names: Sequence[str]) -> Sequence[str]: @@ -922,6 +918,9 @@ def fn(names: Sequence[str]) -> Sequence[str]: def name(self) -> LazyExprNameNamespace[Self]: return LazyExprNameNamespace(self) + # NOTE: See https://github.com/narwhals-dev/narwhals/issues/2526#issuecomment-3019303816 + first = not_implemented() # type: ignore[misc] + last = not_implemented() # type: ignore[misc] ewm_mean = not_implemented() # type: ignore[misc] map_batches = not_implemented() # type: ignore[misc] replace_strict = not_implemented() # type: ignore[misc] From 6d63ea6b3c3e058c8caed9adb288cbfcbc52da1e Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 16:25:51 +0100 Subject: [PATCH 077/101] simplify grouped first/last# --- narwhals/_pandas_like/group_by.py | 76 ++++++------------------------- tests/frame/group_by_test.py | 20 +------- 2 files changed, 17 insertions(+), 79 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index a2855cc390..d3af953af2 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -79,59 +79,8 @@ def _native_agg( return methodcaller(name, **kwds) -# NOTE: Caching disabled to avoid using an `apply` warning either: -# - Once per unique args -# - Once per column in a multi-output expression -# *Least bad* option is to warn once per expression -def _native_ordered_agg( - name: OrderedAggregation, *, has_pyarrow_string: bool, is_cudf: bool -) -> _NativeAgg: - """Best effort alignment of `first`, `last` across versions. 
- - The way `polars` works **by default**, is a constantly moving target in `pandas` 😫 - """ - pd_version = Implementation.PANDAS._backend_version() - if ( - is_cudf - or has_pyarrow_string - or (pd_version >= (2, 0, 0) and pd_version < _MINIMUM_SKIPNA) - ): - # NOTE: `>=2.0.0; <2.2.1` breaks the `nth` workaround - # ATOW (`2.3.1`), `string[pyarrow]` has always required `apply` - # https://github.com/pandas-dev/pandas/issues/13666 - # https://pandas.pydata.org/pandas-docs/stable/whatsnew/v2.0.0.html#dataframegroupby-nth-and-seriesgroupby-nth-now-behave-as-filtrations - return _apply_ordered_agg( - name, has_pyarrow_string=has_pyarrow_string, is_cudf=is_cudf - ) - if pd_version >= _MINIMUM_SKIPNA: - # NOTE: Introduces option to disable default null skipping - # https://github.com/pandas-dev/pandas/pull/57102 - return methodcaller(name, skipna=False) - if pd_version >= (1, 1, 5): # pragma: no cover - # NOTE: `>=1.1.5; <2.0.0`, `nth` could be used as a non-skipping aggregation - # https://github.com/pandas-dev/pandas/issues/57019#issuecomment-1905038446 - return methodcaller("nth", n=_REMAP_ORDERED_INDEX[name]) - # NOTE: `<1.1.5`, nulls weren't skipped by default - # https://github.com/pandas-dev/pandas/issues/38286 - return methodcaller(name) # pragma: no cover - - -def _apply_ordered_agg( - name: OrderedAggregation, /, *, has_pyarrow_string: bool, is_cudf: bool -) -> Callable[[NativeGroupBy], pd.DataFrame]: - """Taken from https://github.com/pandas-dev/pandas/issues/57019#issue-2094896421. 
- - - Theoretically could be slow - - but works in all cases for `string[pyarrow]` - - and `>=2.0.0; <2.2.1` - """ - warn_ordered_apply(name, has_pyarrow_string=has_pyarrow_string, is_cudf=is_cudf) - index = _REMAP_ORDERED_INDEX[name] - - def fn(dgb: NativeGroupBy, /) -> pd.DataFrame: - return dgb.apply(lambda group: group.iloc[index]) - - return fn +def _native_ordered_agg(name: OrderedAggregation) -> _NativeAgg: + return methodcaller("nth", n=_REMAP_ORDERED_INDEX[name]) def _is_ordered_agg(obj: Any) -> TypeIs[OrderedAggregation]: @@ -219,6 +168,12 @@ def _getitem_aggs( for col in cols ] ) + elif self.is_last() or self.is_first(): + select = names[0] if len(names) == 1 else list(names) + result = self.native_agg(group_by)( + group_by._grouped[[*group_by._keys, *select]] + ) + result.set_index(group_by._keys, inplace=True) # noqa: PD002 else: select = names[0] if len(names) == 1 else list(names) result = self.native_agg(group_by)(group_by._grouped[select]) @@ -231,6 +186,12 @@ def _getitem_aggs( def is_len(self) -> bool: return self.leaf_name == "len" + def is_last(self) -> bool: + return self.leaf_name == "last" + + def is_first(self) -> bool: + return self.leaf_name == "first" + def is_mode(self) -> bool: return self.leaf_name == "mode" @@ -257,14 +218,7 @@ def native_agg(self, group_by: PandasLikeGroupBy) -> _NativeAgg: """Return a partial `DataFrameGroupBy` method, missing only `self`.""" native_name = PandasLikeGroupBy._remap_expr_name(self.leaf_name) if _is_ordered_agg(native_name): - dtypes: pd.Series = group_by.compliant.native.dtypes - targets = dtypes[list(self.output_names)] - has_pyarrow_string = targets.transform(_is_pyarrow_string).any().item() - return _native_ordered_agg( - native_name, - has_pyarrow_string=has_pyarrow_string, - is_cudf=self.implementation.is_cudf(), - ) + return _native_ordered_agg(native_name) return _native_agg(native_name, **self.kwargs) diff --git a/tests/frame/group_by_test.py b/tests/frame/group_by_test.py index 
544a4089fc..2388636aac 100644 --- a/tests/frame/group_by_test.py +++ b/tests/frame/group_by_test.py @@ -3,7 +3,6 @@ import datetime as dt import os import re -from contextlib import nullcontext from decimal import Decimal from typing import TYPE_CHECKING, Any @@ -24,7 +23,6 @@ if TYPE_CHECKING: from collections.abc import Mapping, Sequence - from contextlib import AbstractContextManager from narwhals.typing import NonNestedLiteral @@ -683,18 +681,6 @@ def test_top_level_len(constructor: Constructor) -> None: assert_equal_data(result, expected) -def _warns_context(request: pytest.FixtureRequest) -> AbstractContextManager[Any]: - if "[pyarrow]" in request.node.name and "NA-order" in request.node.name: - pattern = r"ordered.+safely.+string\[pyarrow\]" - elif any(x in str(request.node.name) for x in ("pandas", "modin", "cudf")) and ( - PANDAS_VERSION >= (2, 0, 0) and PANDAS_VERSION < (2, 2, 1) - ): # pragma: no cover - pattern = r"ordered.+safely.+please upgrade.+2.2.1" - else: - return nullcontext() - return pytest.warns(UserWarning, match=re.compile(pattern, re.DOTALL | re.IGNORECASE)) - - @pytest.mark.parametrize( ("keys", "aggs", "expected", "pre_sort"), [ @@ -733,8 +719,7 @@ def test_group_by_agg_first( df = nw.from_native(constructor_eager(data)) if pre_sort: df = df.sort(aggs, **pre_sort) - with _warns_context(request): - result = df.group_by(keys).agg(nw.col(aggs).first()).sort(keys) + result = df.group_by(keys).agg(nw.col(aggs).first()).sort(keys) assert_equal_data(result, expected) @@ -776,6 +761,5 @@ def test_group_by_agg_last( df = nw.from_native(constructor_eager(data)) if pre_sort: df = df.sort(aggs, **pre_sort) - with _warns_context(request): - result = df.group_by(keys).agg(nw.col(aggs).last()).sort(keys) + result = df.group_by(keys).agg(nw.col(aggs).last()).sort(keys) assert_equal_data(result, expected) From 7b003101fc2ad019dc79f457d1b3e83a4f983d7f Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 
2 Oct 2025 18:27:16 +0100 Subject: [PATCH 078/101] simplify test, remove unnecessary over(order_by) --- tests/expr_and_series/first_test.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py index 7551398a3a..6f017e32cd 100644 --- a/tests/expr_and_series/first_test.py +++ b/tests/expr_and_series/first_test.py @@ -54,21 +54,12 @@ def test_first_expr_select( @single_cases def test_first_expr_with_columns( - constructor_eager: ConstructorEager, - col: str, - expected: PythonLiteral, - request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, col: str, expected: PythonLiteral ) -> None: - request.applymarker( - pytest.mark.xfail( - ("polars" in str(constructor_eager) and POLARS_VERSION < (1, 10)), - reason="Needs `order_by`", - raises=NotImplementedError, - ) - ) - + if "polars" in str(constructor_eager) and POLARS_VERSION < (1, 10): + pytest.skip() frame = nw.from_native(constructor_eager(data)) - expr = nw.col(col).first().over(order_by="idx").alias("result") + expr = nw.col(col).first().alias("result") result = frame.with_columns(expr).select("result") expected_broadcast = len(data[col]) * [expected] assert_equal_data(result, {"result": expected_broadcast}) From 016abc9463dc1cd5cc33acd07a381b9bc450bfff Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 19:13:00 +0100 Subject: [PATCH 079/101] combine tests --- narwhals/_pandas_like/expr.py | 12 ++- narwhals/_pandas_like/series.py | 2 +- narwhals/_sql/expr.py | 26 ++++++ tests/expr_and_series/first_last_test.py | 110 +++++++++++++++++++++++ tests/expr_and_series/first_test.py | 86 ------------------ 5 files changed, 146 insertions(+), 90 deletions(-) create mode 100644 tests/expr_and_series/first_last_test.py delete mode 100644 tests/expr_and_series/first_test.py diff --git a/narwhals/_pandas_like/expr.py 
b/narwhals/_pandas_like/expr.py index e6f2e4981d..4ee6f64ba8 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -1,5 +1,6 @@ from __future__ import annotations +import warnings from typing import TYPE_CHECKING from narwhals._compliant import EagerExpr @@ -101,6 +102,8 @@ def window_kwargs_to_pandas_equivalent( "min_periods": kwargs["min_samples"], "ignore_na": kwargs["ignore_nulls"], } + elif function_name == "first": + pandas_kwargs = {"skipna": False} else: # sum, len, ... pandas_kwargs = {} return pandas_kwargs @@ -341,9 +344,12 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: # noqa: C901, ) results = [result_frame.get_column(name) for name in aliases] if order_by: - for s in results: - s._scatter_in_place(sorting_indices, s) - return results + with warnings.catch_warnings(): + # Ignore settingwithcopy warnings/errors, they're false-positives here. + warnings.filterwarnings("ignore", message="\n.*copy of a slice") + for s in results: + s._scatter_in_place(sorting_indices, s) + return results if reverse: return [s._gather_slice(slice(None, None, -1)) for s in results] return results diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py index f432814f9d..9e6916537c 100644 --- a/narwhals/_pandas_like/series.py +++ b/narwhals/_pandas_like/series.py @@ -287,9 +287,9 @@ def scatter(self, indices: int | Sequence[int], values: Any) -> Self: return self._with_native(s) def _scatter_in_place(self, indices: Self, values: Self) -> None: + # Scatter, modifying original Series. Use with care! implementation = self._implementation backend_version = self._backend_version - # Scatter, modifying original Series. Use with care! 
values_native = set_index( values.native, self.native.index[indices.native], diff --git a/narwhals/_sql/expr.py b/narwhals/_sql/expr.py index f427b73f40..adad762775 100644 --- a/narwhals/_sql/expr.py +++ b/narwhals/_sql/expr.py @@ -661,6 +661,32 @@ def func( return self._with_window_function(func) + def first(self) -> Self: + def func( + df: SQLLazyFrameT, inputs: WindowInputs[NativeExprT] + ) -> Sequence[NativeExprT]: + return [ + self._window_expression( + self._function("first", expr), inputs.partition_by, inputs.order_by + ) + for expr in self(df) + ] + + return self._with_window_function(func) + + def last(self) -> Self: + def func( + df: SQLLazyFrameT, inputs: WindowInputs[NativeExprT] + ) -> Sequence[NativeExprT]: + return [ + self._window_expression( + self._function("last", expr), inputs.partition_by, inputs.order_by + ) + for expr in self(df) + ] + + return self._with_window_function(func) + def rank(self, method: RankMethod, *, descending: bool) -> Self: if method in {"min", "max", "average"}: func = self._function("rank") diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py new file mode 100644 index 0000000000..df3b6fc680 --- /dev/null +++ b/tests/expr_and_series/first_last_test.py @@ -0,0 +1,110 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +import narwhals as nw +from tests.utils import PANDAS_VERSION, POLARS_VERSION, Constructor, assert_equal_data + +if TYPE_CHECKING: + from narwhals.typing import PythonLiteral + from tests.utils import ConstructorEager + +data: dict[str, list[PythonLiteral]] = { + "a": [8, 2, 1, None], + "b": [58, 5, 6, 12], + "c": [2.5, 1.0, 3.0, 0.9], + "d": [2, 1, 4, 3], + "idx": [0, 1, 2, 3], +} + +single_cases = pytest.mark.parametrize( + ("col", "expected_first", "expected_last"), + [("a", 8, None), ("b", 58, 12), ("c", 2.5, 0.9)], +) + + +@single_cases +def test_first_last_series( + constructor_eager: ConstructorEager, + col: str, + 
expected_first: PythonLiteral, + expected_last: PythonLiteral, +) -> None: + series = nw.from_native(constructor_eager(data), eager_only=True)[col] + result = series.first() + assert_equal_data({col: [result]}, {col: [expected_first]}) + result = series.last() + assert_equal_data({col: [result]}, {col: [expected_last]}) + + +def test_first_last_series_empty(constructor_eager: ConstructorEager) -> None: + series = nw.from_native(constructor_eager(data), eager_only=True)["a"] + series = series.filter(series > 50) + result = series.first() + assert result is None + result = series.last() + assert result is None + + +@single_cases +def test_first_last_expr_select( + constructor_eager: ConstructorEager, + col: str, + expected_first: PythonLiteral, + expected_last: PythonLiteral, +) -> None: + df = nw.from_native(constructor_eager(data)) + result = df.select( + nw.col(col).first().name.suffix("_first"), nw.col(col).last().name.suffix("_last") + ) + assert_equal_data( + result, {f"{col}_first": [expected_first], f"{col}_last": [expected_last]} + ) + + +@single_cases +def test_first_expr_with_columns( + constructor_eager: ConstructorEager, + col: str, + expected_first: PythonLiteral, + expected_last: PythonLiteral, +) -> None: + df = nw.from_native(constructor_eager(data)) + result = df.with_columns( + nw.col(col).first().name.suffix("_first"), nw.col(col).last().name.suffix("_last") + ).select(nw.selectors.matches("first|last")) + assert_equal_data( + result, + { + f"{col}_first": [expected_first] * len(data["a"]), + f"{col}_last": [expected_last] * len(data["a"]), + }, + ) + + +def test_first_expr_over_order_by(constructor: Constructor) -> None: + if "polars" in str(constructor) and POLARS_VERSION < (1, 10): + pytest.skip() + frame = nw.from_native( + constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [0, 2, 1]}) + ) + result = frame.with_columns(nw.col("b", "c").first().over(order_by="i")).sort("i") + expected = {"a": [1, 2, 1], "b": [4, 4, 4], "c": 
[None, None, None], "i": [0, 1, 2]} + assert_equal_data(result, expected) + + +def test_first_expr_over_order_by_partition_by(constructor: Constructor) -> None: + if "polars" in str(constructor) and POLARS_VERSION < (1, 10): + pytest.skip() + if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2, 1): + pytest.skip() + frame = nw.from_native( + constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [0, 1, 2]}) + ) + result = frame.with_columns(nw.col("b", "c").first().over("a", order_by="i")).sort( + "i" + ) + expected = {"a": [1, 1, 2], "b": [4, 4, 6], "c": [None, None, 8], "i": [0, 1, 2]} + assert_equal_data(result, expected) diff --git a/tests/expr_and_series/first_test.py b/tests/expr_and_series/first_test.py deleted file mode 100644 index 6f017e32cd..0000000000 --- a/tests/expr_and_series/first_test.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -import narwhals as nw -from tests.utils import POLARS_VERSION, assert_equal_data - -if TYPE_CHECKING: - from collections.abc import Mapping, Sequence - - from narwhals.typing import PythonLiteral - from tests.utils import ConstructorEager - -data: dict[str, list[PythonLiteral]] = { - "a": [8, 2, 1, None], - "b": [58, 5, 6, 12], - "c": [2.5, 1.0, 3.0, 0.9], - "d": [2, 1, 4, 3], - "idx": [0, 1, 2, 3], -} - -single_cases = pytest.mark.parametrize( - ("col", "expected"), [("a", 8), ("b", 58), ("c", 2.5)] -) - - -@single_cases -def test_first_series( - constructor_eager: ConstructorEager, col: str, expected: PythonLiteral -) -> None: - series = nw.from_native(constructor_eager(data), eager_only=True)[col] - result = series.first() - assert_equal_data({col: [result]}, {col: [expected]}) - - -def test_first_series_empty(constructor_eager: ConstructorEager) -> None: - series = nw.from_native(constructor_eager(data), eager_only=True)["a"] - series = series.filter(series > 50) - result = series.first() - assert result is None - - 
-@single_cases -def test_first_expr_select( - constructor_eager: ConstructorEager, col: str, expected: PythonLiteral -) -> None: - df = nw.from_native(constructor_eager(data)) - expr = nw.col(col).first() - result = df.select(expr) - assert_equal_data(result, {col: [expected]}) - - -@single_cases -def test_first_expr_with_columns( - constructor_eager: ConstructorEager, col: str, expected: PythonLiteral -) -> None: - if "polars" in str(constructor_eager) and POLARS_VERSION < (1, 10): - pytest.skip() - frame = nw.from_native(constructor_eager(data)) - expr = nw.col(col).first().alias("result") - result = frame.with_columns(expr).select("result") - expected_broadcast = len(data[col]) * [expected] - assert_equal_data(result, {"result": expected_broadcast}) - - -@pytest.mark.parametrize( - "expected", - [{"a": [8], "c": [2.5]}, {"d": [2], "b": [58]}, {"c": [2.5], "a": [8], "d": [2]}], -) -def test_first_expr_expand( - constructor_eager: ConstructorEager, expected: Mapping[str, Sequence[PythonLiteral]] -) -> None: - df = nw.from_native(constructor_eager(data)) - expr = nw.col(expected).first() - result = df.select(expr) - assert_equal_data(result, expected) - - -def test_first_expr_expand_sort(constructor_eager: ConstructorEager) -> None: - df = nw.from_native(constructor_eager(data)) - expr = nw.col("d", "a", "b", "c").first() - result = df.sort("d").select(expr) - expected = {"d": [1], "a": [2], "b": [5], "c": [1.0]} - assert_equal_data(result, expected) From 29d6cb72e6e597dc8f64ea295e09d0e6d208e839 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 19:16:57 +0100 Subject: [PATCH 080/101] combine tests --- tests/expr_and_series/first_last_test.py | 34 +++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py index df3b6fc680..88bd2fc732 100644 --- a/tests/expr_and_series/first_last_test.py +++ 
b/tests/expr_and_series/first_last_test.py @@ -90,8 +90,20 @@ def test_first_expr_over_order_by(constructor: Constructor) -> None: frame = nw.from_native( constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [0, 2, 1]}) ) - result = frame.with_columns(nw.col("b", "c").first().over(order_by="i")).sort("i") - expected = {"a": [1, 2, 1], "b": [4, 4, 4], "c": [None, None, None], "i": [0, 1, 2]} + result = frame.with_columns( + nw.col("b", "c").first().over(order_by="i").name.suffix("_first"), + nw.col("b", "c").last().over(order_by="i").name.suffix("_last"), + ).sort("i") + expected = { + "a": [1, 2, 1], + "b": [4, 6, 5], + "c": [None, 8.0, 7.0], + "i": [0, 1, 2], + "b_first": [4, 4, 4], + "c_first": [None, None, None], + "b_last": [5, 5, 5], + "c_last": [7.0, 7.0, 7.0], + } assert_equal_data(result, expected) @@ -103,8 +115,18 @@ def test_first_expr_over_order_by_partition_by(constructor: Constructor) -> None frame = nw.from_native( constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [0, 1, 2]}) ) - result = frame.with_columns(nw.col("b", "c").first().over("a", order_by="i")).sort( - "i" - ) - expected = {"a": [1, 1, 2], "b": [4, 4, 6], "c": [None, None, 8], "i": [0, 1, 2]} + result = frame.with_columns( + nw.col("b", "c").first().over("a", order_by="i").name.suffix("_first"), + nw.col("b", "c").last().over("a", order_by="i").name.suffix("_last"), + ).sort("i") + expected = { + "a": [1, 2, 1], + "b": [4, 6, 5], + "c": [None, 8, 7], + "i": [0, 1, 2], + "b_first": [4, 4, 4], + "c_first": [None, None, None], + "b_last": [4, 6, 5], + "c_last": [None, 8, 7], + } assert_equal_data(result, expected) From 3b91e23cf503722ca3aa098480db18c1969a0aed Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 19:38:42 +0100 Subject: [PATCH 081/101] duckdb fix# --- narwhals/_duckdb/expr.py | 21 +++++++++++++++++++++ narwhals/_sql/expr.py | 6 ++++-- tests/expr_and_series/first_last_test.py | 18 
+++++++++--------- tests/utils.py | 2 +- 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index cab9d51f22..eb43acceb8 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -13,6 +13,7 @@ DeferredTimeZone, F, col, + generate_order_by_sql, lit, narwhals_to_native_dtype, when, @@ -93,6 +94,26 @@ def _window_expression( nulls_last=nulls_last, ) + def _first(self, expr: Expression, *order_by: str) -> Expression: + from duckdb import SQLExpression + + order_by_sql = generate_order_by_sql( + *order_by, + descending=[False] * len(order_by), + nulls_last=[False] * len(order_by), + ) + return SQLExpression(f"first({expr} {order_by_sql})") + + def _last(self, expr: Expression, *order_by: str) -> Expression: + from duckdb import SQLExpression + + order_by_sql = generate_order_by_sql( + *order_by, + descending=[False] * len(order_by), + nulls_last=[False] * len(order_by), + ) + return SQLExpression(f"last({expr} {order_by_sql})") + def __narwhals_namespace__(self) -> DuckDBNamespace: # pragma: no cover from narwhals._duckdb.namespace import DuckDBNamespace diff --git a/narwhals/_sql/expr.py b/narwhals/_sql/expr.py index adad762775..44b4da4291 100644 --- a/narwhals/_sql/expr.py +++ b/narwhals/_sql/expr.py @@ -194,6 +194,8 @@ def _coalesce(self, *expr: NativeExprT) -> NativeExprT: return self.__narwhals_namespace__()._coalesce(*expr) def _count_star(self) -> NativeExprT: ... + def _first(self, expr: NativeExprT, *order_by: str) -> NativeExprT: ... + def _last(self, expr: NativeExprT, *order_by: str) -> NativeExprT: ... 
def _when( self, @@ -667,7 +669,7 @@ def func( ) -> Sequence[NativeExprT]: return [ self._window_expression( - self._function("first", expr), inputs.partition_by, inputs.order_by + self._first(expr, *inputs.order_by), inputs.partition_by ) for expr in self(df) ] @@ -680,7 +682,7 @@ def func( ) -> Sequence[NativeExprT]: return [ self._window_expression( - self._function("last", expr), inputs.partition_by, inputs.order_by + self._last(expr, *inputs.order_by), inputs.partition_by ) for expr in self(df) ] diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py index 88bd2fc732..9faeff97cf 100644 --- a/tests/expr_and_series/first_last_test.py +++ b/tests/expr_and_series/first_last_test.py @@ -97,12 +97,12 @@ def test_first_expr_over_order_by(constructor: Constructor) -> None: expected = { "a": [1, 2, 1], "b": [4, 6, 5], - "c": [None, 8.0, 7.0], + "c": [None, 8, 7], "i": [0, 1, 2], "b_first": [4, 4, 4], "c_first": [None, None, None], "b_last": [5, 5, 5], - "c_last": [7.0, 7.0, 7.0], + "c_last": [7, 7, 7], } assert_equal_data(result, expected) @@ -120,13 +120,13 @@ def test_first_expr_over_order_by_partition_by(constructor: Constructor) -> None nw.col("b", "c").last().over("a", order_by="i").name.suffix("_last"), ).sort("i") expected = { - "a": [1, 2, 1], - "b": [4, 6, 5], - "c": [None, 8, 7], + "a": [1, 1, 2], + "b": [4, 5, 6], + "c": [None, 7, 8], "i": [0, 1, 2], - "b_first": [4, 4, 4], - "c_first": [None, None, None], - "b_last": [4, 6, 5], - "c_last": [None, 8, 7], + "b_first": [4, 4, 6], + "c_first": [None, None, 8], + "b_last": [5, 5, 6], + "c_last": [7, 7, 8], } assert_equal_data(result, expected) diff --git a/tests/utils.py b/tests/utils.py index 95c29537d4..e32ad0bbbd 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -147,7 +147,7 @@ def assert_equal_data(result: Any, expected: Mapping[str, Any]) -> None: are_equivalent_values = lhs == rhs assert are_equivalent_values, ( - f"Mismatch at index {i}: {lhs} != 
{rhs}\nExpected: {expected}\nGot: {result}" + f"Mismatch at index {i}, key {key}: {lhs} != {rhs}\nExpected: {expected}\nGot: {result}" ) From c87935dc2beede30795f8b18bfc13c82b697ece7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 19:58:18 +0100 Subject: [PATCH 082/101] sort out ibis --- narwhals/_ibis/expr.py | 10 ++++++++++ narwhals/_spark_like/expr.py | 10 ++++++++++ tests/expr_and_series/first_last_test.py | 17 +++++++++++++++-- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/narwhals/_ibis/expr.py b/narwhals/_ibis/expr.py index 3471d3c831..b9560aec06 100644 --- a/narwhals/_ibis/expr.py +++ b/narwhals/_ibis/expr.py @@ -112,6 +112,16 @@ def _window_expression( ) return expr.over(window) + def _first(self, expr: ir.Value, *order_by: str) -> ir.Value: + return cast("ir.Column", expr).first( + order_by=self._sort(*order_by), include_null=True + ) + + def _last(self, expr: ir.Value, *order_by: str) -> ir.Value: + return cast("ir.Column", expr).last( + order_by=self._sort(*order_by), include_null=True + ) + def __narwhals_namespace__(self) -> IbisNamespace: # pragma: no cover from narwhals._ibis.namespace import IbisNamespace diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 13dd167fe5..7f8fc95d62 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -96,6 +96,16 @@ def _window_expression( window = window.rowsBetween(rows_start, self._Window.unboundedFollowing) return expr.over(window) + def _first(self, expr: Column, *order_by: str) -> Column: + # Docs say it's non-deterministic, with no way to specify order. + msg = "`first` is not supported for PySpark." + raise NotImplementedError(msg) + + def _last(self, expr: Column, *order_by: str) -> Column: + # Docs say it's non-deterministic, with no way to specify order. + msg = "`last` is not supported for PySpark." 
+ raise NotImplementedError(msg) + def broadcast(self, kind: Literal[ExprKind.AGGREGATION, ExprKind.LITERAL]) -> Self: if kind is ExprKind.LITERAL: return self diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py index 9faeff97cf..8c8d63752d 100644 --- a/tests/expr_and_series/first_last_test.py +++ b/tests/expr_and_series/first_last_test.py @@ -84,9 +84,17 @@ def test_first_expr_with_columns( ) -def test_first_expr_over_order_by(constructor: Constructor) -> None: +def test_first_expr_over_order_by( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: if "polars" in str(constructor) and POLARS_VERSION < (1, 10): pytest.skip() + if "pyspark" in str(constructor): + # Currently unsupported. + request.applymarker(pytest.mark.xfail) + if "ibis" in str(constructor): + # https://github.com/ibis-project/ibis/issues/11656 + request.applymarker(pytest.mark.xfail) frame = nw.from_native( constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [0, 2, 1]}) ) @@ -107,11 +115,16 @@ def test_first_expr_over_order_by(constructor: Constructor) -> None: assert_equal_data(result, expected) -def test_first_expr_over_order_by_partition_by(constructor: Constructor) -> None: +def test_first_expr_over_order_by_partition_by( + constructor: Constructor, request: pytest.FixtureRequest +) -> None: if "polars" in str(constructor) and POLARS_VERSION < (1, 10): pytest.skip() if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2, 1): pytest.skip() + if "pyspark" in str(constructor): + # Currently unsupported. 
+ request.applymarker(pytest.mark.xfail) frame = nw.from_native( constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [0, 1, 2]}) ) From 0393dfece1b5d1323e1959833dd3eca6bfcf3dc1 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 19:59:16 +0100 Subject: [PATCH 083/101] dask --- tests/expr_and_series/first_last_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py index 8c8d63752d..e78bf9b26b 100644 --- a/tests/expr_and_series/first_last_test.py +++ b/tests/expr_and_series/first_last_test.py @@ -89,7 +89,7 @@ def test_first_expr_over_order_by( ) -> None: if "polars" in str(constructor) and POLARS_VERSION < (1, 10): pytest.skip() - if "pyspark" in str(constructor): + if any(x in str(constructor) for x in ("pyspark", "dask")): # Currently unsupported. request.applymarker(pytest.mark.xfail) if "ibis" in str(constructor): @@ -122,7 +122,7 @@ def test_first_expr_over_order_by_partition_by( pytest.skip() if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2, 1): pytest.skip() - if "pyspark" in str(constructor): + if any(x in str(constructor) for x in ("pyspark", "dask")): # Currently unsupported. request.applymarker(pytest.mark.xfail) frame = nw.from_native( From 466c9221b6e4aba13c4b27e2bb6f67852037cec8 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 20:02:54 +0100 Subject: [PATCH 084/101] add note to docs --- narwhals/expr.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/narwhals/expr.py b/narwhals/expr.py index 1d7cf567a9..ed6e6f97ec 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -1597,6 +1597,10 @@ def clip( def first(self) -> Self: """Get the first value. + Notes: + For lazy backends, this can only be used with `over`. 
We may introduce + `min_by` in the future so it can be used as an aggregation. + Examples: >>> import pandas as pd >>> import narwhals as nw @@ -1627,6 +1631,10 @@ def first(self) -> Self: def last(self) -> Self: """Get the last value. + Notes: + For lazy backends, this can only be used with `over`. We may introduce + `max_by` in the future so it can be used as an aggregation. + Examples: >>> import pyarrow as pa >>> import narwhals as nw From 4266e4babbb572e30b0f4fac15459b3d38f20c85 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 20:53:56 +0100 Subject: [PATCH 085/101] remove unnecessary code --- narwhals/_pandas_like/group_by.py | 41 +------------------------------ 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index d3af953af2..4b68cccb81 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -9,14 +9,13 @@ from narwhals._compliant import EagerGroupBy from narwhals._exceptions import issue_warning from narwhals._expression_parsing import evaluate_output_names_and_aliases -from narwhals._utils import Implementation, requires, zip_strict +from narwhals._utils import zip_strict from narwhals.dependencies import is_pandas_like_dataframe if TYPE_CHECKING: from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence import pandas as pd - from pandas.api.extensions import ExtensionDtype from pandas.api.typing import DataFrameGroupBy as _NativeGroupBy from typing_extensions import TypeAlias, TypeIs, Unpack @@ -88,10 +87,6 @@ def _is_ordered_agg(obj: Any) -> TypeIs[OrderedAggregation]: return obj in _REMAP_ORDERED_INDEX -def _is_pyarrow_string(dtype: ExtensionDtype) -> bool: - return dtype.name == _PYARROW_STRING_NAME - - class AggExpr: """Wrapper storing the intermediate state per-`PandasLikeExpr`. 
@@ -210,10 +205,6 @@ def leaf_name(self) -> NarwhalsAggregation | Any: self._leaf_name = PandasLikeGroupBy._leaf_name(self.expr) return self._leaf_name - @property - def implementation(self) -> Implementation: - return self.expr._implementation - def native_agg(self, group_by: PandasLikeGroupBy) -> _NativeAgg: """Return a partial `DataFrameGroupBy` method, missing only `self`.""" native_name = PandasLikeGroupBy._remap_expr_name(self.leaf_name) @@ -405,33 +396,3 @@ def warn_complex_group_by() -> None: "https://narwhals-dev.github.io/narwhals/concepts/improve_group_by_operation/", UserWarning, ) - - -def warn_ordered_apply( - name: OrderedAggregation, /, *, has_pyarrow_string: bool, is_cudf: bool -) -> None: - if is_cudf: # pragma: no cover - msg = ( - f"cuDF does not support selecting the {name} value without skipping NA.\n\n" - "Please see: " - "https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/groupby/" - ) - elif has_pyarrow_string: - msg = ( - f"{_PYARROW_STRING_NAME!r} has different ordering semantics than other pandas dtypes.\n\n" - "Please see: " - "https://pandas.pydata.org/pdeps/0014-string-dtype.html" - ) - else: # pragma: no cover - found = requires._unparse_version(Implementation.PANDAS._backend_version()) - minimum = requires._unparse_version(_MINIMUM_SKIPNA) - msg = ( - f"If you can, please upgrade to 'pandas>={minimum}', found version {found!r}.\n\n" - "Please see: " - "https://github.com/pandas-dev/pandas/issues/57019" - ) - msg = ( - f"Found ordered group-by aggregation `{name}()`, which can't be expressed both efficiently and " - f"safely with the pandas API.\n{msg}" - ) - issue_warning(msg, UserWarning) From 555098b24fe138c30429166d5f41f4098a137aa9 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 20:55:02 +0100 Subject: [PATCH 086/101] pyarrow --- tests/expr_and_series/first_last_test.py | 10 ++- tests/expr_and_series/last_test.py | 99 ------------------------ 2 files 
changed, 9 insertions(+), 100 deletions(-) delete mode 100644 tests/expr_and_series/last_test.py diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py index e78bf9b26b..bd19a5bef9 100644 --- a/tests/expr_and_series/first_last_test.py +++ b/tests/expr_and_series/first_last_test.py @@ -5,7 +5,13 @@ import pytest import narwhals as nw -from tests.utils import PANDAS_VERSION, POLARS_VERSION, Constructor, assert_equal_data +from tests.utils import ( + PANDAS_VERSION, + POLARS_VERSION, + PYARROW_VERSION, + Constructor, + assert_equal_data, +) if TYPE_CHECKING: from narwhals.typing import PythonLiteral @@ -125,6 +131,8 @@ def test_first_expr_over_order_by_partition_by( if any(x in str(constructor) for x in ("pyspark", "dask")): # Currently unsupported. request.applymarker(pytest.mark.xfail) + if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (14,): + pytest.skip() frame = nw.from_native( constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [0, 1, 2]}) ) diff --git a/tests/expr_and_series/last_test.py b/tests/expr_and_series/last_test.py deleted file mode 100644 index 1e640642a5..0000000000 --- a/tests/expr_and_series/last_test.py +++ /dev/null @@ -1,99 +0,0 @@ -from __future__ import annotations - -from typing import TYPE_CHECKING - -import pytest - -import narwhals as nw -from tests.utils import POLARS_VERSION, assert_equal_data - -if TYPE_CHECKING: - from collections.abc import Mapping, Sequence - - from narwhals.typing import PythonLiteral - from tests.utils import ConstructorEager - -data: dict[str, list[PythonLiteral]] = { - "a": [8, 2, 1, None], - "b": [58, 5, 6, 12], - "c": [2.5, 1.0, 3.0, 0.9], - "d": [2, 1, 4, 3], - "idx": [0, 1, 2, 3], -} - -single_cases = pytest.mark.parametrize( - ("col", "expected"), [("a", None), ("b", 12), ("c", 0.9)] -) - - -@single_cases -def test_last_series( - constructor_eager: ConstructorEager, col: str, expected: PythonLiteral -) -> None: - series = 
nw.from_native(constructor_eager(data), eager_only=True)[col] - result = series.last() - assert_equal_data({col: [result]}, {col: [expected]}) - - -def test_last_series_empty(constructor_eager: ConstructorEager) -> None: - series = nw.from_native(constructor_eager(data), eager_only=True)["b"] - series = series.filter(series > 60) - result = series.last() - assert result is None - - -@single_cases -def test_last_expr_select( - constructor_eager: ConstructorEager, col: str, expected: PythonLiteral -) -> None: - df = nw.from_native(constructor_eager(data)) - expr = nw.col(col).last() - result = df.select(expr) - assert_equal_data(result, {col: [expected]}) - - -@single_cases -def test_last_expr_with_columns( - constructor_eager: ConstructorEager, - col: str, - expected: PythonLiteral, - request: pytest.FixtureRequest, -) -> None: - request.applymarker( - pytest.mark.xfail( - ("polars" in str(constructor_eager) and POLARS_VERSION < (1, 10)), - reason="Needs `order_by`", - raises=NotImplementedError, - ) - ) - - frame = nw.from_native(constructor_eager(data)) - expr = nw.col(col).last().over(order_by="idx").alias("result") - result = frame.with_columns(expr).select("result") - expected_broadcast = len(data[col]) * [expected] - assert_equal_data(result, {"result": expected_broadcast}) - - -@pytest.mark.parametrize( - "expected", - [ - {"a": [None], "c": [0.9]}, - {"d": [3], "b": [12]}, - {"c": [0.9], "a": [None], "d": [3]}, - ], -) -def test_last_expr_expand( - constructor_eager: ConstructorEager, expected: Mapping[str, Sequence[PythonLiteral]] -) -> None: - df = nw.from_native(constructor_eager(data)) - expr = nw.col(expected).last() - result = df.select(expr) - assert_equal_data(result, expected) - - -def test_last_expr_expand_sort(constructor_eager: ConstructorEager) -> None: - df = nw.from_native(constructor_eager(data)) - expr = nw.col("d", "a", "b", "c").last() - result = df.sort("d").select(expr) - expected = {"d": [4], "a": [1], "b": [6], "c": [3.0]} - 
assert_equal_data(result, expected) From 36e38e0f263171fda68b54cb55f52f2df745b530 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 22:08:41 +0100 Subject: [PATCH 087/101] fixup --- narwhals/_pandas_like/expr.py | 2 +- narwhals/_spark_like/expr.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 4ee6f64ba8..e88b020dfe 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -102,7 +102,7 @@ def window_kwargs_to_pandas_equivalent( "min_periods": kwargs["min_samples"], "ignore_na": kwargs["ignore_nulls"], } - elif function_name == "first": + elif function_name in {"first", "last"}: pandas_kwargs = {"skipna": False} else: # sum, len, ... pandas_kwargs = {} diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 7f8fc95d62..f1db158b52 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -101,7 +101,7 @@ def _first(self, expr: Column, *order_by: str) -> Column: msg = "`first` is not supported for PySpark." raise NotImplementedError(msg) - def _last(self, expr: Column, *order_by: str) -> Column: + def _last(self, expr: Column, *order_by: str) -> Column: # pragma: no cover # Docs say it's non-deterministic, with no way to specify order. msg = "`last` is not supported for PySpark." 
raise NotImplementedError(msg) From 42d2cd6705165bc0f0127f6ce81957af80deb76d Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 22:13:01 +0100 Subject: [PATCH 088/101] typing --- narwhals/_compliant/expr.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/narwhals/_compliant/expr.py b/narwhals/_compliant/expr.py index 562f79d021..52a5b4f6c5 100644 --- a/narwhals/_compliant/expr.py +++ b/narwhals/_compliant/expr.py @@ -918,9 +918,6 @@ def fn(names: Sequence[str]) -> Sequence[str]: def name(self) -> LazyExprNameNamespace[Self]: return LazyExprNameNamespace(self) - # NOTE: See https://github.com/narwhals-dev/narwhals/issues/2526#issuecomment-3019303816 - first = not_implemented() # type: ignore[misc] - last = not_implemented() # type: ignore[misc] ewm_mean = not_implemented() # type: ignore[misc] map_batches = not_implemented() # type: ignore[misc] replace_strict = not_implemented() # type: ignore[misc] From 63f012aaa2265d914812151416bb47a79c8d4ebf Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 2 Oct 2025 23:20:52 +0100 Subject: [PATCH 089/101] dask --- narwhals/_dask/expr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index db636bb504..5dc6bee4cb 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -683,6 +683,8 @@ def dt(self) -> DaskExprDateTimeNamespace: return DaskExprDateTimeNamespace(self) rank = not_implemented() + first = not_implemented() + last = not_implemented() # namespaces list: not_implemented = not_implemented() # type: ignore[assignment] From c4ac0430558f626925bca26be812815ad8244718 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:26:30 +0100 Subject: [PATCH 090/101] test and support `diff().sum().over(order_by=...)` --- narwhals/_arrow/expr.py | 10 ++++------ 
narwhals/_pandas_like/expr.py | 13 +++++-------- tests/expr_and_series/over_test.py | 16 ++++++++++++++++ 3 files changed, 25 insertions(+), 14 deletions(-) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index d41db7565d..22393261fc 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -7,7 +7,7 @@ from narwhals._arrow.series import ArrowSeries from narwhals._compliant import EagerExpr -from narwhals._expression_parsing import ExprKind, evaluate_output_names_and_aliases +from narwhals._expression_parsing import evaluate_output_names_and_aliases from narwhals._utils import ( Implementation, generate_temporary_column_name, @@ -128,11 +128,9 @@ def func(df: ArrowDataFrame) -> Sequence[ArrowSeries]: *order_by, descending=False, nulls_last=False ) results = self(df.drop([token], strict=True)) - if meta is not None and meta.last_node is ExprKind.ORDERABLE_AGGREGATION: - # Orderable aggregations require `order_by` columns and results in a - # scalar output (well actually in a length 1 series). - # Therefore we need to broadcast the results to the original size, since - # `over` is not a length changing operation. + if meta is not None and meta.is_scalar_like: + # We need to broadcast the results to the original size, since + # `over` is a length-preserving operation. 
size = len(df) return [s._with_native(pa.repeat(s.item(), size)) for s in results] diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index e88b020dfe..bd0feb57e7 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -4,7 +4,7 @@ from typing import TYPE_CHECKING from narwhals._compliant import EagerExpr -from narwhals._expression_parsing import ExprKind, evaluate_output_names_and_aliases +from narwhals._expression_parsing import evaluate_output_names_and_aliases from narwhals._pandas_like.group_by import PandasLikeGroupBy from narwhals._pandas_like.series import PandasLikeSeries from narwhals._utils import generate_temporary_column_name @@ -228,13 +228,10 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: *order_by, descending=False, nulls_last=False ) results = self(df.drop([token], strict=True)) - if ( - meta := self._metadata - ) is not None and meta.last_node is ExprKind.ORDERABLE_AGGREGATION: - # Orderable aggregations require `order_by` columns and result in a - # scalar output (well actually in a length 1 series). - # Therefore we need to broadcast the result to the original size, since - # `over` is not a length changing operation. + meta = self._metadata + if meta is not None and meta.is_scalar_like: + # We need to broadcast the result to the original size, since + # `over` is a length-preserving operation. 
index = df.native.index ns = self._implementation.to_native_namespace() return [ diff --git a/tests/expr_and_series/over_test.py b/tests/expr_and_series/over_test.py index 1fd71d8347..b32368f6e4 100644 --- a/tests/expr_and_series/over_test.py +++ b/tests/expr_and_series/over_test.py @@ -419,6 +419,22 @@ def test_over_without_partition_by( assert_equal_data(result, expected) +def test_aggregation_over_without_partition_by( + constructor_eager: ConstructorEager, +) -> None: + if "polars" in str(constructor_eager) and POLARS_VERSION < (1, 10): + pytest.skip() + + df = nw.from_native(constructor_eager({"a": [1, -1, 2], "i": [0, 2, 1]})) + result = ( + df.with_columns(b=nw.col("a").diff().sum().over(order_by="i")) + .sort("i") + .select("a", "b", "i") + ) + expected = {"a": [1, 2, -1], "b": [-2, -2, -2], "i": [0, 1, 2]} + assert_equal_data(result, expected) + + def test_len_over_2369(constructor: Constructor, request: pytest.FixtureRequest) -> None: if "duckdb" in str(constructor) and DUCKDB_VERSION < (1, 3): pytest.skip() From 8739b6ac2d8ddeb3f4b0c9c6a39ce4ff1a59b4f2 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:39:03 +0100 Subject: [PATCH 091/101] cross-pandas version compat --- narwhals/_pandas_like/expr.py | 14 ++++++++++++-- tests/expr_and_series/first_last_test.py | 10 +--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index bd0feb57e7..5be7bad920 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -102,8 +102,6 @@ def window_kwargs_to_pandas_equivalent( "min_periods": kwargs["min_samples"], "ignore_na": kwargs["ignore_nulls"], } - elif function_name in {"first", "last"}: - pandas_kwargs = {"skipna": False} else: # sum, len, ... 
pandas_kwargs = {} return pandas_kwargs @@ -332,6 +330,18 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: # noqa: C901, msg = "Safety check failed, please report a bug." raise AssertionError(msg) res_native = grouped.transform("size").to_frame(aliases[0]) + elif function_name == "first": + _first = grouped[[*partition_by, *output_names]].nth(0) + _first.reset_index(drop=True, inplace=True) + res_native = df.native[list(partition_by)].merge( + _first, on=list(partition_by) + )[list(output_names)] + elif function_name == "last": + _last = grouped[[*partition_by, *output_names]].nth(-1) + _last.reset_index(drop=True, inplace=True) + res_native = df.native[list(partition_by)].merge( + _last, on=list(partition_by) + )[list(output_names)] else: res_native = grouped[list(output_names)].transform( pandas_function_name, **pandas_kwargs diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py index bd19a5bef9..590d1fa9d2 100644 --- a/tests/expr_and_series/first_last_test.py +++ b/tests/expr_and_series/first_last_test.py @@ -5,13 +5,7 @@ import pytest import narwhals as nw -from tests.utils import ( - PANDAS_VERSION, - POLARS_VERSION, - PYARROW_VERSION, - Constructor, - assert_equal_data, -) +from tests.utils import POLARS_VERSION, PYARROW_VERSION, Constructor, assert_equal_data if TYPE_CHECKING: from narwhals.typing import PythonLiteral @@ -126,8 +120,6 @@ def test_first_expr_over_order_by_partition_by( ) -> None: if "polars" in str(constructor) and POLARS_VERSION < (1, 10): pytest.skip() - if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2, 1): - pytest.skip() if any(x in str(constructor) for x in ("pyspark", "dask")): # Currently unsupported. 
request.applymarker(pytest.mark.xfail) From ff2260429b7375e6ed277f8732f57ca953c65dde Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:40:13 +0100 Subject: [PATCH 092/101] make test more unusual --- tests/expr_and_series/first_last_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py index 590d1fa9d2..48702dcf21 100644 --- a/tests/expr_and_series/first_last_test.py +++ b/tests/expr_and_series/first_last_test.py @@ -126,7 +126,7 @@ def test_first_expr_over_order_by_partition_by( if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (14,): pytest.skip() frame = nw.from_native( - constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [0, 1, 2]}) + constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [1, 0, 2]}) ) result = frame.with_columns( nw.col("b", "c").first().over("a", order_by="i").name.suffix("_first"), @@ -134,12 +134,12 @@ def test_first_expr_over_order_by_partition_by( ).sort("i") expected = { "a": [1, 1, 2], - "b": [4, 5, 6], - "c": [None, 7, 8], + "b": [5, 4, 6], + "c": [7.0, None, 8.0], "i": [0, 1, 2], - "b_first": [4, 4, 6], - "c_first": [None, None, 8], - "b_last": [5, 5, 6], - "c_last": [7, 7, 8], + "b_first": [5, 5, 6], + "c_first": [7.0, 7.0, 8.0], + "b_last": [4, 4, 6], + "c_last": [None, None, 8.0], } assert_equal_data(result, expected) From d9c4a1bcc7fb09a5bebacd96a1520a0ed95de561 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:44:48 +0100 Subject: [PATCH 093/101] fix another pyarrow issue --- narwhals/_arrow/expr.py | 3 +++ tests/expr_and_series/first_last_test.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 22393261fc..63c048837f 100644 --- a/narwhals/_arrow/expr.py +++ 
b/narwhals/_arrow/expr.py @@ -142,6 +142,9 @@ def func(df: ArrowDataFrame) -> Sequence[ArrowSeries]: else: def func(df: ArrowDataFrame) -> Sequence[ArrowSeries]: + if order_by: + df = df.sort(*order_by, descending=False, nulls_last=False) + output_names, aliases = evaluate_output_names_and_aliases(self, df, []) if overlap := set(output_names).intersection(partition_by): # E.g. `df.select(nw.all().sum().over('a'))`. This is well-defined, diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py index 48702dcf21..d5540c52f5 100644 --- a/tests/expr_and_series/first_last_test.py +++ b/tests/expr_and_series/first_last_test.py @@ -125,6 +125,9 @@ def test_first_expr_over_order_by_partition_by( request.applymarker(pytest.mark.xfail) if "pyarrow_table" in str(constructor) and PYARROW_VERSION < (14,): pytest.skip() + if "ibis" in str(constructor): + # https://github.com/ibis-project/ibis/issues/11656 + request.applymarker(pytest.mark.xfail) frame = nw.from_native( constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [1, 0, 2]}) ) From 03b79698c30276231abdcc051a113eb84fc89ee6 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:10:59 +0100 Subject: [PATCH 094/101] catch more warnings for modin --- narwhals/_pandas_like/expr.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 5be7bad920..b314c29fa5 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -331,13 +331,19 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: # noqa: C901, raise AssertionError(msg) res_native = grouped.transform("size").to_frame(aliases[0]) elif function_name == "first": - _first = grouped[[*partition_by, *output_names]].nth(0) + with warnings.catch_warnings(): + # Ignore settingwithcopy warnings/errors, they're false-positives here. 
+ warnings.filterwarnings("ignore", message="\n.*copy of a slice") + _first = grouped[[*partition_by, *output_names]].nth(0) _first.reset_index(drop=True, inplace=True) res_native = df.native[list(partition_by)].merge( _first, on=list(partition_by) )[list(output_names)] elif function_name == "last": - _last = grouped[[*partition_by, *output_names]].nth(-1) + with warnings.catch_warnings(): + # Ignore settingwithcopy warnings/errors, they're false-positives here. + warnings.filterwarnings("ignore", message="\n.*copy of a slice") + _last = grouped[[*partition_by, *output_names]].nth(-1) _last.reset_index(drop=True, inplace=True) res_native = df.native[list(partition_by)].merge( _last, on=list(partition_by) From d01a3989d0a2da3bfb3563fee83f29cf212af2bf Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:24:38 +0100 Subject: [PATCH 095/101] factor out sql_expression, link to feature request --- narwhals/_duckdb/expr.py | 11 +++++------ narwhals/_duckdb/utils.py | 16 ++++++++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index eb43acceb8..34a872fec2 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -16,6 +16,7 @@ generate_order_by_sql, lit, narwhals_to_native_dtype, + sql_expression, when, window_expression, ) @@ -95,24 +96,22 @@ def _window_expression( ) def _first(self, expr: Expression, *order_by: str) -> Expression: - from duckdb import SQLExpression - + # https://github.com/duckdb/duckdb/discussions/19252 order_by_sql = generate_order_by_sql( *order_by, descending=[False] * len(order_by), nulls_last=[False] * len(order_by), ) - return SQLExpression(f"first({expr} {order_by_sql})") + return sql_expression(f"first({expr} {order_by_sql})") def _last(self, expr: Expression, *order_by: str) -> Expression: - from duckdb import SQLExpression - + # https://github.com/duckdb/duckdb/discussions/19252 
order_by_sql = generate_order_by_sql( *order_by, descending=[False] * len(order_by), nulls_last=[False] * len(order_by), ) - return SQLExpression(f"last({expr} {order_by_sql})") + return sql_expression(f"last({expr} {order_by_sql})") def __narwhals_namespace__(self) -> DuckDBNamespace: # pragma: no cover from narwhals._duckdb.namespace import DuckDBNamespace diff --git a/narwhals/_duckdb/utils.py b/narwhals/_duckdb/utils.py index 413b0cf7fe..83f950b192 100644 --- a/narwhals/_duckdb/utils.py +++ b/narwhals/_duckdb/utils.py @@ -324,11 +324,6 @@ def window_expression( ) -> Expression: # TODO(unassigned): Replace with `duckdb.WindowExpression` when they release it. # https://github.com/duckdb/duckdb/discussions/14725#discussioncomment-11200348 - try: - from duckdb import SQLExpression - except ModuleNotFoundError as exc: # pragma: no cover - msg = f"DuckDB>=1.3.0 is required for this operation. Found: DuckDB {duckdb.__version__}" - raise NotImplementedError(msg) from exc pb = generate_partition_by_sql(*partition_by) descending = descending or [False] * len(order_by) nulls_last = nulls_last or [False] * len(order_by) @@ -344,7 +339,7 @@ def window_expression( rows = "" func = f"{str(expr).removesuffix(')')} ignore nulls)" if ignore_nulls else str(expr) - return SQLExpression(f"{func} over ({pb} {ob} {rows})") + return sql_expression(f"{func} over ({pb} {ob} {rows})") def catch_duckdb_exception( @@ -368,3 +363,12 @@ def function(name: str, *args: Expression) -> Expression: if name == "isnull": return args[0].isnull() return F(name, *args) + + +def sql_expression(expr: str) -> Expression: + try: + from duckdb import SQLExpression + except ModuleNotFoundError as exc: # pragma: no cover + msg = f"DuckDB>=1.3.0 is required for this operation. 
Found: DuckDB {duckdb.__version__}" + raise NotImplementedError(msg) from exc + return SQLExpression(expr) From 18c08611517f0cb760c160a5e54202f2bd454201 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:46:20 +0100 Subject: [PATCH 096/101] combine first and last blocks --- narwhals/_pandas_like/expr.py | 33 ++++++++++++------------ narwhals/_pandas_like/group_by.py | 30 +++++++-------------- tests/expr_and_series/first_last_test.py | 12 ++++++--- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index b314c29fa5..18e2df5ce1 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -5,7 +5,7 @@ from narwhals._compliant import EagerExpr from narwhals._expression_parsing import evaluate_output_names_and_aliases -from narwhals._pandas_like.group_by import PandasLikeGroupBy +from narwhals._pandas_like.group_by import _REMAP_ORDERED_INDEX, PandasLikeGroupBy from narwhals._pandas_like.series import PandasLikeSeries from narwhals._utils import generate_temporary_column_name @@ -14,7 +14,13 @@ from typing_extensions import Self - from narwhals._compliant.typing import AliasNames, EvalNames, EvalSeries, ScalarKwargs + from narwhals._compliant.typing import ( + AliasNames, + EvalNames, + EvalSeries, + NarwhalsAggregation, + ScalarKwargs, + ) from narwhals._expression_parsing import ExprMetadata from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.namespace import PandasLikeNamespace @@ -44,7 +50,7 @@ def window_kwargs_to_pandas_equivalent( - function_name: str, kwargs: ScalarKwargs + function_name: NarwhalsAggregation, kwargs: ScalarKwargs ) -> dict[str, PythonLiteral]: if function_name == "shift": assert "n" in kwargs # noqa: S101 @@ -102,6 +108,8 @@ def window_kwargs_to_pandas_equivalent( "min_periods": kwargs["min_samples"], "ignore_na": kwargs["ignore_nulls"], } + elif 
function_name in {"first", "last"}: + pandas_kwargs = {"n": _REMAP_ORDERED_INDEX[function_name]} else: # sum, len, ... pandas_kwargs = {} return pandas_kwargs @@ -330,23 +338,16 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: # noqa: C901, msg = "Safety check failed, please report a bug." raise AssertionError(msg) res_native = grouped.transform("size").to_frame(aliases[0]) - elif function_name == "first": + elif function_name in {"first", "last"}: with warnings.catch_warnings(): # Ignore settingwithcopy warnings/errors, they're false-positives here. warnings.filterwarnings("ignore", message="\n.*copy of a slice") - _first = grouped[[*partition_by, *output_names]].nth(0) - _first.reset_index(drop=True, inplace=True) + _nth = getattr( + grouped[[*partition_by, *output_names]], pandas_function_name + )(**pandas_kwargs) + _nth.reset_index(drop=True, inplace=True) res_native = df.native[list(partition_by)].merge( - _first, on=list(partition_by) - )[list(output_names)] - elif function_name == "last": - with warnings.catch_warnings(): - # Ignore settingwithcopy warnings/errors, they're false-positives here. 
- warnings.filterwarnings("ignore", message="\n.*copy of a slice") - _last = grouped[[*partition_by, *output_names]].nth(-1) - _last.reset_index(drop=True, inplace=True) - res_native = df.native[list(partition_by)].merge( - _last, on=list(partition_by) + _nth, on=list(partition_by) )[list(output_names)] else: res_native = grouped[list(output_names)].transform( diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 4b68cccb81..93d0ed3f60 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -17,7 +17,7 @@ import pandas as pd from pandas.api.typing import DataFrameGroupBy as _NativeGroupBy - from typing_extensions import TypeAlias, TypeIs, Unpack + from typing_extensions import TypeAlias, Unpack from narwhals._compliant.typing import NarwhalsAggregation, ScalarKwargs from narwhals._pandas_like.dataframe import PandasLikeDataFrame @@ -27,8 +27,7 @@ NativeApply: TypeAlias = "Callable[[pd.DataFrame], pd.Series[Any]]" InefficientNativeAggregation: TypeAlias = Literal["cov", "skew"] -OrderedAggregation: TypeAlias = Literal["first", "last"] -UnorderedAggregation: TypeAlias = Literal[ +NativeAggregation: TypeAlias = Literal[ "any", "all", "count", @@ -39,6 +38,7 @@ "median", "min", "mode", + "nth", "nunique", "prod", "quantile", @@ -49,7 +49,6 @@ "var", InefficientNativeAggregation, ] -NativeAggregation: TypeAlias = Literal[UnorderedAggregation, OrderedAggregation] """https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#built-in-aggregation-methods""" _NativeAgg: TypeAlias = "Callable[[Any], pd.DataFrame | pd.Series[Any]]" @@ -59,7 +58,7 @@ NonStrHashable: TypeAlias = Any """Because `pandas` allows *"names"* like that 😭""" -_REMAP_ORDERED_INDEX: Mapping[OrderedAggregation, Literal[0, -1]] = { +_REMAP_ORDERED_INDEX: Mapping[NarwhalsAggregation, Literal[0, -1]] = { "first": 0, "last": -1, } @@ -68,9 +67,7 @@ @lru_cache(maxsize=32) -def _native_agg( - name: UnorderedAggregation, /, **kwds: 
Unpack[ScalarKwargs] -) -> _NativeAgg: +def _native_agg(name: NativeAggregation, /, **kwds: Unpack[ScalarKwargs]) -> _NativeAgg: if name == "nunique": return methodcaller(name, dropna=False) if not kwds or kwds.get("ddof") == 1: @@ -78,15 +75,6 @@ def _native_agg( return methodcaller(name, **kwds) -def _native_ordered_agg(name: OrderedAggregation) -> _NativeAgg: - return methodcaller("nth", n=_REMAP_ORDERED_INDEX[name]) - - -def _is_ordered_agg(obj: Any) -> TypeIs[OrderedAggregation]: - """`string[pyarrow]` needs special treatment with nulls in `first`, `last`.""" - return obj in _REMAP_ORDERED_INDEX - - class AggExpr: """Wrapper storing the intermediate state per-`PandasLikeExpr`. @@ -208,8 +196,8 @@ def leaf_name(self) -> NarwhalsAggregation | Any: def native_agg(self, group_by: PandasLikeGroupBy) -> _NativeAgg: """Return a partial `DataFrameGroupBy` method, missing only `self`.""" native_name = PandasLikeGroupBy._remap_expr_name(self.leaf_name) - if _is_ordered_agg(native_name): - return _native_ordered_agg(native_name) + if self.leaf_name in _REMAP_ORDERED_INDEX: + return methodcaller("nth", n=_REMAP_ORDERED_INDEX[self.leaf_name]) return _native_agg(native_name, **self.kwargs) @@ -231,8 +219,8 @@ class PandasLikeGroupBy( "quantile": "quantile", "all": "all", "any": "any", - "first": "first", - "last": "last", + "first": "nth", + "last": "nth", } _original_columns: tuple[str, ...] 
"""Column names *prior* to any aliasing in `ParseKeysGroupBy`.""" diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py index d5540c52f5..42afc9ac4f 100644 --- a/tests/expr_and_series/first_last_test.py +++ b/tests/expr_and_series/first_last_test.py @@ -96,7 +96,9 @@ def test_first_expr_over_order_by( # https://github.com/ibis-project/ibis/issues/11656 request.applymarker(pytest.mark.xfail) frame = nw.from_native( - constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [0, 2, 1]}) + constructor( + {"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [None, 2, 1]} + ) ) result = frame.with_columns( nw.col("b", "c").first().over(order_by="i").name.suffix("_first"), @@ -106,7 +108,7 @@ def test_first_expr_over_order_by( "a": [1, 2, 1], "b": [4, 6, 5], "c": [None, 8, 7], - "i": [0, 1, 2], + "i": [None, 1, 2], "b_first": [4, 4, 4], "c_first": [None, None, None], "b_last": [5, 5, 5], @@ -129,7 +131,9 @@ def test_first_expr_over_order_by_partition_by( # https://github.com/ibis-project/ibis/issues/11656 request.applymarker(pytest.mark.xfail) frame = nw.from_native( - constructor({"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [1, 0, 2]}) + constructor( + {"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [1, None, 2]} + ) ) result = frame.with_columns( nw.col("b", "c").first().over("a", order_by="i").name.suffix("_first"), @@ -139,7 +143,7 @@ def test_first_expr_over_order_by_partition_by( "a": [1, 1, 2], "b": [5, 4, 6], "c": [7.0, None, 8.0], - "i": [0, 1, 2], + "i": [None, 1, 2], "b_first": [5, 5, 6], "c_first": [7.0, 7.0, 8.0], "b_last": [4, 4, 6], From 948d96d60e28894467a8d3db3c5d1c493ccc6641 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:49:20 +0100 Subject: [PATCH 097/101] remove more unneeded --- narwhals/_pandas_like/group_by.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py 
b/narwhals/_pandas_like/group_by.py index 93d0ed3f60..741d08dcb1 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -62,8 +62,6 @@ "first": 0, "last": -1, } -_PYARROW_STRING_NAME = "string[pyarrow]" -_MINIMUM_SKIPNA = (2, 2, 1) @lru_cache(maxsize=32) From 8810d039d778eb791cb6166d41fdf3847a30ad65 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:51:09 +0100 Subject: [PATCH 098/101] less special-casing --- narwhals/_pandas_like/group_by.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 741d08dcb1..5efae553e9 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -150,9 +150,8 @@ def _getitem_aggs( ] ) elif self.is_last() or self.is_first(): - select = names[0] if len(names) == 1 else list(names) result = self.native_agg(group_by)( - group_by._grouped[[*group_by._keys, *select]] + group_by._grouped[[*group_by._keys, *names]] ) result.set_index(group_by._keys, inplace=True) # noqa: PD002 else: From 843549ff361e5fdc5a2203faffc8f7c74492e498 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:54:49 +0100 Subject: [PATCH 099/101] simplify further --- narwhals/_pandas_like/group_by.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/narwhals/_pandas_like/group_by.py b/narwhals/_pandas_like/group_by.py index 5efae553e9..d3a84ae72f 100644 --- a/narwhals/_pandas_like/group_by.py +++ b/narwhals/_pandas_like/group_by.py @@ -150,13 +150,11 @@ def _getitem_aggs( ] ) elif self.is_last() or self.is_first(): - result = self.native_agg(group_by)( - group_by._grouped[[*group_by._keys, *names]] - ) + result = self.native_agg()(group_by._grouped[[*group_by._keys, *names]]) result.set_index(group_by._keys, inplace=True) # noqa: PD002 else: select = names[0] if 
len(names) == 1 else list(names) - result = self.native_agg(group_by)(group_by._grouped[select]) + result = self.native_agg()(group_by._grouped[select]) if is_pandas_like_dataframe(result): result.columns = list(self.aliases) else: @@ -190,7 +188,7 @@ def leaf_name(self) -> NarwhalsAggregation | Any: self._leaf_name = PandasLikeGroupBy._leaf_name(self.expr) return self._leaf_name - def native_agg(self, group_by: PandasLikeGroupBy) -> _NativeAgg: + def native_agg(self) -> _NativeAgg: """Return a partial `DataFrameGroupBy` method, missing only `self`.""" native_name = PandasLikeGroupBy._remap_expr_name(self.leaf_name) if self.leaf_name in _REMAP_ORDERED_INDEX: From 363490dc0dcd1d616e931cb075aba3ee695883e4 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 17:13:54 +0100 Subject: [PATCH 100/101] typing --- narwhals/_pandas_like/expr.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 1547f37bb8..d5332397c8 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -1,7 +1,7 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast from narwhals._compliant import EagerExpr from narwhals._expression_parsing import evaluate_output_names_and_aliases @@ -49,8 +49,8 @@ } -def window_kwargs_to_pandas_equivalent( - function_name: NarwhalsAggregation, kwargs: ScalarKwargs +def window_kwargs_to_pandas_equivalent( # noqa: C901 + function_name: str, kwargs: ScalarKwargs ) -> dict[str, PythonLiteral]: if function_name == "shift": assert "n" in kwargs # noqa: S101 @@ -111,7 +111,9 @@ def window_kwargs_to_pandas_equivalent( "ignore_na": kwargs["ignore_nulls"], } elif function_name in {"first", "last"}: - pandas_kwargs = {"n": _REMAP_ORDERED_INDEX[function_name]} + pandas_kwargs = { + "n": 
_REMAP_ORDERED_INDEX[cast("NarwhalsAggregation", function_name)] + } else: # sum, len, ... pandas_kwargs = {} return pandas_kwargs @@ -275,6 +277,7 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: ) def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: # noqa: C901, PLR0912, PLR0914, PLR0915 + assert pandas_function_name is not None # help mypy # noqa: S101 output_names, aliases = evaluate_output_names_and_aliases(self, df, []) if function_name == "cum_count": plx = self.__narwhals_namespace__() @@ -302,7 +305,6 @@ def func(df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: # noqa: C901, grouped = df._native_frame.groupby(partition_by) if function_name.startswith("rolling"): rolling = grouped[list(output_names)].rolling(**pandas_kwargs) - assert pandas_function_name is not None # help mypy # noqa: S101 if pandas_function_name in {"std", "var"}: assert "ddof" in self._scalar_kwargs # noqa: S101 res_native = getattr(rolling, pandas_function_name)( From c25d64962c1499ae69441629072a56e8d2ac32b7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 3 Oct 2025 22:42:40 +0100 Subject: [PATCH 101/101] use repeat instead of lit for polars --- narwhals/_polars/expr.py | 9 +++++---- tests/expr_and_series/first_last_test.py | 15 ++++++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/narwhals/_polars/expr.py b/narwhals/_polars/expr.py index 3368d3288e..b7cb739ffe 100644 --- a/narwhals/_polars/expr.py +++ b/narwhals/_polars/expr.py @@ -127,15 +127,16 @@ def is_nan(self) -> Self: return self._with_native(native) def over(self, partition_by: Sequence[str], order_by: Sequence[str]) -> Self: + # Use `pl.repeat(1, pl.len())` instead of `pl.lit(1)` to avoid issues for + # non-numeric types: https://github.com/pola-rs/polars/issues/24756.
+ pl_partition_by = partition_by or pl.repeat(1, pl.len()) if self._backend_version < (1, 9): if order_by: msg = "`order_by` in Polars requires version 1.10 or greater" raise NotImplementedError(msg) - native = self.native.over(partition_by or pl.lit(1)) + native = self.native.over(pl_partition_by) else: - native = self.native.over( - partition_by or pl.lit(1), order_by=order_by or None - ) + native = self.native.over(pl_partition_by, order_by=order_by or None) return self._with_native(native) @requires.backend_version((1,)) diff --git a/tests/expr_and_series/first_last_test.py b/tests/expr_and_series/first_last_test.py index 42afc9ac4f..67a4ff843a 100644 --- a/tests/expr_and_series/first_last_test.py +++ b/tests/expr_and_series/first_last_test.py @@ -97,22 +97,31 @@ def test_first_expr_over_order_by( request.applymarker(pytest.mark.xfail) frame = nw.from_native( constructor( - {"a": [1, 1, 2], "b": [4, 5, 6], "c": [None, 7, 8], "i": [None, 2, 1]} + { + "a": [1, 1, 2], + "b": [4, 5, 6], + "c": [None, 7, 8], + "d": ["x", "y", "z"], + "i": [None, 2, 1], + } ) ) result = frame.with_columns( - nw.col("b", "c").first().over(order_by="i").name.suffix("_first"), - nw.col("b", "c").last().over(order_by="i").name.suffix("_last"), + nw.col("b", "c", "d").first().over(order_by="i").name.suffix("_first"), + nw.col("b", "c", "d").last().over(order_by="i").name.suffix("_last"), ).sort("i") expected = { "a": [1, 2, 1], "b": [4, 6, 5], "c": [None, 8, 7], + "d": ["x", "z", "y"], "i": [None, 1, 2], "b_first": [4, 4, 4], "c_first": [None, None, None], + "d_first": ["x", "x", "x"], "b_last": [5, 5, 5], "c_last": [7, 7, 7], + "d_last": ["y", "y", "y"], } assert_equal_data(result, expected)