From 028098e4b5091f73fca2f812830d54f109875777 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 20 Feb 2025 18:39:59 +0000 Subject: [PATCH 01/55] refactor: replace `lambda df: df.columns` https://github.com/narwhals-dev/narwhals/pull/2058#discussion_r1964137667 --- narwhals/_arrow/namespace.py | 3 ++- narwhals/_arrow/selectors.py | 3 ++- narwhals/_dask/namespace.py | 3 ++- narwhals/_dask/selectors.py | 3 ++- narwhals/_duckdb/namespace.py | 3 ++- narwhals/_duckdb/selectors.py | 3 ++- narwhals/_pandas_like/namespace.py | 3 ++- narwhals/_pandas_like/selectors.py | 3 ++- narwhals/_spark_like/namespace.py | 3 ++- narwhals/_spark_like/selectors.py | 3 ++- narwhals/utils.py | 10 ++++++++++ 11 files changed, 30 insertions(+), 10 deletions(-) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index f11e69af8e..1efbb69751 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -26,6 +26,7 @@ from narwhals._expression_parsing import combine_evaluate_output_names from narwhals.typing import CompliantNamespace from narwhals.utils import Implementation +from narwhals.utils import get_columns from narwhals.utils import import_dtypes_module from narwhals.utils import is_compliant_expr @@ -159,7 +160,7 @@ def all(self: Self) -> ArrowExpr: ], depth=0, function_name="all", - evaluate_output_names=lambda df: df.columns, + evaluate_output_names=get_columns, alias_output_names=None, backend_version=self._backend_version, version=self._version, diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index ec045c9e15..adf2374568 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -10,6 +10,7 @@ from narwhals._arrow.expr import ArrowExpr from narwhals.utils import _parse_time_unit_and_time_zone from narwhals.utils import dtype_matches_time_unit_and_time_zone +from narwhals.utils import get_columns from narwhals.utils import import_dtypes_module if TYPE_CHECKING: @@ -82,7 +83,7 @@ def all(self: Self) -> ArrowSelector: def func(df: ArrowDataFrame) -> list[ArrowSeries]: return [df[col] for col in df.columns] - return selector(self, func, lambda df: df.columns) + return selector(self, func, get_columns) def datetime( self: Self, diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index a8c3e41798..28fcd8c07f 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -24,6 +24,7 @@ from narwhals._expression_parsing import combine_alias_output_names from narwhals._expression_parsing import combine_evaluate_output_names from narwhals.typing import CompliantNamespace +from narwhals.utils import get_columns from narwhals.utils import is_compliant_expr if TYPE_CHECKING: @@ -57,7 +58,7 @@ def func(df: DaskLazyFrame) -> list[dx.Series]: func, depth=0, function_name="all", - evaluate_output_names=lambda df: df.columns, + evaluate_output_names=get_columns, alias_output_names=None, backend_version=self._backend_version, version=self._version, diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 123da1212d..d1ec3d4a69 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -10,6 +10,7 @@ from narwhals._dask.expr import DaskExpr from narwhals.utils import _parse_time_unit_and_time_zone from narwhals.utils import dtype_matches_time_unit_and_time_zone +from narwhals.utils import get_columns from narwhals.utils import import_dtypes_module if TYPE_CHECKING: @@ -95,7 +96,7 @@ def all(self: Self) -> DaskSelector: def func(df: DaskLazyFrame) -> list[dx.Series]: return [df._native_frame[col] for col in df.columns] - return selector(self, func, lambda df: df.columns) + return selector(self, func, get_columns) def datetime( self: Self, diff --git a/narwhals/_duckdb/namespace.py b/narwhals/_duckdb/namespace.py index 2f50a70724..3acf9a2c85 100644 --- a/narwhals/_duckdb/namespace.py +++ b/narwhals/_duckdb/namespace.py @@ -24,6 +24,7 @@ from narwhals._expression_parsing import combine_alias_output_names from narwhals._expression_parsing import combine_evaluate_output_names from narwhals.typing import CompliantNamespace +from narwhals.utils import get_columns if TYPE_CHECKING: import duckdb @@ -52,7 +53,7 @@ def _all(df: DuckDBLazyFrame) -> list[duckdb.Expression]: return DuckDBExpr( call=_all, function_name="all", - evaluate_output_names=lambda df: df.columns, + evaluate_output_names=get_columns, alias_output_names=None, backend_version=self._backend_version, version=self._version, diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py index a30cec06cb..1617d60c56 100644 --- a/narwhals/_duckdb/selectors.py +++ b/narwhals/_duckdb/selectors.py @@ -12,6 +12,7 @@ from narwhals._duckdb.expr import DuckDBExpr from narwhals.utils import _parse_time_unit_and_time_zone from narwhals.utils import dtype_matches_time_unit_and_time_zone +from narwhals.utils import get_columns from narwhals.utils import import_dtypes_module if TYPE_CHECKING: @@ -88,7 +89,7 @@ def all(self: Self) -> DuckDBSelector: def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: return [ColumnExpression(col) for col in df.columns] - return selector(self, func, lambda df: df.columns) + return selector(self, func, get_columns) def datetime( self: Self, diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index a0c8a3ac3c..3d7b2514c2 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -22,6 +22,7 @@ from narwhals._pandas_like.utils import horizontal_concat from narwhals._pandas_like.utils import vertical_concat from narwhals.typing import CompliantNamespace +from narwhals.utils import get_columns from narwhals.utils import import_dtypes_module from narwhals.utils import is_compliant_expr @@ -134,7 +135,7 @@ def all(self: Self) -> PandasLikeExpr: ], depth=0, function_name="all", - evaluate_output_names=lambda df: df.columns, + evaluate_output_names=get_columns, alias_output_names=None, implementation=self._implementation, backend_version=self._backend_version, diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 4b7a2ef2e0..c31e351916 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -10,6 +10,7 @@ from narwhals._pandas_like.expr import PandasLikeExpr from narwhals.utils import _parse_time_unit_and_time_zone from narwhals.utils import dtype_matches_time_unit_and_time_zone +from narwhals.utils import get_columns from narwhals.utils import import_dtypes_module if TYPE_CHECKING: @@ -83,7 +84,7 @@ def all(self: Self) -> PandasSelector: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: return [df[col] for col in df.columns] - return selector(self, func, lambda df: df.columns) + return selector(self, func, get_columns) def datetime( self: Self, diff --git a/narwhals/_spark_like/namespace.py b/narwhals/_spark_like/namespace.py index b6f51a60f8..ac779730e8 100644 --- a/narwhals/_spark_like/namespace.py +++ b/narwhals/_spark_like/namespace.py @@ -17,6 +17,7 @@ from narwhals._spark_like.utils import maybe_evaluate_expr from narwhals._spark_like.utils import narwhals_to_native_dtype from narwhals.typing import CompliantNamespace +from narwhals.utils import get_columns if TYPE_CHECKING: from pyspark.sql import Column @@ -51,7 +52,7 @@ def _all(df: SparkLikeLazyFrame) -> list[Column]: return SparkLikeExpr( call=_all, function_name="all", - evaluate_output_names=lambda df: df.columns, + evaluate_output_names=get_columns, alias_output_names=None, backend_version=self._backend_version, version=self._version, diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py index e037e1f8a3..9aedceab3a 100644 --- a/narwhals/_spark_like/selectors.py +++ b/narwhals/_spark_like/selectors.py @@ -10,6 +10,7 @@ from narwhals._spark_like.expr import SparkLikeExpr from narwhals.utils import _parse_time_unit_and_time_zone from narwhals.utils import dtype_matches_time_unit_and_time_zone +from narwhals.utils import get_columns from narwhals.utils import import_dtypes_module if TYPE_CHECKING: @@ -83,7 +84,7 @@ def all(self: Self) -> SparkLikeSelector: def func(df: SparkLikeLazyFrame) -> list[Column]: return [df._F.col(col) for col in df.columns] - return selector(self, func, lambda df: df.columns) + return selector(self, func, get_columns) def datetime( self: Self, diff --git a/narwhals/utils.py b/narwhals/utils.py index cb33a603d6..6167e73207 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING from typing import Any from typing import Iterable +from typing import Iterator from typing import Sequence from typing import TypeVar from typing import Union @@ -59,6 +60,7 @@ from narwhals.typing import DataFrameLike from narwhals.typing import DTypes from narwhals.typing import IntoSeriesT + from narwhals.typing import NativeFrame from narwhals.typing import SizeUnit from narwhals.typing import SupportsNativeNamespace from narwhals.typing import TimeUnit @@ -1303,6 +1305,14 @@ def dtype_matches_time_unit_and_time_zone( ) +def get_columns(df: NativeFrame) -> Sequence[str]: + return df.columns + + +def iter_columns(df: NativeFrame) -> Iterator[str]: # pragma: no cover + yield from df.columns + + def _hasattr_static(obj: Any, attr: str) -> bool: sentinel = object() return getattr_static(obj, attr, sentinel) is not sentinel From c597ba7c2b06b1d1db29e9b7f254aa3be47b5d86 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Thu, 20 Feb 2025 21:30:05 +0000 Subject: [PATCH 02/55] refactor: rename `get_columns` -> `get_column_names` Less ambiguous, thinking `iter_columns` will be a better name to reserve for https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_columns.html --- narwhals/_arrow/namespace.py | 4 ++-- narwhals/_arrow/selectors.py | 4 ++-- narwhals/_dask/namespace.py | 4 ++-- narwhals/_dask/selectors.py | 4 ++-- narwhals/_duckdb/namespace.py | 4 ++-- narwhals/_duckdb/selectors.py | 4 ++-- narwhals/_pandas_like/namespace.py | 4 ++-- narwhals/_pandas_like/selectors.py | 4 ++-- narwhals/_spark_like/namespace.py | 4 ++-- narwhals/_spark_like/selectors.py | 4 ++-- narwhals/utils.py | 2 +- 11 files changed, 21 insertions(+), 21 deletions(-) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index 1efbb69751..28c22ab62f 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -26,7 +26,7 @@ from narwhals._expression_parsing import combine_evaluate_output_names from narwhals.typing import CompliantNamespace from narwhals.utils import Implementation -from narwhals.utils import get_columns +from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module from narwhals.utils import is_compliant_expr @@ -160,7 +160,7 @@ def all(self: Self) -> ArrowExpr: ], depth=0, function_name="all", - evaluate_output_names=get_columns, + evaluate_output_names=get_column_names, alias_output_names=None, backend_version=self._backend_version, version=self._version, diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index adf2374568..ef2a760364 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -10,7 +10,7 @@ from narwhals._arrow.expr import ArrowExpr from narwhals.utils import _parse_time_unit_and_time_zone from narwhals.utils import dtype_matches_time_unit_and_time_zone -from narwhals.utils import get_columns +from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module if TYPE_CHECKING: @@ -83,7 +83,7 @@ def all(self: Self) -> ArrowSelector: def func(df: ArrowDataFrame) -> list[ArrowSeries]: return [df[col] for col in df.columns] - return selector(self, func, get_columns) + return selector(self, func, get_column_names) def datetime( self: Self, diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 28fcd8c07f..5777fd6d5d 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -24,7 +24,7 @@ from narwhals._expression_parsing import combine_alias_output_names from narwhals._expression_parsing import combine_evaluate_output_names from narwhals.typing import CompliantNamespace -from narwhals.utils import get_columns +from narwhals.utils import get_column_names from narwhals.utils import is_compliant_expr if TYPE_CHECKING: @@ -58,7 +58,7 @@ def func(df: DaskLazyFrame) -> list[dx.Series]: func, depth=0, function_name="all", - evaluate_output_names=get_columns, + evaluate_output_names=get_column_names, alias_output_names=None, backend_version=self._backend_version, version=self._version, diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index d1ec3d4a69..599c8835cb 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -10,7 +10,7 @@ from narwhals._dask.expr import DaskExpr from narwhals.utils import _parse_time_unit_and_time_zone from narwhals.utils import dtype_matches_time_unit_and_time_zone -from narwhals.utils import get_columns +from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module if TYPE_CHECKING: @@ -96,7 +96,7 @@ def all(self: Self) -> DaskSelector: def func(df: DaskLazyFrame) -> list[dx.Series]: return [df._native_frame[col] for col in df.columns] - return selector(self, func, get_columns) + return selector(self, func, get_column_names) def datetime( self: Self, diff --git a/narwhals/_duckdb/namespace.py b/narwhals/_duckdb/namespace.py index 3acf9a2c85..56b3bf4a4f 100644 --- a/narwhals/_duckdb/namespace.py +++ b/narwhals/_duckdb/namespace.py @@ -24,7 +24,7 @@ from narwhals._expression_parsing import combine_alias_output_names from narwhals._expression_parsing import combine_evaluate_output_names from narwhals.typing import CompliantNamespace -from narwhals.utils import get_columns +from narwhals.utils import get_column_names if TYPE_CHECKING: import duckdb @@ -53,7 +53,7 @@ def _all(df: DuckDBLazyFrame) -> list[duckdb.Expression]: return DuckDBExpr( call=_all, function_name="all", - evaluate_output_names=get_columns, + evaluate_output_names=get_column_names, alias_output_names=None, backend_version=self._backend_version, version=self._version, diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py index 1617d60c56..254f41152a 100644 --- a/narwhals/_duckdb/selectors.py +++ b/narwhals/_duckdb/selectors.py @@ -12,7 +12,7 @@ from narwhals._duckdb.expr import DuckDBExpr from narwhals.utils import _parse_time_unit_and_time_zone from narwhals.utils import dtype_matches_time_unit_and_time_zone -from narwhals.utils import get_columns +from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module if TYPE_CHECKING: @@ -89,7 +89,7 @@ def all(self: Self) -> DuckDBSelector: def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: return [ColumnExpression(col) for col in df.columns] - return selector(self, func, get_columns) + return selector(self, func, get_column_names) def datetime( self: Self, diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 3d7b2514c2..b936be3467 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -22,7 +22,7 @@ from narwhals._pandas_like.utils import horizontal_concat from narwhals._pandas_like.utils import vertical_concat from narwhals.typing import CompliantNamespace -from narwhals.utils import get_columns +from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module from narwhals.utils import is_compliant_expr @@ -135,7 +135,7 @@ def all(self: Self) -> PandasLikeExpr: ], depth=0, function_name="all", - evaluate_output_names=get_columns, + evaluate_output_names=get_column_names, alias_output_names=None, implementation=self._implementation, backend_version=self._backend_version, diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index c31e351916..0653ed2e40 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -10,7 +10,7 @@ from narwhals._pandas_like.expr import PandasLikeExpr from narwhals.utils import _parse_time_unit_and_time_zone from narwhals.utils import dtype_matches_time_unit_and_time_zone -from narwhals.utils import get_columns +from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module if TYPE_CHECKING: @@ -84,7 +84,7 @@ def all(self: Self) -> PandasSelector: def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: return [df[col] for col in df.columns] - return selector(self, func, get_columns) + return selector(self, func, get_column_names) def datetime( self: Self, diff --git a/narwhals/_spark_like/namespace.py b/narwhals/_spark_like/namespace.py index ac779730e8..30e7fe3955 100644 --- a/narwhals/_spark_like/namespace.py +++ b/narwhals/_spark_like/namespace.py @@ -17,7 +17,7 @@ from narwhals._spark_like.utils import maybe_evaluate_expr from narwhals._spark_like.utils import narwhals_to_native_dtype from narwhals.typing import CompliantNamespace -from narwhals.utils import get_columns +from narwhals.utils import get_column_names if TYPE_CHECKING: from pyspark.sql import Column @@ -52,7 +52,7 @@ def _all(df: SparkLikeLazyFrame) -> list[Column]: return SparkLikeExpr( call=_all, function_name="all", - evaluate_output_names=get_columns, + evaluate_output_names=get_column_names, alias_output_names=None, backend_version=self._backend_version, version=self._version, diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py index 9aedceab3a..95ad0407d1 100644 --- a/narwhals/_spark_like/selectors.py +++ b/narwhals/_spark_like/selectors.py @@ -10,7 +10,7 @@ from narwhals._spark_like.expr import SparkLikeExpr from narwhals.utils import _parse_time_unit_and_time_zone from narwhals.utils import dtype_matches_time_unit_and_time_zone -from narwhals.utils import get_columns +from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module if TYPE_CHECKING: @@ -84,7 +84,7 @@ def all(self: Self) -> SparkLikeSelector: def func(df: SparkLikeLazyFrame) -> list[Column]: return [df._F.col(col) for col in df.columns] - return selector(self, func, get_columns) + return selector(self, func, get_column_names) def datetime( self: Self, diff --git a/narwhals/utils.py b/narwhals/utils.py index 6167e73207..b5fbb28100 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -1305,7 +1305,7 @@ def dtype_matches_time_unit_and_time_zone( ) -def get_columns(df: NativeFrame) -> Sequence[str]: +def get_column_names(df: NativeFrame) -> Sequence[str]: return df.columns From d26fd813238e37305860ce9cd6be45ce99f0bb9d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:51:40 +0000 Subject: [PATCH 03/55] chore(typing): add missing context for `CompliantDataFrame` Column names and selecting series is the core part of `selectors` --- narwhals/typing.py | 16 ++++++++++------ narwhals/utils.py | 2 +- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/narwhals/typing.py b/narwhals/typing.py index ebd89e7732..92ce71b683 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -49,7 +49,12 @@ def __narwhals_series__(self) -> CompliantSeries: ... def alias(self, name: str) -> Self: ... -class CompliantDataFrame(Protocol): +CompliantSeriesT_co = TypeVar( + "CompliantSeriesT_co", bound=CompliantSeries, covariant=True +) + + +class CompliantDataFrame(Generic[CompliantSeriesT_co], Protocol): def __narwhals_dataframe__(self) -> Self: ... def __narwhals_namespace__(self) -> Any: ... def simple_select( @@ -59,6 +64,10 @@ def aggregate(self, *exprs: Any) -> Self: ... # `select` where all args are aggregations or literals # (so, no broadcasting is necessary). + @property + def columns(self) -> Sequence[str]: ... + def get_column(self, name: str) -> CompliantSeriesT_co: ... + class CompliantLazyFrame(Protocol): def __narwhals_lazyframe__(self) -> Self: ... @@ -71,11 +80,6 @@ def aggregate(self, *exprs: Any) -> Self: # (so, no broadcasting is necessary). -CompliantSeriesT_co = TypeVar( - "CompliantSeriesT_co", bound=CompliantSeries, covariant=True -) - - class CompliantExpr(Protocol, Generic[CompliantSeriesT_co]): _implementation: Implementation _backend_version: tuple[int, ...] diff --git a/narwhals/utils.py b/narwhals/utils.py index b5fbb28100..141423d200 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -1305,7 +1305,7 @@ def dtype_matches_time_unit_and_time_zone( ) -def get_column_names(df: NativeFrame) -> Sequence[str]: +def get_column_names(df: NativeFrame | CompliantDataFrame) -> Sequence[str]: return df.columns From 230daf9e55f22b09316f71a2c4e6120909a18f1d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 15:53:12 +0000 Subject: [PATCH 04/55] fix(typing): temp widen `.collect` from `CompliantDataFrame` Will investigate later --- narwhals/_duckdb/dataframe.py | 3 ++- narwhals/_pandas_like/dataframe.py | 3 ++- narwhals/_polars/dataframe.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index b4ee8c2825..1e0972e5e8 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -36,6 +36,7 @@ from narwhals._duckdb.group_by import DuckDBGroupBy from narwhals._duckdb.namespace import DuckDBNamespace from narwhals._duckdb.series import DuckDBInterchangeSeries + from narwhals._polars.dataframe import PolarsDataFrame from narwhals.dtypes import DType from narwhals.typing import CompliantLazyFrame @@ -90,7 +91,7 @@ def collect( self: Self, backend: ModuleType | Implementation | str | None, **kwargs: Any, - ) -> CompliantDataFrame: + ) -> CompliantDataFrame[Any] | PolarsDataFrame: if backend is None or backend is Implementation.PYARROW: import pyarrow as pa # ignore-banned-import diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 306a37fd8f..d333ee1459 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -48,6 +48,7 @@ from narwhals._pandas_like.expr import PandasLikeExpr from narwhals._pandas_like.group_by import PandasLikeGroupBy from narwhals._pandas_like.namespace import PandasLikeNamespace + from narwhals._polars.dataframe import PolarsDataFrame from narwhals.dtypes import DType from narwhals.typing import SizeUnit from narwhals.typing import _1DArray @@ -519,7 +520,7 @@ def collect( self: Self, backend: Implementation | None, **kwargs: Any, - ) -> CompliantDataFrame: + ) -> CompliantDataFrame[Any] | PolarsDataFrame: if backend is None: return PandasLikeDataFrame( self._native_frame, diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 070d658343..7ec7423106 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -465,7 +465,7 @@ def collect( self: Self, backend: Implementation | None, **kwargs: Any, - ) -> CompliantDataFrame: + ) -> PolarsDataFrame | CompliantDataFrame[Any]: try: result = self._native_frame.collect(**kwargs) except Exception as e: # noqa: BLE001 From 2ab3305064bb093b5d98422c88b9a583c86f11a7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 16:23:35 +0000 Subject: [PATCH 05/55] feat(typing): provide `PandasLikeSeries` to `PandasLikeDataFrame` Experimenting with `pandas` only to start --- narwhals/_pandas_like/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index d333ee1459..e235fe27ee 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -84,7 +84,7 @@ ) -class PandasLikeDataFrame(CompliantDataFrame, CompliantLazyFrame): +class PandasLikeDataFrame(CompliantDataFrame["PandasLikeSeries"], CompliantLazyFrame): # --- not in the spec --- def __init__( self: Self, From 47816b988bf623b65471593e7fdd8295d83a7477 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 16:36:04 +0000 Subject: [PATCH 06/55] feat: Adds generic `CompliantSelector` & `Namespace` --- narwhals/_selectors.py | 257 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 narwhals/_selectors.py diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py new file mode 100644 index 0000000000..89a77f9148 --- /dev/null +++ b/narwhals/_selectors.py @@ -0,0 +1,257 @@ +"""Almost entirely complete, generic `selectors` implementation. + +- Focusing on eager-only for now +""" + +from __future__ import annotations + +import re +from functools import partial +from typing import TYPE_CHECKING +from typing import Callable +from typing import Collection +from typing import Generic +from typing import Iterable +from typing import Iterator +from typing import Protocol +from typing import Sequence +from typing import TypeVar +from typing import overload + +from narwhals.typing import CompliantExpr +from narwhals.utils import _parse_time_unit_and_time_zone +from narwhals.utils import dtype_matches_time_unit_and_time_zone +from narwhals.utils import get_column_names +from narwhals.utils import import_dtypes_module + +if TYPE_CHECKING: + from datetime import timezone + + from typing_extensions import Self + from typing_extensions import TypeAlias + from typing_extensions import TypeIs + + from narwhals.dtypes import DType + from narwhals.typing import CompliantDataFrame + from narwhals.typing import CompliantSeries + from narwhals.typing import TimeUnit + from narwhals.utils import Implementation + from narwhals.utils import Version + from narwhals.utils import _FullContext + + # NOTE: Plugging the gap of this not being defined in `CompliantSeries` + class CompliantSeriesWithDType(CompliantSeries, Protocol): + @property + def dtype(self) -> DType: ... + + +SeriesT = TypeVar("SeriesT", bound="CompliantSeriesWithDType") +DataFrameT = TypeVar("DataFrameT", bound="CompliantDataFrame") +SelectorOrExpr: TypeAlias = ( + "CompliantSelector[DataFrameT, SeriesT] | CompliantExpr[SeriesT]" +) +EvalSeries: TypeAlias = Callable[[DataFrameT], Sequence[SeriesT]] +EvalNames: TypeAlias = Callable[[DataFrameT], Sequence[str]] + + +# NOTE: Pretty much finished generic for eager backends +class CompliantSelectorNamespace(Generic[DataFrameT, SeriesT], Protocol): + _implementation: Implementation + _backend_version: tuple[int, ...] + _version: Version + + # TODO @dangotbanned: push for adding to public API for `DataFrame` + # Only need internally, but it plugs so many holes that it must be useful beyond that + # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_columns.html + def _iter_columns(self, df: DataFrameT, /) -> Iterator[SeriesT]: ... + + def _selector( + self, + context: _FullContext, + call: EvalSeries[DataFrameT, SeriesT], + evaluate_output_names: EvalNames[DataFrameT], + /, + ) -> CompliantSelector[DataFrameT, SeriesT]: ... + + def _is_dtype( + self: CompliantSelectorNamespace[DataFrameT, SeriesT], dtype: type[DType], / + ) -> CompliantSelector[DataFrameT, SeriesT]: + def series(df: DataFrameT) -> Sequence[SeriesT]: + return [ser for ser in self._iter_columns(df) if isinstance(ser.dtype, dtype)] + + def names(df: DataFrameT) -> Sequence[str]: + return [ + ser.name for ser in self._iter_columns(df) if isinstance(ser.dtype, dtype) + ] + + return self._selector(self, series, names) + + def by_dtype( + self: Self, dtypes: Collection[DType | type[DType]] + ) -> CompliantSelector[DataFrameT, SeriesT]: + def series(df: DataFrameT) -> Sequence[SeriesT]: + return [ser for ser in self._iter_columns(df) if ser.dtype in dtypes] + + def names(df: DataFrameT) -> Sequence[str]: + return [ser.name for ser in self._iter_columns(df) if ser.dtype in dtypes] + + return self._selector(self, series, names) + + def matches(self: Self, pattern: str) -> CompliantSelector[DataFrameT, SeriesT]: + p = re.compile(pattern) + + def series(df: DataFrameT) -> Sequence[SeriesT]: + return [df.get_column(col) for col in df.columns if p.search(col)] + + def names(df: DataFrameT) -> Sequence[str]: + return [col for col in df.columns if p.search(col)] + + return self._selector(self, series, names) + + def numeric(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: + def series(df: DataFrameT) -> Sequence[SeriesT]: + return [ser for ser in self._iter_columns(df) if ser.dtype.is_numeric()] + + def names(df: DataFrameT) -> Sequence[str]: + return [ser.name for ser in self._iter_columns(df) if ser.dtype.is_numeric()] + + return self._selector(self, series, names) + + def categorical(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: + return self._is_dtype(import_dtypes_module(self._version).Categorical) + + def string(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: + return self._is_dtype(import_dtypes_module(self._version).String) + + def boolean(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: + return self._is_dtype(import_dtypes_module(self._version).Boolean) + + def all(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: + def series(df: DataFrameT) -> Sequence[SeriesT]: + return list(self._iter_columns(df)) + + return self._selector(self, series, get_column_names) + + def datetime( + self: Self, + time_unit: TimeUnit | Iterable[TimeUnit] | None, + time_zone: str | timezone | Iterable[str | timezone | None] | None, + ) -> CompliantSelector[DataFrameT, SeriesT]: + time_units, time_zones = _parse_time_unit_and_time_zone(time_unit, time_zone) + matches = partial( + dtype_matches_time_unit_and_time_zone, + dtypes=import_dtypes_module(version=self._version), + time_units=time_units, + time_zones=time_zones, + ) + + def series(df: DataFrameT) -> Sequence[SeriesT]: + return [ser for ser in self._iter_columns(df) if matches(ser.dtype)] + + def names(df: DataFrameT) -> Sequence[str]: + return [ser.name for ser in self._iter_columns(df) if matches(ser.dtype)] + + return self._selector(self, series, names) + + def __init__(self: Self, context: _FullContext, /) -> None: + self._implementation = context._implementation + self._backend_version = context._backend_version + self._version = context._version + + +# NOTE: CompliantExpr already provides `_implementation`, `_backend_version` +# https://github.com/narwhals-dev/narwhals/pull/2060 +class CompliantSelector(CompliantExpr[SeriesT], Generic[DataFrameT, SeriesT], Protocol): + _version: Version + + @property + def selectors(self) -> CompliantSelectorNamespace[DataFrameT, SeriesT]: ... + def __repr__(self: Self) -> str: ... + def _to_expr(self: Self) -> CompliantExpr[SeriesT]: ... + + def _is_selector( + self: Self, + other: Self | CompliantExpr[SeriesT], + ) -> TypeIs[CompliantSelector[DataFrameT, SeriesT]]: + return isinstance(other, type(self)) + + @overload + def __sub__(self: Self, other: Self) -> Self: ... + @overload + def __sub__(self: Self, other: CompliantExpr[SeriesT]) -> CompliantExpr[SeriesT]: ... + def __sub__( + self: Self, other: SelectorOrExpr[DataFrameT, SeriesT] + ) -> SelectorOrExpr[DataFrameT, SeriesT]: + if self._is_selector(other): + + def series(df: DataFrameT) -> Sequence[SeriesT]: + lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) + return [ + x for x, name in zip(self(df), lhs_names) if name not in rhs_names + ] + + def names(df: DataFrameT) -> Sequence[str]: + lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) + return [x for x in lhs_names if x not in rhs_names] + + return self.selectors._selector(self, series, names) + else: + return self._to_expr() - other + + @overload + def __or__(self: Self, other: Self) -> Self: ... + @overload + def __or__(self: Self, other: CompliantExpr[SeriesT]) -> CompliantExpr[SeriesT]: ... + def __or__( + self: Self, other: SelectorOrExpr[DataFrameT, SeriesT] + ) -> SelectorOrExpr[DataFrameT, SeriesT]: + if self._is_selector(other): + + def names(df: DataFrameT) -> Sequence[SeriesT]: + lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) + return [ + *(x for x, name in zip(self(df), lhs_names) if name not in rhs_names), + *other(df), + ] + + def series(df: DataFrameT) -> Sequence[str]: + lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) + return [*(x for x in lhs_names if x not in rhs_names), *rhs_names] + + return self.selectors._selector(self, names, series) + else: + return self._to_expr() | other + + @overload + def __and__(self: Self, other: Self) -> Self: ... + @overload + def __and__(self: Self, other: CompliantExpr[SeriesT]) -> CompliantExpr[SeriesT]: ... + def __and__( + self: Self, other: SelectorOrExpr[DataFrameT, SeriesT] + ) -> SelectorOrExpr[DataFrameT, SeriesT]: + if self._is_selector(other): + + def series(df: DataFrameT) -> Sequence[SeriesT]: + lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) + return [x for x, name in zip(self(df), lhs_names) if name in rhs_names] + + def names(df: DataFrameT) -> Sequence[str]: + lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) + return [x for x in lhs_names if x in rhs_names] + + return self.selectors._selector(self, series, names) + else: + return self._to_expr() & other + + def __invert__( + self: Self, + ) -> CompliantSelector[DataFrameT, SeriesT]: + return self.selectors.all() - self + + +# NOTE: Should probably be a `DataFrame` method +# Using `Expr` because this doesn't require `Selector` attrs/methods +def _eval_lhs_rhs( + df: CompliantDataFrame, lhs: CompliantExpr, rhs: CompliantExpr +) -> tuple[Sequence[str], Sequence[str]]: + return lhs._evaluate_output_names(df), rhs._evaluate_output_names(df) From c2e56d1405db1ed9ae4e952431b73e16807fe86e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 16:44:58 +0000 Subject: [PATCH 07/55] feat: reimplement `_pandas_like.selectors` All tests are passing locally, hoping to do `_arrow` next and see if this holds up --- narwhals/_pandas_like/selectors.py | 236 +++++++---------------------- 1 file changed, 52 insertions(+), 184 deletions(-) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 0653ed2e40..6404a6f4b1 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -1,129 +1,77 @@ from __future__ import annotations -import re +from functools import partial from typing import TYPE_CHECKING from typing import Any -from typing import Callable -from typing import Iterable -from typing import Sequence +from typing import Iterator +from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.expr import PandasLikeExpr -from narwhals.utils import _parse_time_unit_and_time_zone -from narwhals.utils import dtype_matches_time_unit_and_time_zone -from narwhals.utils import get_column_names -from narwhals.utils import import_dtypes_module +from narwhals._pandas_like.series import PandasLikeSeries +from narwhals._selectors import CompliantSelector +from narwhals._selectors import CompliantSelectorNamespace if TYPE_CHECKING: - from datetime import timezone - from typing_extensions import Self from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.series import PandasLikeSeries - from narwhals.dtypes import DType - from narwhals.typing import TimeUnit + from narwhals._selectors import EvalNames + from narwhals._selectors import EvalSeries + from narwhals.utils import Version from narwhals.utils import _FullContext -class PandasSelectorNamespace: - def __init__(self: Self, context: _FullContext, /) -> None: - self._implementation = context._implementation - self._backend_version = context._backend_version - self._version = context._version - - def by_dtype(self: Self, dtypes: Iterable[DType | type[DType]]) -> PandasSelector: - def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: - return [df[col] for col in df.columns if df.schema[col] in dtypes] - - def evaluate_output_names(df: PandasLikeDataFrame) -> Sequence[str]: - return [col for col in df.columns if df.schema[col] in dtypes] - - return selector(self, func, evaluate_output_names) +class PandasSelectorNamespace( + CompliantSelectorNamespace["PandasLikeDataFrame", "PandasLikeSeries"] +): + def _iter_columns(self, df: PandasLikeDataFrame) -> Iterator[PandasLikeSeries]: + from narwhals._pandas_like.series import PandasLikeSeries - def matches(self: Self, pattern: str) -> PandasSelector: - def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: - return [df[col] for col in df.columns if re.search(pattern, col)] - - def evaluate_output_names(df: PandasLikeDataFrame) -> Sequence[str]: - return [col for col in df.columns if re.search(pattern, col)] - - return selector(self, func, evaluate_output_names) - - def numeric(self: Self) -> PandasSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype( - { - dtypes.Int128, - dtypes.Int64, - dtypes.Int32, - dtypes.Int16, - dtypes.Int8, - dtypes.UInt128, - dtypes.UInt64, - dtypes.UInt32, - dtypes.UInt16, - dtypes.UInt8, - dtypes.Float64, - dtypes.Float32, - } + series = partial( + PandasLikeSeries, + implementation=df._implementation, + backend_version=df._backend_version, + version=df._version, ) - - def categorical(self: Self) -> PandasSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.Categorical}) - - def string(self: Self) -> PandasSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.String}) - - def boolean(self: Self) -> PandasSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.Boolean}) - - def all(self: Self) -> PandasSelector: - def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: - return [df[col] for col in df.columns] - - return selector(self, func, get_column_names) - - def datetime( - self: Self, - time_unit: TimeUnit | Iterable[TimeUnit] | None, - time_zone: str | timezone | Iterable[str | timezone | None] | None, - ) -> PandasSelector: - dtypes = import_dtypes_module(version=self._version) - time_units, time_zones = _parse_time_unit_and_time_zone( - time_unit=time_unit, time_zone=time_zone + # NOTE: (PERF102) is a false-positive + # .items() -> (str, pd.Series) + # .values() -> np.ndarray + for _col, ser in df._native_frame.items(): # noqa: PERF102 + yield series(ser) + + def _selector( + self, + context: _FullContext, + call: EvalSeries[PandasLikeDataFrame, PandasLikeSeries], + evaluate_output_names: EvalNames[PandasLikeDataFrame], + /, + ) -> CompliantSelector[PandasLikeDataFrame, PandasLikeSeries]: + return PandasSelector( + call, + depth=0, + function_name="selector", + evaluate_output_names=evaluate_output_names, + alias_output_names=None, + implementation=context._implementation, + backend_version=context._backend_version, + version=context._version, + kwargs={}, ) - def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: - return [ - df[col] - for col in df.columns - if dtype_matches_time_unit_and_time_zone( - dtype=df.schema[col], - dtypes=dtypes, - time_units=time_units, - time_zones=time_zones, - ) - ] - - def evaluate_output_names(df: PandasLikeDataFrame) -> Sequence[str]: - return [ - col - for col in df.columns - if dtype_matches_time_unit_and_time_zone( - dtype=df.schema[col], - dtypes=dtypes, - time_units=time_units, - time_zones=time_zones, - ) - ] - return selector(self, func, evaluate_output_names) +class PandasSelector( # type: ignore[misc] + CompliantSelector["PandasLikeDataFrame", "PandasLikeSeries"], PandasLikeExpr +): + # TODO @dangotbanned: Remove after merging (https://github.com/narwhals-dev/narwhals/pull/2060) + def __init__(self: Self, *args: Any, version: Version, **kwds: Any) -> None: + super().__init__(*args, version=version, **kwds) + self._version = version + @property + def selectors(self) -> PandasSelectorNamespace: + return PandasSelectorNamespace(self) -class PandasSelector(PandasLikeExpr): def __repr__(self) -> str: # pragma: no cover return ( f"PandasSelector(depth={self._depth}, function_name={self._function_name}, " @@ -141,83 +89,3 @@ def _to_expr(self: Self) -> PandasLikeExpr: version=self._version, kwargs=self._kwargs, ) - - def __sub__(self: Self, other: PandasSelector | Any) -> PandasSelector | Any: - if isinstance(other, PandasSelector): - - def call(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - return [x for x, name in zip(lhs, lhs_names) if name not in rhs_names] - - def evaluate_output_names(df: PandasLikeDataFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [x for x in lhs_names if x not in rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() - other - - def __or__(self: Self, other: PandasSelector | Any) -> PandasSelector | Any: - if isinstance(other, PandasSelector): - - def call(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - rhs = other._call(df) - return [ - *(x for x, name in zip(lhs, lhs_names) if name not in rhs_names), - *rhs, - ] - - def evaluate_output_names(df: PandasLikeDataFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [*(x for x in lhs_names if x not in rhs_names), *rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() | other - - def __and__(self: Self, other: PandasSelector | Any) -> PandasSelector | Any: - if isinstance(other, PandasSelector): - - def call(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - return [x for x, name in zip(lhs, lhs_names) if name in rhs_names] - - def evaluate_output_names(df: PandasLikeDataFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [x for x in lhs_names if x in rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() & other - - def __invert__(self: Self) -> PandasSelector: - return PandasSelectorNamespace(self).all() - self - - -def selector( - context: _FullContext, - call: Callable[[PandasLikeDataFrame], Sequence[PandasLikeSeries]], - evaluate_output_names: Callable[[PandasLikeDataFrame], Sequence[str]], - /, -) -> PandasSelector: - return PandasSelector( - call, - depth=0, - function_name="selector", - evaluate_output_names=evaluate_output_names, - alias_output_names=None, - implementation=context._implementation, - backend_version=context._backend_version, - version=context._version, - kwargs={}, - ) From 706732a5e129ec9b9884c5795bc31b51f7475079 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 17:04:18 +0000 Subject: [PATCH 08/55] feat: reimplement `_arrow.selectors` Well that went smoothly --- narwhals/_arrow/dataframe.py | 2 +- narwhals/_arrow/selectors.py | 228 +++++++---------------------------- 2 files changed, 42 insertions(+), 188 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 30c3f511ad..1d7e3d1e53 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -71,7 +71,7 @@ from narwhals.typing import CompliantLazyFrame -class ArrowDataFrame(CompliantDataFrame, CompliantLazyFrame): +class ArrowDataFrame(CompliantDataFrame["ArrowSeries"], CompliantLazyFrame): # --- not in the spec --- def __init__( self: Self, diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index ef2a760364..663945a0f3 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -1,128 +1,61 @@ from __future__ import annotations -import re from typing import TYPE_CHECKING from typing import Any -from typing import Callable -from typing import Iterable -from typing import Sequence +from typing import Iterator from narwhals._arrow.expr import ArrowExpr -from narwhals.utils import _parse_time_unit_and_time_zone -from narwhals.utils import dtype_matches_time_unit_and_time_zone -from narwhals.utils import get_column_names -from narwhals.utils import import_dtypes_module +from narwhals._selectors import CompliantSelector +from narwhals._selectors import CompliantSelectorNamespace if TYPE_CHECKING: - from datetime import timezone - from typing_extensions import Self from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.series import ArrowSeries - from narwhals.dtypes import DType - from narwhals.typing import TimeUnit - from narwhals.utils import _LimitedContext - - -class ArrowSelectorNamespace: - def __init__(self: Self, context: _LimitedContext, /) -> None: - self._backend_version = context._backend_version - self._version = context._version - - def by_dtype(self: Self, dtypes: Iterable[DType | type[DType]]) -> ArrowSelector: - def func(df: ArrowDataFrame) -> list[ArrowSeries]: - return [df[col] for col in df.columns if df.schema[col] in dtypes] - - def evaluate_output_names(df: ArrowDataFrame) -> Sequence[str]: - return [col for col in df.columns if df.schema[col] in dtypes] - - return selector(self, func, evaluate_output_names) - - def matches(self: Self, pattern: str) -> ArrowSelector: - def func(df: ArrowDataFrame) -> list[ArrowSeries]: - return [df[col] for col in df.columns if re.search(pattern, col)] - - def evaluate_output_names(df: ArrowDataFrame) -> Sequence[str]: - return [col for col in df.columns if re.search(pattern, col)] - - return selector(self, func, evaluate_output_names) - - def numeric(self: Self) -> ArrowSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype( - [ - dtypes.Int128, - dtypes.Int64, - dtypes.Int32, - dtypes.Int16, - dtypes.Int8, - dtypes.UInt128, - dtypes.UInt64, - dtypes.UInt32, - dtypes.UInt16, - dtypes.UInt8, - dtypes.Float64, - dtypes.Float32, - ], - ) - - def categorical(self: Self) -> ArrowSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype([dtypes.Categorical]) - - def string(self: Self) -> ArrowSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype([dtypes.String]) - - def boolean(self: Self) -> ArrowSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype([dtypes.Boolean]) - - def all(self: Self) -> ArrowSelector: - def func(df: ArrowDataFrame) -> list[ArrowSeries]: - return [df[col] for col in df.columns] - - return selector(self, func, get_column_names) - - def datetime( - self: Self, - time_unit: TimeUnit | Iterable[TimeUnit] | None, - time_zone: str | timezone | Iterable[str | timezone | None] | None, - ) -> ArrowSelector: - dtypes = import_dtypes_module(version=self._version) - time_units, time_zones = _parse_time_unit_and_time_zone( - time_unit=time_unit, time_zone=time_zone + from narwhals._selectors import EvalNames + from narwhals._selectors import EvalSeries + from narwhals.utils import Version + from narwhals.utils import _FullContext + + +class ArrowSelectorNamespace(CompliantSelectorNamespace["ArrowDataFrame", "ArrowSeries"]): + def _iter_columns(self, df: ArrowDataFrame) -> Iterator[ArrowSeries]: + from narwhals._arrow.series import ArrowSeries + + for col, ser in zip(df.columns, df._native_frame.itercolumns()): + yield ArrowSeries( + ser, name=col, backend_version=df._backend_version, version=df._version + ) + + def _selector( + self, + context: _FullContext, + call: EvalSeries[ArrowDataFrame, ArrowSeries], + evaluate_output_names: EvalNames[ArrowDataFrame], + /, + ) -> CompliantSelector[ArrowDataFrame, ArrowSeries]: + return ArrowSelector( + call, + depth=0, + function_name="selector", + evaluate_output_names=evaluate_output_names, + alias_output_names=None, + backend_version=context._backend_version, + version=context._version, ) - def func(df: ArrowDataFrame) -> list[ArrowSeries]: - return [ - df[col] - for col in df.columns - if dtype_matches_time_unit_and_time_zone( - dtype=df.schema[col], - dtypes=dtypes, - time_units=time_units, - time_zones=time_zones, - ) - ] - def evaluate_output_names(df: ArrowDataFrame) -> Sequence[str]: - return [ - col - for col in df.columns - if dtype_matches_time_unit_and_time_zone( - dtype=df.schema[col], - dtypes=dtypes, - time_units=time_units, - time_zones=time_zones, - ) - ] +class ArrowSelector(CompliantSelector["ArrowDataFrame", "ArrowSeries"], ArrowExpr): # type: ignore[misc] + # TODO @dangotbanned: Remove after merging (https://github.com/narwhals-dev/narwhals/pull/2060) + def __init__(self: Self, *args: Any, version: Version, **kwds: Any) -> None: + super().__init__(*args, version=version, **kwds) + self._version = version - return selector(self, func, evaluate_output_names) + @property + def selectors(self) -> ArrowSelectorNamespace: + return ArrowSelectorNamespace(self) - -class ArrowSelector(ArrowExpr): def __repr__(self: Self) -> str: # pragma: no cover return f"ArrowSelector(depth={self._depth}, function_name={self._function_name})" @@ -136,82 +69,3 @@ def _to_expr(self: Self) -> ArrowExpr: backend_version=self._backend_version, version=self._version, ) - - def __sub__(self: Self, other: Self | Any) -> ArrowSelector | Any: - if isinstance(other, ArrowSelector): - - def call(df: ArrowDataFrame) -> list[ArrowSeries]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - return [x for x, name in zip(lhs, lhs_names) if name not in rhs_names] - - def evaluate_output_names(df: ArrowDataFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [x for x in lhs_names if x not in rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() - other - - def __or__(self: Self, other: Self | Any) -> ArrowSelector | Any: - if isinstance(other, ArrowSelector): - - def call(df: ArrowDataFrame) -> list[ArrowSeries]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - rhs = other._call(df) - return [ - *(x for x, name in zip(lhs, lhs_names) if name not in rhs_names), - *rhs, - ] - - def evaluate_output_names(df: ArrowDataFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [*(x for x in lhs_names if x not in rhs_names), *rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() | other - - def __and__(self: Self, other: Self | Any) -> ArrowSelector | Any: - if isinstance(other, ArrowSelector): - - def call(df: ArrowDataFrame) -> list[ArrowSeries]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - return [x for x, name in zip(lhs, lhs_names) if name in rhs_names] - - def evaluate_output_names(df: ArrowDataFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [x for x in lhs_names if x in rhs_names] - - return selector(self, call, evaluate_output_names) - - else: - return self._to_expr() & other - - def __invert__(self: Self) -> ArrowSelector: - return ArrowSelectorNamespace(self).all() - self - - -def selector( - context: _LimitedContext, - call: Callable[[ArrowDataFrame], Sequence[ArrowSeries]], - evaluate_output_names: Callable[[ArrowDataFrame], Sequence[str]], - /, -) -> ArrowSelector: - return ArrowSelector( - call, - depth=0, - function_name="selector", - evaluate_output_names=evaluate_output_names, - alias_output_names=None, - backend_version=context._backend_version, - version=context._version, - ) From 8bf4a491eb2ea8355ae6a0f8ade6067412392ef6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 17:07:51 +0000 Subject: [PATCH 09/55] refactor: utilize updated `CompliantExpr` Possible via #2060 --- narwhals/_arrow/selectors.py | 7 ------- narwhals/_pandas_like/selectors.py | 7 ------- narwhals/_selectors.py | 4 ---- 3 files changed, 18 deletions(-) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index 663945a0f3..896db5bf21 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Any from typing import Iterator from narwhals._arrow.expr import ArrowExpr @@ -15,7 +14,6 @@ from narwhals._arrow.series import ArrowSeries from narwhals._selectors import EvalNames from narwhals._selectors import EvalSeries - from narwhals.utils import Version from narwhals.utils import _FullContext @@ -47,11 +45,6 @@ def _selector( class ArrowSelector(CompliantSelector["ArrowDataFrame", "ArrowSeries"], ArrowExpr): # type: ignore[misc] - # TODO @dangotbanned: Remove after merging (https://github.com/narwhals-dev/narwhals/pull/2060) - def __init__(self: Self, *args: Any, version: Version, **kwds: Any) -> None: - super().__init__(*args, version=version, **kwds) - self._version = version - @property def selectors(self) -> ArrowSelectorNamespace: return ArrowSelectorNamespace(self) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 6404a6f4b1..2a71f9a1f3 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -2,7 +2,6 @@ from functools import partial from typing import TYPE_CHECKING -from typing import Any from typing import Iterator from narwhals._pandas_like.dataframe import PandasLikeDataFrame @@ -18,7 +17,6 @@ from narwhals._pandas_like.series import PandasLikeSeries from narwhals._selectors import EvalNames from narwhals._selectors import EvalSeries - from narwhals.utils import Version from narwhals.utils import _FullContext @@ -63,11 +61,6 @@ def _selector( class PandasSelector( # type: ignore[misc] CompliantSelector["PandasLikeDataFrame", "PandasLikeSeries"], PandasLikeExpr ): - # TODO @dangotbanned: Remove after merging (https://github.com/narwhals-dev/narwhals/pull/2060) - def __init__(self: Self, *args: Any, version: Version, **kwds: Any) -> None: - super().__init__(*args, version=version, **kwds) - self._version = version - @property def selectors(self) -> PandasSelectorNamespace: return PandasSelectorNamespace(self) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 89a77f9148..707715fac2 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -159,11 +159,7 @@ def __init__(self: Self, context: _FullContext, /) -> None: self._version = context._version -# NOTE: CompliantExpr already provides `_implementation`, `_backend_version` -# https://github.com/narwhals-dev/narwhals/pull/2060 class CompliantSelector(CompliantExpr[SeriesT], Generic[DataFrameT, SeriesT], Protocol): - _version: Version - @property def selectors(self) -> CompliantSelectorNamespace[DataFrameT, SeriesT]: ... def __repr__(self: Self) -> str: ... From 1e2e151e2a2a43ba0c24847108fe4213fdeb3943 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 17:19:42 +0000 Subject: [PATCH 10/55] fix(typing): update `_dask` collect as well https://github.com/narwhals-dev/narwhals/actions/runs/13461786363/job/37618653099?pr=2064 --- narwhals/_dask/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 935624c4fa..910c7781e5 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -30,6 +30,7 @@ from narwhals._dask.expr import DaskExpr from narwhals._dask.group_by import DaskLazyGroupBy from narwhals._dask.namespace import DaskNamespace + from narwhals._polars.dataframe import PolarsDataFrame from narwhals.dtypes import DType from narwhals.utils import Version @@ -94,7 +95,7 @@ def collect( self: Self, backend: Implementation | None, **kwargs: Any, - ) -> CompliantDataFrame: + ) -> CompliantDataFrame[Any] | PolarsDataFrame: import pandas as pd result = self._native_frame.compute(**kwargs) From 5a36c81b4e336c8b1ce115effbad88803e782bfa Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 18:16:26 +0000 Subject: [PATCH 11/55] fix: maybe resolve `<3.11` protocol bug https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1965921386 --- narwhals/_arrow/selectors.py | 5 +++++ narwhals/_pandas_like/selectors.py | 5 +++++ narwhals/_selectors.py | 3 +++ 3 files changed, 13 insertions(+) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index 896db5bf21..408e753d55 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -43,6 +43,11 @@ def _selector( version=context._version, ) + def __init__(self: Self, context: _FullContext, /) -> None: + self._implementation = context._implementation + self._backend_version = context._backend_version + self._version = context._version + class ArrowSelector(CompliantSelector["ArrowDataFrame", "ArrowSeries"], ArrowExpr): # type: ignore[misc] @property diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 2a71f9a1f3..4dcace8386 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -57,6 +57,11 @@ def _selector( kwargs={}, ) + def __init__(self: Self, context: _FullContext, /) -> None: + self._implementation = context._implementation + self._backend_version = context._backend_version + self._version = context._version + class PandasSelector( # type: ignore[misc] CompliantSelector["PandasLikeDataFrame", "PandasLikeSeries"], PandasLikeExpr diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 707715fac2..5d3d92b21c 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -153,6 +153,9 @@ def names(df: DataFrameT) -> Sequence[str]: return self._selector(self, series, names) + # NOTE: Can't reuse for `<3.11` + # - https://github.com/python/cpython/issues/88970 + # - https://github.com/python/cpython/pull/31628 def __init__(self: Self, context: _FullContext, /) -> None: self._implementation = context._implementation self._backend_version = context._backend_version From 5e453500ea05eaefb316fd0904b01c856e4e15ef Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 18:25:37 +0000 Subject: [PATCH 12/55] ignore coverage on init https://github.com/narwhals-dev/narwhals/actions/runs/13462760425/job/37621753052?pr=2064 --- narwhals/_selectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 5d3d92b21c..6e4ed6d67d 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -156,7 +156,7 @@ def names(df: DataFrameT) -> Sequence[str]: # NOTE: Can't reuse for `<3.11` # - https://github.com/python/cpython/issues/88970 # - https://github.com/python/cpython/pull/31628 - def __init__(self: Self, context: _FullContext, /) -> None: + def __init__(self: Self, context: _FullContext, /) -> None: # pragma: no cover self._implementation = context._implementation self._backend_version = context._backend_version self._version = context._version From 42c2f4171c71db4781910945492a79d939066912 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 19:06:59 +0000 Subject: [PATCH 13/55] refactor: Also drop `kwargs` in `_to_expr` Now is always empty #2059 --- narwhals/_pandas_like/selectors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 316365d01f..d764b80751 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -84,5 +84,4 @@ def _to_expr(self: Self) -> PandasLikeExpr: implementation=self._implementation, backend_version=self._backend_version, version=self._version, - kwargs=self._kwargs, ) From 4e89a2e8f03e3ccc72e45ea8d9c207b86744f312 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 22:20:40 +0000 Subject: [PATCH 14/55] add `CompliantLazyFrame.columns` --- narwhals/typing.py | 2 ++ narwhals/utils.py | 9 +++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/narwhals/typing.py b/narwhals/typing.py index 22095eb0a7..38b66db09a 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -80,6 +80,8 @@ def aggregate(self, *exprs: Any) -> Self: ... # `select` where all args are aggregations or literals # (so, no broadcasting is necessary). + @property + def columns(self) -> Sequence[str]: ... class CompliantExpr(Protocol, Generic[CompliantSeriesT_co]): _implementation: Implementation diff --git a/narwhals/utils.py b/narwhals/utils.py index 141423d200..bddc9ad1cd 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -10,7 +10,6 @@ from typing import TYPE_CHECKING from typing import Any from typing import Iterable -from typing import Iterator from typing import Sequence from typing import TypeVar from typing import Union @@ -1305,14 +1304,12 @@ def dtype_matches_time_unit_and_time_zone( ) -def get_column_names(df: NativeFrame | CompliantDataFrame) -> Sequence[str]: +def get_column_names( + df: NativeFrame | CompliantDataFrame | CompliantLazyFrame, +) -> Sequence[str]: return df.columns -def iter_columns(df: NativeFrame) -> Iterator[str]: # pragma: no cover - yield from df.columns - - def _hasattr_static(obj: Any, attr: str) -> bool: sentinel = object() return getattr_static(obj, attr, sentinel) is not sentinel From 9210a17cfc213a6d60849baea6bda2377af6d770 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 22:21:53 +0000 Subject: [PATCH 15/55] chore(typing): add `Compliant(Lazy|Data)Frame.schema` --- narwhals/typing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/narwhals/typing.py b/narwhals/typing.py index 38b66db09a..3b7075cc06 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -12,6 +12,7 @@ if TYPE_CHECKING: from types import ModuleType + from typing import Mapping import numpy as np from typing_extensions import Self @@ -67,6 +68,8 @@ def aggregate(self, *exprs: Any) -> Self: @property def columns(self) -> Sequence[str]: ... + @property + def schema(self) -> Mapping[str, DType]: ... def get_column(self, name: str) -> CompliantSeriesT_co: ... @@ -82,6 +85,9 @@ def aggregate(self, *exprs: Any) -> Self: @property def columns(self) -> Sequence[str]: ... + @property + def schema(self) -> Mapping[str, DType]: ... + class CompliantExpr(Protocol, Generic[CompliantSeriesT_co]): _implementation: Implementation From 5382778ac66ce6636e669e94a686a366dd977413 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 22:24:23 +0000 Subject: [PATCH 16/55] feat: add some default iteration methods Lazy-support needs to be able to override them --- narwhals/_selectors.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 6e4ed6d67d..7900dd8dc6 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -64,6 +64,16 @@ class CompliantSelectorNamespace(Generic[DataFrameT, SeriesT], Protocol): # Only need internally, but it plugs so many holes that it must be useful beyond that # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_columns.html def _iter_columns(self, df: DataFrameT, /) -> Iterator[SeriesT]: ... + def _iter_schema(self, df: DataFrameT, /) -> Iterator[tuple[str, DType]]: + for ser in self._iter_columns(df): + yield ser.name, ser.dtype + + def _iter_columns_dtypes(self, df: DataFrameT, /) -> Iterator[tuple[SeriesT, DType]]: + for ser in self._iter_columns(df): + yield ser, ser.dtype + + def _iter_columns_names(self, df: DataFrameT, /) -> Iterator[tuple[SeriesT, str]]: + yield from zip(self._iter_columns(df), df.columns) def _selector( self, @@ -73,6 +83,9 @@ def _selector( /, ) -> CompliantSelector[DataFrameT, SeriesT]: ... + # NOTE: `.dtype` won't return a `nw.DType` (or maybe anything) for lazy backends + # - Their `SeriesT` is a native object + # - See (https://github.com/narwhals-dev/narwhals/issues/2044) def _is_dtype( self: CompliantSelectorNamespace[DataFrameT, SeriesT], dtype: type[DType], / ) -> CompliantSelector[DataFrameT, SeriesT]: From 91c86a08d11b8bd5a2e80aebdcc0ddb3cc44fb3d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 22:25:49 +0000 Subject: [PATCH 17/55] feat: build out lazy-support --- narwhals/_selectors.py | 44 +++++++++++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 7900dd8dc6..a6532d1072 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -23,6 +23,7 @@ from narwhals.utils import dtype_matches_time_unit_and_time_zone from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module +from narwhals.utils import is_compliant_dataframe if TYPE_CHECKING: from datetime import timezone @@ -33,6 +34,7 @@ from narwhals.dtypes import DType from narwhals.typing import CompliantDataFrame + from narwhals.typing import CompliantLazyFrame from narwhals.typing import CompliantSeries from narwhals.typing import TimeUnit from narwhals.utils import Implementation @@ -46,7 +48,7 @@ def dtype(self) -> DType: ... SeriesT = TypeVar("SeriesT", bound="CompliantSeriesWithDType") -DataFrameT = TypeVar("DataFrameT", bound="CompliantDataFrame") +DataFrameT = TypeVar("DataFrameT", bound="CompliantDataFrame | CompliantLazyFrame") SelectorOrExpr: TypeAlias = ( "CompliantSelector[DataFrameT, SeriesT] | CompliantExpr[SeriesT]" ) @@ -90,23 +92,23 @@ def _is_dtype( self: CompliantSelectorNamespace[DataFrameT, SeriesT], dtype: type[DType], / ) -> CompliantSelector[DataFrameT, SeriesT]: def series(df: DataFrameT) -> Sequence[SeriesT]: - return [ser for ser in self._iter_columns(df) if isinstance(ser.dtype, dtype)] - - def names(df: DataFrameT) -> Sequence[str]: return [ - ser.name for ser in self._iter_columns(df) if isinstance(ser.dtype, dtype) + ser for ser, tp in self._iter_columns_dtypes(df) if isinstance(tp, dtype) ] + def names(df: DataFrameT) -> Sequence[str]: + return [name for name, tp in self._iter_schema(df) if isinstance(tp, dtype)] + return self._selector(self, series, names) def by_dtype( self: Self, dtypes: Collection[DType | type[DType]] ) -> CompliantSelector[DataFrameT, SeriesT]: def series(df: DataFrameT) -> Sequence[SeriesT]: - return [ser for ser in self._iter_columns(df) if ser.dtype in dtypes] + return [ser for ser, tp in self._iter_columns_dtypes(df) if tp in dtypes] def names(df: DataFrameT) -> Sequence[str]: - return [ser.name for ser in self._iter_columns(df) if ser.dtype in dtypes] + return [name for name, tp in self._iter_schema(df) if tp in dtypes] return self._selector(self, series, names) @@ -114,7 +116,11 @@ def matches(self: Self, pattern: str) -> CompliantSelector[DataFrameT, SeriesT]: p = re.compile(pattern) def series(df: DataFrameT) -> Sequence[SeriesT]: - return [df.get_column(col) for col in df.columns if p.search(col)] + # NOTE: Possibly cheaper than lazyframe? + if is_compliant_dataframe(df): + return [df.get_column(col) for col in df.columns if p.search(col)] + + return [ser for ser, name in self._iter_columns_names(df) if p.search(name)] def names(df: DataFrameT) -> Sequence[str]: return [col for col in df.columns if p.search(col)] @@ -123,10 +129,10 @@ def names(df: DataFrameT) -> Sequence[str]: def numeric(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: def series(df: DataFrameT) -> Sequence[SeriesT]: - return [ser for ser in self._iter_columns(df) if ser.dtype.is_numeric()] + return [ser for ser, tp in self._iter_columns_dtypes(df) if tp.is_numeric()] def names(df: DataFrameT) -> Sequence[str]: - return [ser.name for ser in self._iter_columns(df) if ser.dtype.is_numeric()] + return [name for name, tp in self._iter_schema(df) if tp.is_numeric()] return self._selector(self, series, names) @@ -159,10 +165,10 @@ def datetime( ) def series(df: DataFrameT) -> Sequence[SeriesT]: - return [ser for ser in self._iter_columns(df) if matches(ser.dtype)] + return [ser for ser, tp in self._iter_columns_dtypes(df) if matches(tp)] def names(df: DataFrameT) -> Sequence[str]: - return [ser.name for ser in self._iter_columns(df) if matches(ser.dtype)] + return [name for name, tp in self._iter_schema(df) if matches(tp)] return self._selector(self, series, names) @@ -175,6 +181,18 @@ def __init__(self: Self, context: _FullContext, /) -> None: # pragma: no cover self._version = context._version +class LazySelectorNamespace( + CompliantSelectorNamespace[DataFrameT, SeriesT], + Generic[DataFrameT, SeriesT], + Protocol, +): + def _iter_schema(self, df: DataFrameT) -> Iterator[tuple[str, DType]]: + yield from df.schema.items() + + def _iter_columns_dtypes(self, df: DataFrameT, /) -> Iterator[tuple[SeriesT, DType]]: + yield from zip(self._iter_columns(df), df.schema.values()) + + class CompliantSelector(CompliantExpr[SeriesT], Generic[DataFrameT, SeriesT], Protocol): @property def selectors(self) -> CompliantSelectorNamespace[DataFrameT, SeriesT]: ... @@ -264,6 +282,6 @@ def __invert__( # NOTE: Should probably be a `DataFrame` method # Using `Expr` because this doesn't require `Selector` attrs/methods def _eval_lhs_rhs( - df: CompliantDataFrame, lhs: CompliantExpr, rhs: CompliantExpr + df: CompliantDataFrame | CompliantLazyFrame, lhs: CompliantExpr, rhs: CompliantExpr ) -> tuple[Sequence[str], Sequence[str]]: return lhs._evaluate_output_names(df), rhs._evaluate_output_names(df) From a8c9d1358f83eaa3b530cd44090d492fcac4aab8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 22:27:49 +0000 Subject: [PATCH 18/55] feat: reimplement `_dask.selectors` - Working locally - `pyright` is very unhappy with `dx.Series` being used --- narwhals/_dask/namespace.py | 3 + narwhals/_dask/selectors.py | 224 ++++++------------------------------ 2 files changed, 38 insertions(+), 189 deletions(-) diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 5946496957..dc35657b4a 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -24,6 +24,7 @@ from narwhals._expression_parsing import combine_alias_output_names from narwhals._expression_parsing import combine_evaluate_output_names from narwhals.typing import CompliantNamespace +from narwhals.utils import Implementation from narwhals.utils import get_column_names from narwhals.utils import is_compliant_expr @@ -40,6 +41,8 @@ class DaskNamespace(CompliantNamespace["dx.Series"]): + _implementation: Implementation = Implementation.DASK + @property def selectors(self: Self) -> DaskSelectorNamespace: return DaskSelectorNamespace(self) diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 1ea275aca8..ee85158758 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -1,17 +1,12 @@ from __future__ import annotations -import re from typing import TYPE_CHECKING -from typing import Any -from typing import Callable -from typing import Iterable -from typing import Sequence +from typing import Iterator +from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr -from narwhals.utils import _parse_time_unit_and_time_zone -from narwhals.utils import dtype_matches_time_unit_and_time_zone -from narwhals.utils import get_column_names -from narwhals.utils import import_dtypes_module +from narwhals._selectors import CompliantSelector +from narwhals._selectors import LazySelectorNamespace if TYPE_CHECKING: try: @@ -19,14 +14,12 @@ except ModuleNotFoundError: import dask_expr as dx - from datetime import timezone - from typing_extensions import Self from narwhals._dask.dataframe import DaskLazyFrame - from narwhals.dtypes import DType - from narwhals.typing import TimeUnit - from narwhals.utils import _LimitedContext + from narwhals._selectors import EvalNames + from narwhals._selectors import EvalSeries + from narwhals.utils import _FullContext try: import dask.dataframe.dask_expr as dx @@ -34,108 +27,39 @@ import dask_expr as dx -class DaskSelectorNamespace: - def __init__(self: Self, context: _LimitedContext, /) -> None: - self._backend_version = context._backend_version - self._version = context._version - - def by_dtype(self: Self, dtypes: Iterable[DType | type[DType]]) -> DaskSelector: - def func(df: DaskLazyFrame) -> list[dx.Series]: - return [ - df._native_frame[col] for col in df.columns if df.schema[col] in dtypes - ] - - def evaluate_output_names(df: DaskLazyFrame) -> Sequence[str]: - return [col for col in df.columns if df.schema[col] in dtypes] - - return selector(self, func, evaluate_output_names) - - def matches(self: Self, pattern: str) -> DaskSelector: - def func(df: DaskLazyFrame) -> list[dx.Series]: - return [ - df._native_frame[col] for col in df.columns if re.search(pattern, col) - ] - - def evaluate_output_names(df: DaskLazyFrame) -> Sequence[str]: - return [col for col in df.columns if re.search(pattern, col)] - - return selector(self, func, evaluate_output_names) - - def numeric(self: Self) -> DaskSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype( - { - dtypes.Int128, - dtypes.Int64, - dtypes.Int32, - dtypes.Int16, - dtypes.Int8, - dtypes.UInt128, - dtypes.UInt64, - dtypes.UInt32, - dtypes.UInt16, - dtypes.UInt8, - dtypes.Float64, - dtypes.Float32, - }, +class DaskSelectorNamespace(LazySelectorNamespace["DaskLazyFrame", "dx.Series"]): + def _iter_columns(self, df: DaskLazyFrame) -> Iterator[dx.Series]: + for _col, ser in df._native_frame.items(): # noqa: PERF102 + yield ser + + def _selector( + self, + context: _FullContext, + call: EvalSeries[DaskLazyFrame, dx.Series], + evaluate_output_names: EvalNames[DaskLazyFrame], + /, + ) -> CompliantSelector[DaskLazyFrame, dx.Series]: + return DaskSelector( + call, + depth=0, + function_name="selector", + evaluate_output_names=evaluate_output_names, + alias_output_names=None, + backend_version=context._backend_version, + version=context._version, ) - def categorical(self: Self) -> DaskSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.Categorical}) - - def string(self: Self) -> DaskSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.String}) - - def boolean(self: Self) -> DaskSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.Boolean}) - - def all(self: Self) -> DaskSelector: - def func(df: DaskLazyFrame) -> list[dx.Series]: - return [df._native_frame[col] for col in df.columns] - - return selector(self, func, get_column_names) - - def datetime( - self: Self, - time_unit: TimeUnit | Iterable[TimeUnit] | None, - time_zone: str | timezone | Iterable[str | timezone | None] | None, - ) -> DaskSelector: # pragma: no cover - dtypes = import_dtypes_module(version=self._version) - time_units, time_zones = _parse_time_unit_and_time_zone( - time_unit=time_unit, time_zone=time_zone - ) - - def func(df: DaskLazyFrame) -> list[dx.Series]: - return [ - df._native_frame[col] - for col in df.columns - if dtype_matches_time_unit_and_time_zone( - dtype=df.schema[col], - dtypes=dtypes, - time_units=time_units, - time_zones=time_zones, - ) - ] - - def evaluate_output_names(df: DaskLazyFrame) -> Sequence[str]: - return [ - col - for col in df.columns - if dtype_matches_time_unit_and_time_zone( - dtype=df.schema[col], - dtypes=dtypes, - time_units=time_units, - time_zones=time_zones, - ) - ] + def __init__(self: Self, context: _FullContext, /) -> None: + self._implementation = context._implementation + self._backend_version = context._backend_version + self._version = context._version - return selector(self, func, evaluate_output_names) +class DaskSelector(CompliantSelector["DaskLazyFrame", "dx.Series"], DaskExpr): # type: ignore[misc] + @property + def selectors(self) -> DaskSelectorNamespace: + return DaskSelectorNamespace(self) -class DaskSelector(DaskExpr): def __repr__(self: Self) -> str: # pragma: no cover return f"DaskSelector(depth={self._depth}, function_name={self._function_name})" @@ -149,81 +73,3 @@ def _to_expr(self: Self) -> DaskExpr: backend_version=self._backend_version, version=self._version, ) - - def __sub__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: - if isinstance(other, DaskSelector): - - def call(df: DaskLazyFrame) -> list[dx.Series]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - return [x for x, name in zip(lhs, lhs_names) if name not in rhs_names] - - def evaluate_output_names(df: DaskLazyFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [x for x in lhs_names if x not in rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() - other - - def __or__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: - if isinstance(other, DaskSelector): - - def call(df: DaskLazyFrame) -> list[dx.Series]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - rhs = other._call(df) - return [ - *(x for x, name in zip(lhs, lhs_names) if name not in rhs_names), - *rhs, - ] - - def evaluate_output_names(df: DaskLazyFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [*(x for x in lhs_names if x not in rhs_names), *rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() | other - - def __and__(self: Self, other: DaskSelector | Any) -> DaskSelector | Any: - if isinstance(other, DaskSelector): - - def call(df: DaskLazyFrame) -> list[dx.Series]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - return [x for x, name in zip(lhs, lhs_names) if name in rhs_names] - - def evaluate_output_names(df: DaskLazyFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [x for x in lhs_names if x in rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() & other - - def __invert__(self: Self) -> DaskSelector: - return DaskSelectorNamespace(self).all() - self - - -def selector( - context: _LimitedContext, - call: Callable[[DaskLazyFrame], Sequence[dx.Series]], - evaluate_output_names: Callable[[DaskLazyFrame], Sequence[str]], - /, -) -> DaskSelector: - return DaskSelector( - call, - depth=0, - function_name="selector", - evaluate_output_names=evaluate_output_names, - alias_output_names=None, - backend_version=context._backend_version, - version=context._version, - ) From d6a2e4138ff56c77b30f2f23c5562ca4568f6d2d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 22:32:10 +0000 Subject: [PATCH 19/55] refactor: rename `DataFrameT` -> `FrameT` --- narwhals/_selectors.py | 113 +++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 60 deletions(-) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index a6532d1072..37429bdc70 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -48,16 +48,14 @@ def dtype(self) -> DType: ... SeriesT = TypeVar("SeriesT", bound="CompliantSeriesWithDType") -DataFrameT = TypeVar("DataFrameT", bound="CompliantDataFrame | CompliantLazyFrame") -SelectorOrExpr: TypeAlias = ( - "CompliantSelector[DataFrameT, SeriesT] | CompliantExpr[SeriesT]" -) -EvalSeries: TypeAlias = Callable[[DataFrameT], Sequence[SeriesT]] -EvalNames: TypeAlias = Callable[[DataFrameT], Sequence[str]] +FrameT = TypeVar("FrameT", bound="CompliantDataFrame | CompliantLazyFrame") +SelectorOrExpr: TypeAlias = "CompliantSelector[FrameT, SeriesT] | CompliantExpr[SeriesT]" +EvalSeries: TypeAlias = Callable[[FrameT], Sequence[SeriesT]] +EvalNames: TypeAlias = Callable[[FrameT], Sequence[str]] # NOTE: Pretty much finished generic for eager backends -class CompliantSelectorNamespace(Generic[DataFrameT, SeriesT], Protocol): +class CompliantSelectorNamespace(Generic[FrameT, SeriesT], Protocol): _implementation: Implementation _backend_version: tuple[int, ...] _version: Version @@ -65,88 +63,88 @@ class CompliantSelectorNamespace(Generic[DataFrameT, SeriesT], Protocol): # TODO @dangotbanned: push for adding to public API for `DataFrame` # Only need internally, but it plugs so many holes that it must be useful beyond that # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_columns.html - def _iter_columns(self, df: DataFrameT, /) -> Iterator[SeriesT]: ... - def _iter_schema(self, df: DataFrameT, /) -> Iterator[tuple[str, DType]]: + def _iter_columns(self, df: FrameT, /) -> Iterator[SeriesT]: ... + def _iter_schema(self, df: FrameT, /) -> Iterator[tuple[str, DType]]: for ser in self._iter_columns(df): yield ser.name, ser.dtype - def _iter_columns_dtypes(self, df: DataFrameT, /) -> Iterator[tuple[SeriesT, DType]]: + def _iter_columns_dtypes(self, df: FrameT, /) -> Iterator[tuple[SeriesT, DType]]: for ser in self._iter_columns(df): yield ser, ser.dtype - def _iter_columns_names(self, df: DataFrameT, /) -> Iterator[tuple[SeriesT, str]]: + def _iter_columns_names(self, df: FrameT, /) -> Iterator[tuple[SeriesT, str]]: yield from zip(self._iter_columns(df), df.columns) def _selector( self, context: _FullContext, - call: EvalSeries[DataFrameT, SeriesT], - evaluate_output_names: EvalNames[DataFrameT], + call: EvalSeries[FrameT, SeriesT], + evaluate_output_names: EvalNames[FrameT], /, - ) -> CompliantSelector[DataFrameT, SeriesT]: ... + ) -> CompliantSelector[FrameT, SeriesT]: ... # NOTE: `.dtype` won't return a `nw.DType` (or maybe anything) for lazy backends # - Their `SeriesT` is a native object # - See (https://github.com/narwhals-dev/narwhals/issues/2044) def _is_dtype( - self: CompliantSelectorNamespace[DataFrameT, SeriesT], dtype: type[DType], / - ) -> CompliantSelector[DataFrameT, SeriesT]: - def series(df: DataFrameT) -> Sequence[SeriesT]: + self: CompliantSelectorNamespace[FrameT, SeriesT], dtype: type[DType], / + ) -> CompliantSelector[FrameT, SeriesT]: + def series(df: FrameT) -> Sequence[SeriesT]: return [ ser for ser, tp in self._iter_columns_dtypes(df) if isinstance(tp, dtype) ] - def names(df: DataFrameT) -> Sequence[str]: + def names(df: FrameT) -> Sequence[str]: return [name for name, tp in self._iter_schema(df) if isinstance(tp, dtype)] return self._selector(self, series, names) def by_dtype( self: Self, dtypes: Collection[DType | type[DType]] - ) -> CompliantSelector[DataFrameT, SeriesT]: - def series(df: DataFrameT) -> Sequence[SeriesT]: + ) -> CompliantSelector[FrameT, SeriesT]: + def series(df: FrameT) -> Sequence[SeriesT]: return [ser for ser, tp in self._iter_columns_dtypes(df) if tp in dtypes] - def names(df: DataFrameT) -> Sequence[str]: + def names(df: FrameT) -> Sequence[str]: return [name for name, tp in self._iter_schema(df) if tp in dtypes] return self._selector(self, series, names) - def matches(self: Self, pattern: str) -> CompliantSelector[DataFrameT, SeriesT]: + def matches(self: Self, pattern: str) -> CompliantSelector[FrameT, SeriesT]: p = re.compile(pattern) - def series(df: DataFrameT) -> Sequence[SeriesT]: + def series(df: FrameT) -> Sequence[SeriesT]: # NOTE: Possibly cheaper than lazyframe? if is_compliant_dataframe(df): return [df.get_column(col) for col in df.columns if p.search(col)] return [ser for ser, name in self._iter_columns_names(df) if p.search(name)] - def names(df: DataFrameT) -> Sequence[str]: + def names(df: FrameT) -> Sequence[str]: return [col for col in df.columns if p.search(col)] return self._selector(self, series, names) - def numeric(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: - def series(df: DataFrameT) -> Sequence[SeriesT]: + def numeric(self: Self) -> CompliantSelector[FrameT, SeriesT]: + def series(df: FrameT) -> Sequence[SeriesT]: return [ser for ser, tp in self._iter_columns_dtypes(df) if tp.is_numeric()] - def names(df: DataFrameT) -> Sequence[str]: + def names(df: FrameT) -> Sequence[str]: return [name for name, tp in self._iter_schema(df) if tp.is_numeric()] return self._selector(self, series, names) - def categorical(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: + def categorical(self: Self) -> CompliantSelector[FrameT, SeriesT]: return self._is_dtype(import_dtypes_module(self._version).Categorical) - def string(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: + def string(self: Self) -> CompliantSelector[FrameT, SeriesT]: return self._is_dtype(import_dtypes_module(self._version).String) - def boolean(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: + def boolean(self: Self) -> CompliantSelector[FrameT, SeriesT]: return self._is_dtype(import_dtypes_module(self._version).Boolean) - def all(self: Self) -> CompliantSelector[DataFrameT, SeriesT]: - def series(df: DataFrameT) -> Sequence[SeriesT]: + def all(self: Self) -> CompliantSelector[FrameT, SeriesT]: + def series(df: FrameT) -> Sequence[SeriesT]: return list(self._iter_columns(df)) return self._selector(self, series, get_column_names) @@ -155,7 +153,7 @@ def datetime( self: Self, time_unit: TimeUnit | Iterable[TimeUnit] | None, time_zone: str | timezone | Iterable[str | timezone | None] | None, - ) -> CompliantSelector[DataFrameT, SeriesT]: + ) -> CompliantSelector[FrameT, SeriesT]: time_units, time_zones = _parse_time_unit_and_time_zone(time_unit, time_zone) matches = partial( dtype_matches_time_unit_and_time_zone, @@ -164,10 +162,10 @@ def datetime( time_zones=time_zones, ) - def series(df: DataFrameT) -> Sequence[SeriesT]: + def series(df: FrameT) -> Sequence[SeriesT]: return [ser for ser, tp in self._iter_columns_dtypes(df) if matches(tp)] - def names(df: DataFrameT) -> Sequence[str]: + def names(df: FrameT) -> Sequence[str]: return [name for name, tp in self._iter_schema(df) if matches(tp)] return self._selector(self, series, names) @@ -182,27 +180,24 @@ def __init__(self: Self, context: _FullContext, /) -> None: # pragma: no cover class LazySelectorNamespace( - CompliantSelectorNamespace[DataFrameT, SeriesT], - Generic[DataFrameT, SeriesT], - Protocol, + CompliantSelectorNamespace[FrameT, SeriesT], Generic[FrameT, SeriesT], Protocol ): - def _iter_schema(self, df: DataFrameT) -> Iterator[tuple[str, DType]]: + def _iter_schema(self, df: FrameT) -> Iterator[tuple[str, DType]]: yield from df.schema.items() - def _iter_columns_dtypes(self, df: DataFrameT, /) -> Iterator[tuple[SeriesT, DType]]: + def _iter_columns_dtypes(self, df: FrameT, /) -> Iterator[tuple[SeriesT, DType]]: yield from zip(self._iter_columns(df), df.schema.values()) -class CompliantSelector(CompliantExpr[SeriesT], Generic[DataFrameT, SeriesT], Protocol): +class CompliantSelector(CompliantExpr[SeriesT], Generic[FrameT, SeriesT], Protocol): @property - def selectors(self) -> CompliantSelectorNamespace[DataFrameT, SeriesT]: ... + def selectors(self) -> CompliantSelectorNamespace[FrameT, SeriesT]: ... def __repr__(self: Self) -> str: ... def _to_expr(self: Self) -> CompliantExpr[SeriesT]: ... def _is_selector( - self: Self, - other: Self | CompliantExpr[SeriesT], - ) -> TypeIs[CompliantSelector[DataFrameT, SeriesT]]: + self: Self, other: Self | CompliantExpr[SeriesT] + ) -> TypeIs[CompliantSelector[FrameT, SeriesT]]: return isinstance(other, type(self)) @overload @@ -210,17 +205,17 @@ def __sub__(self: Self, other: Self) -> Self: ... @overload def __sub__(self: Self, other: CompliantExpr[SeriesT]) -> CompliantExpr[SeriesT]: ... def __sub__( - self: Self, other: SelectorOrExpr[DataFrameT, SeriesT] - ) -> SelectorOrExpr[DataFrameT, SeriesT]: + self: Self, other: SelectorOrExpr[FrameT, SeriesT] + ) -> SelectorOrExpr[FrameT, SeriesT]: if self._is_selector(other): - def series(df: DataFrameT) -> Sequence[SeriesT]: + def series(df: FrameT) -> Sequence[SeriesT]: lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) return [ x for x, name in zip(self(df), lhs_names) if name not in rhs_names ] - def names(df: DataFrameT) -> Sequence[str]: + def names(df: FrameT) -> Sequence[str]: lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) return [x for x in lhs_names if x not in rhs_names] @@ -233,18 +228,18 @@ def __or__(self: Self, other: Self) -> Self: ... @overload def __or__(self: Self, other: CompliantExpr[SeriesT]) -> CompliantExpr[SeriesT]: ... def __or__( - self: Self, other: SelectorOrExpr[DataFrameT, SeriesT] - ) -> SelectorOrExpr[DataFrameT, SeriesT]: + self: Self, other: SelectorOrExpr[FrameT, SeriesT] + ) -> SelectorOrExpr[FrameT, SeriesT]: if self._is_selector(other): - def names(df: DataFrameT) -> Sequence[SeriesT]: + def names(df: FrameT) -> Sequence[SeriesT]: lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) return [ *(x for x, name in zip(self(df), lhs_names) if name not in rhs_names), *other(df), ] - def series(df: DataFrameT) -> Sequence[str]: + def series(df: FrameT) -> Sequence[str]: lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) return [*(x for x in lhs_names if x not in rhs_names), *rhs_names] @@ -257,15 +252,15 @@ def __and__(self: Self, other: Self) -> Self: ... @overload def __and__(self: Self, other: CompliantExpr[SeriesT]) -> CompliantExpr[SeriesT]: ... def __and__( - self: Self, other: SelectorOrExpr[DataFrameT, SeriesT] - ) -> SelectorOrExpr[DataFrameT, SeriesT]: + self: Self, other: SelectorOrExpr[FrameT, SeriesT] + ) -> SelectorOrExpr[FrameT, SeriesT]: if self._is_selector(other): - def series(df: DataFrameT) -> Sequence[SeriesT]: + def series(df: FrameT) -> Sequence[SeriesT]: lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) return [x for x, name in zip(self(df), lhs_names) if name in rhs_names] - def names(df: DataFrameT) -> Sequence[str]: + def names(df: FrameT) -> Sequence[str]: lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) return [x for x in lhs_names if x in rhs_names] @@ -273,9 +268,7 @@ def names(df: DataFrameT) -> Sequence[str]: else: return self._to_expr() & other - def __invert__( - self: Self, - ) -> CompliantSelector[DataFrameT, SeriesT]: + def __invert__(self: Self) -> CompliantSelector[FrameT, SeriesT]: return self.selectors.all() - self From f32c3487e8f5f78ae88efcf8204655e9140cf28e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 23:33:15 +0000 Subject: [PATCH 20/55] feat: reimplement `_duckdb.selectors` - Haven't got a local install - Expecting to work the same as `_dask` in CI --- narwhals/_duckdb/namespace.py | 3 + narwhals/_duckdb/selectors.py | 228 ++++++---------------------------- 2 files changed, 42 insertions(+), 189 deletions(-) diff --git a/narwhals/_duckdb/namespace.py b/narwhals/_duckdb/namespace.py index 56b3bf4a4f..fa5a9ee7fd 100644 --- a/narwhals/_duckdb/namespace.py +++ b/narwhals/_duckdb/namespace.py @@ -24,6 +24,7 @@ from narwhals._expression_parsing import combine_alias_output_names from narwhals._expression_parsing import combine_evaluate_output_names from narwhals.typing import CompliantNamespace +from narwhals.utils import Implementation from narwhals.utils import get_column_names if TYPE_CHECKING: @@ -36,6 +37,8 @@ class DuckDBNamespace(CompliantNamespace["duckdb.Expression"]): # type: ignore[type-var] + _implementation: Implementation = Implementation.DUCKDB + def __init__( self: Self, *, backend_version: tuple[int, ...], version: Version ) -> None: diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py index 254f41152a..5b4627abac 100644 --- a/narwhals/_duckdb/selectors.py +++ b/narwhals/_duckdb/selectors.py @@ -1,134 +1,61 @@ from __future__ import annotations -import re from typing import TYPE_CHECKING -from typing import Any -from typing import Callable -from typing import Iterable -from typing import Sequence +from typing import Iterator from duckdb import ColumnExpression from narwhals._duckdb.expr import DuckDBExpr -from narwhals.utils import _parse_time_unit_and_time_zone -from narwhals.utils import dtype_matches_time_unit_and_time_zone -from narwhals.utils import get_column_names -from narwhals.utils import import_dtypes_module +from narwhals._selectors import CompliantSelector +from narwhals._selectors import LazySelectorNamespace if TYPE_CHECKING: - from datetime import timezone - import duckdb from typing_extensions import Self from narwhals._duckdb.dataframe import DuckDBLazyFrame - from narwhals.dtypes import DType - from narwhals.typing import TimeUnit - from narwhals.utils import _LimitedContext - + from narwhals._selectors import EvalNames + from narwhals._selectors import EvalSeries + from narwhals.utils import _FullContext + + +class DuckDBSelectorNamespace( + LazySelectorNamespace["DuckDBLazyFrame", "duckdb.Expression"] # type: ignore[type-var] +): + def _iter_columns(self, df: DuckDBLazyFrame) -> Iterator[duckdb.Expression]: + for col in df.columns: + yield ColumnExpression(col) + + def _selector( + self, + context: _FullContext, + call: EvalSeries[DuckDBLazyFrame, duckdb.Expression], # type: ignore[type-var] + evaluate_output_names: EvalNames[DuckDBLazyFrame], + /, + ) -> CompliantSelector[DuckDBLazyFrame, duckdb.Expression]: # type: ignore[type-var] + return DuckDBSelector( + call, + function_name="selector", + evaluate_output_names=evaluate_output_names, + alias_output_names=None, + backend_version=context._backend_version, + version=context._version, + ) -class DuckDBSelectorNamespace: - def __init__(self: Self, context: _LimitedContext, /) -> None: + def __init__(self: Self, context: _FullContext, /) -> None: + self._implementation = context._implementation self._backend_version = context._backend_version self._version = context._version - def by_dtype(self: Self, dtypes: Iterable[DType | type[DType]]) -> DuckDBSelector: - def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: - return [ - ColumnExpression(col) for col in df.columns if df.schema[col] in dtypes - ] - - def evaluate_output_names(df: DuckDBLazyFrame) -> Sequence[str]: - return [col for col in df.columns if df.schema[col] in dtypes] - - return selector(self, func, evaluate_output_names) - - def matches(self: Self, pattern: str) -> DuckDBSelector: - def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: - return [ - ColumnExpression(col) for col in df.columns if re.search(pattern, col) - ] - - def evaluate_output_names(df: DuckDBLazyFrame) -> Sequence[str]: - return [col for col in df.columns if re.search(pattern, col)] - - return selector(self, func, evaluate_output_names) - - def numeric(self: Self) -> DuckDBSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype( - { - dtypes.Int128, - dtypes.Int64, - dtypes.Int32, - dtypes.Int16, - dtypes.Int8, - dtypes.UInt128, - dtypes.UInt64, - dtypes.UInt32, - dtypes.UInt16, - dtypes.UInt8, - dtypes.Float64, - dtypes.Float32, - }, - ) - - def categorical(self: Self) -> DuckDBSelector: # pragma: no cover - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.Categorical}) - - def string(self: Self) -> DuckDBSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.String}) - - def boolean(self: Self) -> DuckDBSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.Boolean}) - - def all(self: Self) -> DuckDBSelector: - def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: - return [ColumnExpression(col) for col in df.columns] - - return selector(self, func, get_column_names) - - def datetime( - self: Self, - time_unit: TimeUnit | Iterable[TimeUnit] | None, - time_zone: str | timezone | Iterable[str | timezone | None] | None, - ) -> DuckDBSelector: - dtypes = import_dtypes_module(version=self._version) - time_units, time_zones = _parse_time_unit_and_time_zone( - time_unit=time_unit, time_zone=time_zone - ) - - def func(df: DuckDBLazyFrame) -> list[duckdb.Expression]: - return [ - ColumnExpression(col) - for col in df.columns - if dtype_matches_time_unit_and_time_zone( - dtype=df.schema[col], - dtypes=dtypes, - time_units=time_units, - time_zones=time_zones, - ) - ] - def evaluate_output_names(df: DuckDBLazyFrame) -> Sequence[str]: - return [ - col - for col in df.columns - if dtype_matches_time_unit_and_time_zone( - dtype=df.schema[col], - dtypes=dtypes, - time_units=time_units, - time_zones=time_zones, - ) - ] +class DuckDBSelector( # type: ignore[misc] + CompliantSelector["DuckDBLazyFrame", "duckdb.Expression"], # type: ignore[type-var] + DuckDBExpr, +): + @property + def selectors(self) -> DuckDBSelectorNamespace: + return DuckDBSelectorNamespace(self) - return selector(self, func, evaluate_output_names) - - -class DuckDBSelector(DuckDBExpr): def __repr__(self: Self) -> str: # pragma: no cover return f"DuckDBSelector(function_name={self._function_name})" @@ -141,80 +68,3 @@ def _to_expr(self: Self) -> DuckDBExpr: backend_version=self._backend_version, version=self._version, ) - - def __sub__(self: Self, other: DuckDBSelector | Any) -> DuckDBSelector | Any: - if isinstance(other, DuckDBSelector): - - def call(df: DuckDBLazyFrame) -> list[duckdb.Expression]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - return [x for x, name in zip(lhs, lhs_names) if name not in rhs_names] - - def evaluate_output_names(df: DuckDBLazyFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [x for x in lhs_names if x not in rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() - other - - def __or__(self: Self, other: DuckDBSelector | Any) -> DuckDBSelector | Any: - if isinstance(other, DuckDBSelector): - - def call(df: DuckDBLazyFrame) -> list[duckdb.Expression]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - rhs = other._call(df) - return [ - *(x for x, name in zip(lhs, lhs_names) if name not in rhs_names), - *rhs, - ] - - def evaluate_output_names(df: DuckDBLazyFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [*(x for x in lhs_names if x not in rhs_names), *rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() | other - - def __and__(self: Self, other: DuckDBSelector | Any) -> DuckDBSelector | Any: - if isinstance(other, DuckDBSelector): - - def call(df: DuckDBLazyFrame) -> list[duckdb.Expression]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - return [x for x, name in zip(lhs, lhs_names) if name in rhs_names] - - def evaluate_output_names(df: DuckDBLazyFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [x for x in lhs_names if x in rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() & other - - def __invert__(self: Self) -> DuckDBSelector: - return DuckDBSelectorNamespace(self).all() - self - - -def selector( - context: _LimitedContext, - call: Callable[[DuckDBLazyFrame], Sequence[duckdb.Expression]], - evaluate_output_names: Callable[[DuckDBLazyFrame], Sequence[str]], - /, -) -> DuckDBSelector: - return DuckDBSelector( - call, - function_name="selector", - evaluate_output_names=evaluate_output_names, - alias_output_names=None, - backend_version=context._backend_version, - version=context._version, - ) From d785eb395abfd26e4360f575285d89bd9fc46371 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 23:44:45 +0000 Subject: [PATCH 21/55] fix: guard against back-compat `duckdb` https://github.com/narwhals-dev/narwhals/actions/runs/13467042968/job/37634856612?pr=2064 --- narwhals/_selectors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 37429bdc70..47b71cec3b 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -114,8 +114,8 @@ def matches(self: Self, pattern: str) -> CompliantSelector[FrameT, SeriesT]: p = re.compile(pattern) def series(df: FrameT) -> Sequence[SeriesT]: - # NOTE: Possibly cheaper than lazyframe? - if is_compliant_dataframe(df): + # NOTE: https://github.com/narwhals-dev/narwhals/actions/runs/13467042968/job/37634856612?pr=2064 + if is_compliant_dataframe(df) and not self._implementation.is_duckdb(): return [df.get_column(col) for col in df.columns if p.search(col)] return [ser for ser, name in self._iter_columns_names(df) if p.search(name)] From 90229c528350948f8d6d3d11d202b0803a23b1e1 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 23:46:35 +0000 Subject: [PATCH 22/55] feat: reimplement `_spark_like.selectors` --- narwhals/_spark_like/selectors.py | 214 +++++------------------------- 1 file changed, 31 insertions(+), 183 deletions(-) diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py index 95ad0407d1..1115dee13c 100644 --- a/narwhals/_spark_like/selectors.py +++ b/narwhals/_spark_like/selectors.py @@ -1,129 +1,55 @@ from __future__ import annotations -import re from typing import TYPE_CHECKING -from typing import Any -from typing import Callable -from typing import Iterable -from typing import Sequence +from typing import Iterator +from narwhals._selectors import CompliantSelector +from narwhals._selectors import LazySelectorNamespace from narwhals._spark_like.expr import SparkLikeExpr -from narwhals.utils import _parse_time_unit_and_time_zone -from narwhals.utils import dtype_matches_time_unit_and_time_zone -from narwhals.utils import get_column_names -from narwhals.utils import import_dtypes_module if TYPE_CHECKING: - from datetime import timezone - from pyspark.sql import Column from typing_extensions import Self + from narwhals._selectors import EvalNames + from narwhals._selectors import EvalSeries from narwhals._spark_like.dataframe import SparkLikeLazyFrame - from narwhals.dtypes import DType - from narwhals.typing import TimeUnit from narwhals.utils import _FullContext -class SparkLikeSelectorNamespace: - def __init__(self: Self, context: _FullContext, /) -> None: - self._backend_version = context._backend_version - self._version = context._version - self._implementation = context._implementation - - def by_dtype(self: Self, dtypes: Iterable[DType | type[DType]]) -> SparkLikeSelector: - def func(df: SparkLikeLazyFrame) -> list[Column]: - return [df._F.col(col) for col in df.columns if df.schema[col] in dtypes] - - def evaluate_output_names(df: SparkLikeLazyFrame) -> Sequence[str]: - return [col for col in df.columns if df.schema[col] in dtypes] - - return selector(self, func, evaluate_output_names) - - def matches(self: Self, pattern: str) -> SparkLikeSelector: - def func(df: SparkLikeLazyFrame) -> list[Column]: - return [df._F.col(col) for col in df.columns if re.search(pattern, col)] - - def evaluate_output_names(df: SparkLikeLazyFrame) -> Sequence[str]: - return [col for col in df.columns if re.search(pattern, col)] +class SparkLikeSelectorNamespace(LazySelectorNamespace["SparkLikeLazyFrame", "Column"]): + def _iter_columns(self, df: SparkLikeLazyFrame) -> Iterator[Column]: + for col in df.columns: + yield df._F.col(col) - return selector(self, func, evaluate_output_names) - - def numeric(self: Self) -> SparkLikeSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype( - { - dtypes.Int128, - dtypes.Int64, - dtypes.Int32, - dtypes.Int16, - dtypes.Int8, - dtypes.UInt128, - dtypes.UInt64, - dtypes.UInt32, - dtypes.UInt16, - dtypes.UInt8, - dtypes.Float64, - dtypes.Float32, - }, - ) - - def categorical(self: Self) -> SparkLikeSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.Categorical}) - - def string(self: Self) -> SparkLikeSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.String}) - - def boolean(self: Self) -> SparkLikeSelector: - dtypes = import_dtypes_module(self._version) - return self.by_dtype({dtypes.Boolean}) - - def all(self: Self) -> SparkLikeSelector: - def func(df: SparkLikeLazyFrame) -> list[Column]: - return [df._F.col(col) for col in df.columns] - - return selector(self, func, get_column_names) - - def datetime( - self: Self, - time_unit: TimeUnit | Iterable[TimeUnit] | None, - time_zone: str | timezone | Iterable[str | timezone | None] | None, + def _selector( + self, + context: _FullContext, + call: EvalSeries[SparkLikeLazyFrame, Column], + evaluate_output_names: EvalNames[SparkLikeLazyFrame], + /, ) -> SparkLikeSelector: - dtypes = import_dtypes_module(version=self._version) - time_units, time_zones = _parse_time_unit_and_time_zone( - time_unit=time_unit, time_zone=time_zone + return SparkLikeSelector( + call, + function_name="selector", + evaluate_output_names=evaluate_output_names, + alias_output_names=None, + backend_version=context._backend_version, + version=context._version, + implementation=context._implementation, ) - def func(df: SparkLikeLazyFrame) -> list[Column]: - return [ - df._F.col(col) - for col in df.columns - if dtype_matches_time_unit_and_time_zone( - dtype=df.schema[col], - dtypes=dtypes, - time_units=time_units, - time_zones=time_zones, - ) - ] - - def evaluate_output_names(df: SparkLikeLazyFrame) -> Sequence[str]: - return [ - col - for col in df.columns - if dtype_matches_time_unit_and_time_zone( - dtype=df.schema[col], - dtypes=dtypes, - time_units=time_units, - time_zones=time_zones, - ) - ] + def __init__(self: Self, context: _FullContext, /) -> None: + self._backend_version = context._backend_version + self._version = context._version + self._implementation = context._implementation - return selector(self, func, evaluate_output_names) +class SparkLikeSelector(CompliantSelector["SparkLikeLazyFrame", "Column"], SparkLikeExpr): # type: ignore[misc] + @property + def selectors(self: Self) -> SparkLikeSelectorNamespace: + return SparkLikeSelectorNamespace(self) -class SparkLikeSelector(SparkLikeExpr): def __repr__(self: Self) -> str: # pragma: no cover return f"SparkLikeSelector(function_name={self._function_name})" @@ -137,81 +63,3 @@ def _to_expr(self: Self) -> SparkLikeExpr: version=self._version, implementation=self._implementation, ) - - def __sub__(self: Self, other: SparkLikeSelector | Any) -> SparkLikeSelector | Any: - if isinstance(other, SparkLikeSelector): - - def call(df: SparkLikeLazyFrame) -> list[Column]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - return [x for x, name in zip(lhs, lhs_names) if name not in rhs_names] - - def evaluate_output_names(df: SparkLikeLazyFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [x for x in lhs_names if x not in rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() - other - - def __or__(self: Self, other: SparkLikeSelector | Any) -> SparkLikeSelector | Any: - if isinstance(other, SparkLikeSelector): - - def call(df: SparkLikeLazyFrame) -> list[Column]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - rhs = other._call(df) - return [ - *(x for x, name in zip(lhs, lhs_names) if name not in rhs_names), - *rhs, - ] - - def evaluate_output_names(df: SparkLikeLazyFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [*(x for x in lhs_names if x not in rhs_names), *rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() | other - - def __and__(self: Self, other: SparkLikeSelector | Any) -> SparkLikeSelector | Any: - if isinstance(other, SparkLikeSelector): - - def call(df: SparkLikeLazyFrame) -> list[Column]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - lhs = self._call(df) - return [x for x, name in zip(lhs, lhs_names) if name in rhs_names] - - def evaluate_output_names(df: SparkLikeLazyFrame) -> list[str]: - lhs_names = self._evaluate_output_names(df) - rhs_names = other._evaluate_output_names(df) - return [x for x in lhs_names if x in rhs_names] - - return selector(self, call, evaluate_output_names) - else: - return self._to_expr() & other - - def __invert__(self: Self) -> SparkLikeSelector: - return SparkLikeSelectorNamespace(self).all() - self - - -def selector( - context: _FullContext, - call: Callable[[SparkLikeLazyFrame], Sequence[Column]], - evaluate_output_names: Callable[[SparkLikeLazyFrame], Sequence[str]], - /, -) -> SparkLikeSelector: - return SparkLikeSelector( - call, - function_name="selector", - evaluate_output_names=evaluate_output_names, - alias_output_names=None, - backend_version=context._backend_version, - version=context._version, - implementation=context._implementation, - ) From 180b9a341f06729707052c39217660aa55e3a40c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 21 Feb 2025 23:47:46 +0000 Subject: [PATCH 23/55] chore: remove duplicate import --- narwhals/_dask/selectors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index ee85158758..7a9ffaa594 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -3,7 +3,6 @@ from typing import TYPE_CHECKING from typing import Iterator -from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr from narwhals._selectors import CompliantSelector from narwhals._selectors import LazySelectorNamespace From bfed23f58f2343eeaadd7ec6036fe9178a31f257 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 22 Feb 2025 10:14:56 +0000 Subject: [PATCH 24/55] chore(typing): ignore valid `_dask.selectors` warnings - Only showing locally when `dask` is installed - Hoping it doesn't show in CI as unused - Can't fix until #2044 --- narwhals/_dask/selectors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 7a9ffaa594..4b797a6c87 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -26,7 +26,7 @@ import dask_expr as dx -class DaskSelectorNamespace(LazySelectorNamespace["DaskLazyFrame", "dx.Series"]): +class DaskSelectorNamespace(LazySelectorNamespace["DaskLazyFrame", "dx.Series"]): # pyright: ignore[reportInvalidTypeArguments] def _iter_columns(self, df: DaskLazyFrame) -> Iterator[dx.Series]: for _col, ser in df._native_frame.items(): # noqa: PERF102 yield ser @@ -34,10 +34,10 @@ def _iter_columns(self, df: DaskLazyFrame) -> Iterator[dx.Series]: def _selector( self, context: _FullContext, - call: EvalSeries[DaskLazyFrame, dx.Series], + call: EvalSeries[DaskLazyFrame, dx.Series], # pyright: ignore[reportInvalidTypeForm] evaluate_output_names: EvalNames[DaskLazyFrame], /, - ) -> CompliantSelector[DaskLazyFrame, dx.Series]: + ) -> CompliantSelector[DaskLazyFrame, dx.Series]: # pyright: ignore[reportInvalidTypeArguments] return DaskSelector( call, depth=0, From f060f220704d2e27ba8010360da92d0bea684588 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 22 Feb 2025 11:02:00 +0000 Subject: [PATCH 25/55] refactor: define `CompliantSelector.__repr__` Makes each backend simpler, and avoids typos like in `PandasSelector.__repr__` --- narwhals/_arrow/selectors.py | 3 --- narwhals/_dask/selectors.py | 3 --- narwhals/_duckdb/selectors.py | 3 --- narwhals/_pandas_like/selectors.py | 5 ----- narwhals/_selectors.py | 6 +++++- narwhals/_spark_like/selectors.py | 3 --- narwhals/utils.py | 9 +++++++++ 7 files changed, 14 insertions(+), 18 deletions(-) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index 408e753d55..7c7f98b796 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -54,9 +54,6 @@ class ArrowSelector(CompliantSelector["ArrowDataFrame", "ArrowSeries"], ArrowExp def selectors(self) -> ArrowSelectorNamespace: return ArrowSelectorNamespace(self) - def __repr__(self: Self) -> str: # pragma: no cover - return f"ArrowSelector(depth={self._depth}, function_name={self._function_name})" - def _to_expr(self: Self) -> ArrowExpr: return ArrowExpr( self._call, diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 4b797a6c87..28c1a18e93 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -59,9 +59,6 @@ class DaskSelector(CompliantSelector["DaskLazyFrame", "dx.Series"], DaskExpr): def selectors(self) -> DaskSelectorNamespace: return DaskSelectorNamespace(self) - def __repr__(self: Self) -> str: # pragma: no cover - return f"DaskSelector(depth={self._depth}, function_name={self._function_name})" - def _to_expr(self: Self) -> DaskExpr: return DaskExpr( self._call, diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py index 5b4627abac..8ff21829eb 100644 --- a/narwhals/_duckdb/selectors.py +++ b/narwhals/_duckdb/selectors.py @@ -56,9 +56,6 @@ class DuckDBSelector( # type: ignore[misc] def selectors(self) -> DuckDBSelectorNamespace: return DuckDBSelectorNamespace(self) - def __repr__(self: Self) -> str: # pragma: no cover - return f"DuckDBSelector(function_name={self._function_name})" - def _to_expr(self: Self) -> DuckDBExpr: return DuckDBExpr( self._call, diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index d764b80751..49095106ec 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -69,11 +69,6 @@ class PandasSelector( # type: ignore[misc] def selectors(self) -> PandasSelectorNamespace: return PandasSelectorNamespace(self) - def __repr__(self) -> str: # pragma: no cover - return ( - f"PandasSelector(depth={self._depth}, function_name={self._function_name}, " - ) - def _to_expr(self: Self) -> PandasLikeExpr: return PandasLikeExpr( self._call, diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 47b71cec3b..cfd71cdcaa 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -24,6 +24,7 @@ from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module from narwhals.utils import is_compliant_dataframe +from narwhals.utils import is_tracks_depth if TYPE_CHECKING: from datetime import timezone @@ -192,7 +193,6 @@ def _iter_columns_dtypes(self, df: FrameT, /) -> Iterator[tuple[SeriesT, DType]] class CompliantSelector(CompliantExpr[SeriesT], Generic[FrameT, SeriesT], Protocol): @property def selectors(self) -> CompliantSelectorNamespace[FrameT, SeriesT]: ... - def __repr__(self: Self) -> str: ... def _to_expr(self: Self) -> CompliantExpr[SeriesT]: ... def _is_selector( @@ -271,6 +271,10 @@ def names(df: FrameT) -> Sequence[str]: def __invert__(self: Self) -> CompliantSelector[FrameT, SeriesT]: return self.selectors.all() - self + def __repr__(self: Self) -> str: # pragma: no cover + s = f"depth={self._depth}, " if is_tracks_depth(self._implementation) else "" + return f"{type(self).__name__}({s}function_name={self._function_name})" + # NOTE: Should probably be a `DataFrame` method # Using `Expr` because this doesn't require `Selector` attrs/methods diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py index 1115dee13c..631419b061 100644 --- a/narwhals/_spark_like/selectors.py +++ b/narwhals/_spark_like/selectors.py @@ -50,9 +50,6 @@ class SparkLikeSelector(CompliantSelector["SparkLikeLazyFrame", "Column"], Spark def selectors(self: Self) -> SparkLikeSelectorNamespace: return SparkLikeSelectorNamespace(self) - def __repr__(self: Self) -> str: # pragma: no cover - return f"SparkLikeSelector(function_name={self._function_name})" - def _to_expr(self: Self) -> SparkLikeExpr: return SparkLikeExpr( self._call, diff --git a/narwhals/utils.py b/narwhals/utils.py index bddc9ad1cd..6b526e0860 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -10,6 +10,7 @@ from typing import TYPE_CHECKING from typing import Any from typing import Iterable +from typing import Literal from typing import Sequence from typing import TypeVar from typing import Union @@ -45,6 +46,7 @@ import pandas as pd from typing_extensions import Self + from typing_extensions import TypeAlias from typing_extensions import TypeIs from narwhals.dataframe import DataFrame @@ -72,6 +74,8 @@ _T2 = TypeVar("_T2") _T3 = TypeVar("_T3") + _TracksDepth: TypeAlias = "Literal[Implementation.DASK,Implementation.CUDF,Implementation.MODIN,Implementation.PANDAS,Implementation.PYSPARK]" + class _SupportsVersion(Protocol): __version__: str @@ -1339,3 +1343,8 @@ def has_native_namespace(obj: Any) -> TypeIs[SupportsNativeNamespace]: def _supports_dataframe_interchange(obj: Any) -> TypeIs[DataFrameLike]: return hasattr(obj, "__dataframe__") + + +def is_tracks_depth(obj: Implementation, /) -> TypeIs[_TracksDepth]: # pragma: no cover + """Return `True` for implementations that utilize `CompliantExpr._depth`.""" + return obj.is_pandas_like() or obj in {Implementation.PYARROW, Implementation.DASK} From 3c5ea87d268dbb03633232e289e61c4f4c837543 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 22 Feb 2025 11:09:58 +0000 Subject: [PATCH 26/55] refactor(typing): always use concrete class in `_selector() -> ...` - `SparkLikeSelector` already has it - No need to keep these generic, after specializing --- narwhals/_arrow/selectors.py | 2 +- narwhals/_dask/selectors.py | 2 +- narwhals/_duckdb/selectors.py | 2 +- narwhals/_pandas_like/selectors.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index 7c7f98b796..ad5f74dfeb 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -32,7 +32,7 @@ def _selector( call: EvalSeries[ArrowDataFrame, ArrowSeries], evaluate_output_names: EvalNames[ArrowDataFrame], /, - ) -> CompliantSelector[ArrowDataFrame, ArrowSeries]: + ) -> ArrowSelector: return ArrowSelector( call, depth=0, diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 28c1a18e93..08fc62ea81 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -37,7 +37,7 @@ def _selector( call: EvalSeries[DaskLazyFrame, dx.Series], # pyright: ignore[reportInvalidTypeForm] evaluate_output_names: EvalNames[DaskLazyFrame], /, - ) -> CompliantSelector[DaskLazyFrame, dx.Series]: # pyright: ignore[reportInvalidTypeArguments] + ) -> DaskSelector: return DaskSelector( call, depth=0, diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py index 8ff21829eb..1eef4a2a17 100644 --- a/narwhals/_duckdb/selectors.py +++ b/narwhals/_duckdb/selectors.py @@ -32,7 +32,7 @@ def _selector( call: EvalSeries[DuckDBLazyFrame, duckdb.Expression], # type: ignore[type-var] evaluate_output_names: EvalNames[DuckDBLazyFrame], /, - ) -> CompliantSelector[DuckDBLazyFrame, duckdb.Expression]: # type: ignore[type-var] + ) -> DuckDBSelector: return DuckDBSelector( call, function_name="selector", diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 49095106ec..407b33a6ae 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -44,7 +44,7 @@ def _selector( call: EvalSeries[PandasLikeDataFrame, PandasLikeSeries], evaluate_output_names: EvalNames[PandasLikeDataFrame], /, - ) -> CompliantSelector[PandasLikeDataFrame, PandasLikeSeries]: + ) -> PandasSelector: return PandasSelector( call, depth=0, From b30f020545da872aa3b753e5314482dfc965a012 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 22 Feb 2025 11:28:27 +0000 Subject: [PATCH 27/55] chore: move comments to discussion threads --- narwhals/_pandas_like/selectors.py | 3 --- narwhals/_selectors.py | 16 +--------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 407b33a6ae..311c37bfca 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -32,9 +32,6 @@ def _iter_columns(self, df: PandasLikeDataFrame) -> Iterator[PandasLikeSeries]: backend_version=df._backend_version, version=df._version, ) - # NOTE: (PERF102) is a false-positive - # .items() -> (str, pd.Series) - # .values() -> np.ndarray for _col, ser in df._native_frame.items(): # noqa: PERF102 yield series(ser) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index cfd71cdcaa..f8643bced6 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -1,7 +1,4 @@ -"""Almost entirely complete, generic `selectors` implementation. - -- Focusing on eager-only for now -""" +"""Almost entirely complete, generic `selectors` implementation.""" from __future__ import annotations @@ -42,7 +39,6 @@ from narwhals.utils import Version from narwhals.utils import _FullContext - # NOTE: Plugging the gap of this not being defined in `CompliantSeries` class CompliantSeriesWithDType(CompliantSeries, Protocol): @property def dtype(self) -> DType: ... @@ -55,15 +51,11 @@ def dtype(self) -> DType: ... EvalNames: TypeAlias = Callable[[FrameT], Sequence[str]] -# NOTE: Pretty much finished generic for eager backends class CompliantSelectorNamespace(Generic[FrameT, SeriesT], Protocol): _implementation: Implementation _backend_version: tuple[int, ...] _version: Version - # TODO @dangotbanned: push for adding to public API for `DataFrame` - # Only need internally, but it plugs so many holes that it must be useful beyond that - # https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.iter_columns.html def _iter_columns(self, df: FrameT, /) -> Iterator[SeriesT]: ... def _iter_schema(self, df: FrameT, /) -> Iterator[tuple[str, DType]]: for ser in self._iter_columns(df): @@ -84,9 +76,6 @@ def _selector( /, ) -> CompliantSelector[FrameT, SeriesT]: ... - # NOTE: `.dtype` won't return a `nw.DType` (or maybe anything) for lazy backends - # - Their `SeriesT` is a native object - # - See (https://github.com/narwhals-dev/narwhals/issues/2044) def _is_dtype( self: CompliantSelectorNamespace[FrameT, SeriesT], dtype: type[DType], / ) -> CompliantSelector[FrameT, SeriesT]: @@ -115,7 +104,6 @@ def matches(self: Self, pattern: str) -> CompliantSelector[FrameT, SeriesT]: p = re.compile(pattern) def series(df: FrameT) -> Sequence[SeriesT]: - # NOTE: https://github.com/narwhals-dev/narwhals/actions/runs/13467042968/job/37634856612?pr=2064 if is_compliant_dataframe(df) and not self._implementation.is_duckdb(): return [df.get_column(col) for col in df.columns if p.search(col)] @@ -276,8 +264,6 @@ def __repr__(self: Self) -> str: # pragma: no cover return f"{type(self).__name__}({s}function_name={self._function_name})" -# NOTE: Should probably be a `DataFrame` method -# Using `Expr` because this doesn't require `Selector` attrs/methods def _eval_lhs_rhs( df: CompliantDataFrame | CompliantLazyFrame, lhs: CompliantExpr, rhs: CompliantExpr ) -> tuple[Sequence[str], Sequence[str]]: From eaa5b1c28ac8a93003825eed78698a22e7bcebab Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 22 Feb 2025 11:30:56 +0000 Subject: [PATCH 28/55] style: order w/ unimplemented methods first --- narwhals/_selectors.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index f8643bced6..e81dc44074 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -56,7 +56,16 @@ class CompliantSelectorNamespace(Generic[FrameT, SeriesT], Protocol): _backend_version: tuple[int, ...] _version: Version + def _selector( + self, + context: _FullContext, + call: EvalSeries[FrameT, SeriesT], + evaluate_output_names: EvalNames[FrameT], + /, + ) -> CompliantSelector[FrameT, SeriesT]: ... + def _iter_columns(self, df: FrameT, /) -> Iterator[SeriesT]: ... + def _iter_schema(self, df: FrameT, /) -> Iterator[tuple[str, DType]]: for ser in self._iter_columns(df): yield ser.name, ser.dtype @@ -68,14 +77,6 @@ def _iter_columns_dtypes(self, df: FrameT, /) -> Iterator[tuple[SeriesT, DType]] def _iter_columns_names(self, df: FrameT, /) -> Iterator[tuple[SeriesT, str]]: yield from zip(self._iter_columns(df), df.columns) - def _selector( - self, - context: _FullContext, - call: EvalSeries[FrameT, SeriesT], - evaluate_output_names: EvalNames[FrameT], - /, - ) -> CompliantSelector[FrameT, SeriesT]: ... - def _is_dtype( self: CompliantSelectorNamespace[FrameT, SeriesT], dtype: type[DType], / ) -> CompliantSelector[FrameT, SeriesT]: From 92a96cd416d98553051454b0f2957ee98c25d0f0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 22 Feb 2025 11:37:07 +0000 Subject: [PATCH 29/55] ignore `DAR201` narwhals/utils.py:1349:1: DAR201 Missing "Returns" in Docstring: - return https://results.pre-commit.ci/run/github/760058710/1740223863.-EOmgrTQQ7qnlq_vh8nmLw --- narwhals/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/utils.py b/narwhals/utils.py index 6b526e0860..18cd2ce3d3 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -1346,5 +1346,5 @@ def _supports_dataframe_interchange(obj: Any) -> TypeIs[DataFrameLike]: def is_tracks_depth(obj: Implementation, /) -> TypeIs[_TracksDepth]: # pragma: no cover - """Return `True` for implementations that utilize `CompliantExpr._depth`.""" + """Return `True` for implementations that utilize `CompliantExpr._depth`.""" # flake8: noqa return obj.is_pandas_like() or obj in {Implementation.PYARROW, Implementation.DASK} From 29eb2d02a27c9c61ba75fef8e689711aaf252b76 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 22 Feb 2025 11:39:25 +0000 Subject: [PATCH 30/55] degrade the DX to satisfy pre-commit :disappointed: --- narwhals/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/utils.py b/narwhals/utils.py index 18cd2ce3d3..1124004771 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -1346,5 +1346,5 @@ def _supports_dataframe_interchange(obj: Any) -> TypeIs[DataFrameLike]: def is_tracks_depth(obj: Implementation, /) -> TypeIs[_TracksDepth]: # pragma: no cover - """Return `True` for implementations that utilize `CompliantExpr._depth`.""" # flake8: noqa + # Return `True` for implementations that utilize `CompliantExpr._depth`. return obj.is_pandas_like() or obj in {Implementation.PYARROW, Implementation.DASK} From 547d62a5bf6049fc90a259a35f46afbf2505d9ad Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 22 Feb 2025 12:52:15 +0000 Subject: [PATCH 31/55] feat: reuse & define `CompliantNamespace.selectors` Typing correctly will require (#2053), so leaving it out for now --- narwhals/_arrow/selectors.py | 4 ---- narwhals/_dask/selectors.py | 4 ---- narwhals/_duckdb/selectors.py | 4 ---- narwhals/_pandas_like/selectors.py | 4 ---- narwhals/_selectors.py | 4 +++- narwhals/_spark_like/selectors.py | 4 ---- narwhals/typing.py | 3 +++ 7 files changed, 6 insertions(+), 21 deletions(-) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index ad5f74dfeb..6772b7b365 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -50,10 +50,6 @@ def __init__(self: Self, context: _FullContext, /) -> None: class ArrowSelector(CompliantSelector["ArrowDataFrame", "ArrowSeries"], ArrowExpr): # type: ignore[misc] - @property - def selectors(self) -> ArrowSelectorNamespace: - return ArrowSelectorNamespace(self) - def _to_expr(self: Self) -> ArrowExpr: return ArrowExpr( self._call, diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 08fc62ea81..8c55b3e98d 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -55,10 +55,6 @@ def __init__(self: Self, context: _FullContext, /) -> None: class DaskSelector(CompliantSelector["DaskLazyFrame", "dx.Series"], DaskExpr): # type: ignore[misc] - @property - def selectors(self) -> DaskSelectorNamespace: - return DaskSelectorNamespace(self) - def _to_expr(self: Self) -> DaskExpr: return DaskExpr( self._call, diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py index 1eef4a2a17..732cc3c6ec 100644 --- a/narwhals/_duckdb/selectors.py +++ b/narwhals/_duckdb/selectors.py @@ -52,10 +52,6 @@ class DuckDBSelector( # type: ignore[misc] CompliantSelector["DuckDBLazyFrame", "duckdb.Expression"], # type: ignore[type-var] DuckDBExpr, ): - @property - def selectors(self) -> DuckDBSelectorNamespace: - return DuckDBSelectorNamespace(self) - def _to_expr(self: Self) -> DuckDBExpr: return DuckDBExpr( self._call, diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 311c37bfca..43e7f10258 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -62,10 +62,6 @@ def __init__(self: Self, context: _FullContext, /) -> None: class PandasSelector( # type: ignore[misc] CompliantSelector["PandasLikeDataFrame", "PandasLikeSeries"], PandasLikeExpr ): - @property - def selectors(self) -> PandasSelectorNamespace: - return PandasSelectorNamespace(self) - def _to_expr(self: Self) -> PandasLikeExpr: return PandasLikeExpr( self._call, diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index e81dc44074..53cc3b1689 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -181,7 +181,9 @@ def _iter_columns_dtypes(self, df: FrameT, /) -> Iterator[tuple[SeriesT, DType]] class CompliantSelector(CompliantExpr[SeriesT], Generic[FrameT, SeriesT], Protocol): @property - def selectors(self) -> CompliantSelectorNamespace[FrameT, SeriesT]: ... + def selectors(self) -> CompliantSelectorNamespace[FrameT, SeriesT]: + return self.__narwhals_namespace__().selectors + def _to_expr(self: Self) -> CompliantExpr[SeriesT]: ... def _is_selector( diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py index 631419b061..928a803b39 100644 --- a/narwhals/_spark_like/selectors.py +++ b/narwhals/_spark_like/selectors.py @@ -46,10 +46,6 @@ def __init__(self: Self, context: _FullContext, /) -> None: class SparkLikeSelector(CompliantSelector["SparkLikeLazyFrame", "Column"], SparkLikeExpr): # type: ignore[misc] - @property - def selectors(self: Self) -> SparkLikeSelectorNamespace: - return SparkLikeSelectorNamespace(self) - def _to_expr(self: Self) -> SparkLikeExpr: return SparkLikeExpr( self._call, diff --git a/narwhals/typing.py b/narwhals/typing.py index 3b7075cc06..f6c478f0ea 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -20,6 +20,7 @@ from narwhals import dtypes from narwhals._expression_parsing import ExprKind + from narwhals._selectors import CompliantSelectorNamespace from narwhals.dataframe import DataFrame from narwhals.dataframe import LazyFrame from narwhals.dtypes import DType @@ -129,6 +130,8 @@ def col(self, *column_names: str) -> CompliantExpr[CompliantSeriesT_co]: ... def lit( self, value: Any, dtype: DType | None ) -> CompliantExpr[CompliantSeriesT_co]: ... + @property + def selectors(self) -> CompliantSelectorNamespace[Any, Any]: ... class SupportsNativeNamespace(Protocol): From e6f2d072ffd0de5e5d1d680fcc555cf7a21fdb19 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 22 Feb 2025 13:45:27 +0000 Subject: [PATCH 32/55] refactor: use `self` not `context` Only added in #2057, but not needed now this is a method --- narwhals/_arrow/selectors.py | 5 ++--- narwhals/_dask/selectors.py | 5 ++--- narwhals/_duckdb/selectors.py | 5 ++--- narwhals/_pandas_like/selectors.py | 7 +++---- narwhals/_selectors.py | 19 +++++++++---------- narwhals/_spark_like/selectors.py | 7 +++---- 6 files changed, 21 insertions(+), 27 deletions(-) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index 6772b7b365..8cdc379b2d 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -28,7 +28,6 @@ def _iter_columns(self, df: ArrowDataFrame) -> Iterator[ArrowSeries]: def _selector( self, - context: _FullContext, call: EvalSeries[ArrowDataFrame, ArrowSeries], evaluate_output_names: EvalNames[ArrowDataFrame], /, @@ -39,8 +38,8 @@ def _selector( function_name="selector", evaluate_output_names=evaluate_output_names, alias_output_names=None, - backend_version=context._backend_version, - version=context._version, + backend_version=self._backend_version, + version=self._version, ) def __init__(self: Self, context: _FullContext, /) -> None: diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 8c55b3e98d..bd42fc76c7 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -33,7 +33,6 @@ def _iter_columns(self, df: DaskLazyFrame) -> Iterator[dx.Series]: def _selector( self, - context: _FullContext, call: EvalSeries[DaskLazyFrame, dx.Series], # pyright: ignore[reportInvalidTypeForm] evaluate_output_names: EvalNames[DaskLazyFrame], /, @@ -44,8 +43,8 @@ def _selector( function_name="selector", evaluate_output_names=evaluate_output_names, alias_output_names=None, - backend_version=context._backend_version, - version=context._version, + backend_version=self._backend_version, + version=self._version, ) def __init__(self: Self, context: _FullContext, /) -> None: diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py index 732cc3c6ec..9e99f0e78f 100644 --- a/narwhals/_duckdb/selectors.py +++ b/narwhals/_duckdb/selectors.py @@ -28,7 +28,6 @@ def _iter_columns(self, df: DuckDBLazyFrame) -> Iterator[duckdb.Expression]: def _selector( self, - context: _FullContext, call: EvalSeries[DuckDBLazyFrame, duckdb.Expression], # type: ignore[type-var] evaluate_output_names: EvalNames[DuckDBLazyFrame], /, @@ -38,8 +37,8 @@ def _selector( function_name="selector", evaluate_output_names=evaluate_output_names, alias_output_names=None, - backend_version=context._backend_version, - version=context._version, + backend_version=self._backend_version, + version=self._version, ) def __init__(self: Self, context: _FullContext, /) -> None: diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 43e7f10258..ed1947d530 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -37,7 +37,6 @@ def _iter_columns(self, df: PandasLikeDataFrame) -> Iterator[PandasLikeSeries]: def _selector( self, - context: _FullContext, call: EvalSeries[PandasLikeDataFrame, PandasLikeSeries], evaluate_output_names: EvalNames[PandasLikeDataFrame], /, @@ -48,9 +47,9 @@ def _selector( function_name="selector", evaluate_output_names=evaluate_output_names, alias_output_names=None, - implementation=context._implementation, - backend_version=context._backend_version, - version=context._version, + implementation=self._implementation, + backend_version=self._backend_version, + version=self._version, ) def __init__(self: Self, context: _FullContext, /) -> None: diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 53cc3b1689..bd6922559f 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -58,7 +58,6 @@ class CompliantSelectorNamespace(Generic[FrameT, SeriesT], Protocol): def _selector( self, - context: _FullContext, call: EvalSeries[FrameT, SeriesT], evaluate_output_names: EvalNames[FrameT], /, @@ -88,7 +87,7 @@ def series(df: FrameT) -> Sequence[SeriesT]: def names(df: FrameT) -> Sequence[str]: return [name for name, tp in self._iter_schema(df) if isinstance(tp, dtype)] - return self._selector(self, series, names) + return self._selector(series, names) def by_dtype( self: Self, dtypes: Collection[DType | type[DType]] @@ -99,7 +98,7 @@ def series(df: FrameT) -> Sequence[SeriesT]: def names(df: FrameT) -> Sequence[str]: return [name for name, tp in self._iter_schema(df) if tp in dtypes] - return self._selector(self, series, names) + return self._selector(series, names) def matches(self: Self, pattern: str) -> CompliantSelector[FrameT, SeriesT]: p = re.compile(pattern) @@ -113,7 +112,7 @@ def series(df: FrameT) -> Sequence[SeriesT]: def names(df: FrameT) -> Sequence[str]: return [col for col in df.columns if p.search(col)] - return self._selector(self, series, names) + return self._selector(series, names) def numeric(self: Self) -> CompliantSelector[FrameT, SeriesT]: def series(df: FrameT) -> Sequence[SeriesT]: @@ -122,7 +121,7 @@ def series(df: FrameT) -> Sequence[SeriesT]: def names(df: FrameT) -> Sequence[str]: return [name for name, tp in self._iter_schema(df) if tp.is_numeric()] - return self._selector(self, series, names) + return self._selector(series, names) def categorical(self: Self) -> CompliantSelector[FrameT, SeriesT]: return self._is_dtype(import_dtypes_module(self._version).Categorical) @@ -137,7 +136,7 @@ def all(self: Self) -> CompliantSelector[FrameT, SeriesT]: def series(df: FrameT) -> Sequence[SeriesT]: return list(self._iter_columns(df)) - return self._selector(self, series, get_column_names) + return self._selector(series, get_column_names) def datetime( self: Self, @@ -158,7 +157,7 @@ def series(df: FrameT) -> Sequence[SeriesT]: def names(df: FrameT) -> Sequence[str]: return [name for name, tp in self._iter_schema(df) if matches(tp)] - return self._selector(self, series, names) + return self._selector(series, names) # NOTE: Can't reuse for `<3.11` # - https://github.com/python/cpython/issues/88970 @@ -210,7 +209,7 @@ def names(df: FrameT) -> Sequence[str]: lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) return [x for x in lhs_names if x not in rhs_names] - return self.selectors._selector(self, series, names) + return self.selectors._selector(series, names) else: return self._to_expr() - other @@ -234,7 +233,7 @@ def series(df: FrameT) -> Sequence[str]: lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) return [*(x for x in lhs_names if x not in rhs_names), *rhs_names] - return self.selectors._selector(self, names, series) + return self.selectors._selector(names, series) else: return self._to_expr() | other @@ -255,7 +254,7 @@ def names(df: FrameT) -> Sequence[str]: lhs_names, rhs_names = _eval_lhs_rhs(df, self, other) return [x for x in lhs_names if x in rhs_names] - return self.selectors._selector(self, series, names) + return self.selectors._selector(series, names) else: return self._to_expr() & other diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py index 928a803b39..5c9e77a3d0 100644 --- a/narwhals/_spark_like/selectors.py +++ b/narwhals/_spark_like/selectors.py @@ -24,7 +24,6 @@ def _iter_columns(self, df: SparkLikeLazyFrame) -> Iterator[Column]: def _selector( self, - context: _FullContext, call: EvalSeries[SparkLikeLazyFrame, Column], evaluate_output_names: EvalNames[SparkLikeLazyFrame], /, @@ -34,9 +33,9 @@ def _selector( function_name="selector", evaluate_output_names=evaluate_output_names, alias_output_names=None, - backend_version=context._backend_version, - version=context._version, - implementation=context._implementation, + backend_version=self._backend_version, + version=self._version, + implementation=self._implementation, ) def __init__(self: Self, context: _FullContext, /) -> None: From 13882a5fbf9676399ed531beb528a4f462c8aad9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 23 Feb 2025 11:25:27 +0000 Subject: [PATCH 33/55] refactor: remove `CompliantSeriesWithDType` Resolves (https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1966578116) --- narwhals/_selectors.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index bd6922559f..c52f35efe5 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -39,12 +39,8 @@ from narwhals.utils import Version from narwhals.utils import _FullContext - class CompliantSeriesWithDType(CompliantSeries, Protocol): - @property - def dtype(self) -> DType: ... - -SeriesT = TypeVar("SeriesT", bound="CompliantSeriesWithDType") +SeriesT = TypeVar("SeriesT", bound="CompliantSeries") FrameT = TypeVar("FrameT", bound="CompliantDataFrame | CompliantLazyFrame") SelectorOrExpr: TypeAlias = "CompliantSelector[FrameT, SeriesT] | CompliantExpr[SeriesT]" EvalSeries: TypeAlias = Callable[[FrameT], Sequence[SeriesT]] From 7f65b4b00055a4d7ab07ed391e8a0c40457c65b0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 23 Feb 2025 12:04:40 +0000 Subject: [PATCH 34/55] fix(typing): resolve some variance issues `pyright` wanted *contra* because `CompliantExpr` was missing the annotation for `__call__` --- narwhals/_arrow/expr.py | 2 +- narwhals/_arrow/namespace.py | 2 +- narwhals/_dask/expr.py | 2 +- narwhals/_dask/namespace.py | 2 +- narwhals/_duckdb/expr.py | 2 +- narwhals/_duckdb/namespace.py | 2 +- narwhals/_expression_parsing.py | 22 ++++++++++------------ narwhals/_pandas_like/expr.py | 2 +- narwhals/_pandas_like/namespace.py | 2 +- narwhals/_selectors.py | 24 +++++++++++++++++------- narwhals/_spark_like/expr.py | 2 +- narwhals/_spark_like/namespace.py | 2 +- narwhals/typing.py | 25 ++++++++++--------------- narwhals/utils.py | 6 +++--- 14 files changed, 50 insertions(+), 47 deletions(-) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 8ae5833d9b..1509722fef 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -49,7 +49,7 @@ def __init__( self._depth = depth self._function_name = function_name self._depth = depth - self._evaluate_output_names = evaluate_output_names # pyright: ignore[reportAttributeAccessIssue] + self._evaluate_output_names = evaluate_output_names self._alias_output_names = alias_output_names self._backend_version = backend_version self._version = version diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index ecbec58fa3..ed1b0c383a 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -473,7 +473,7 @@ def __init__( self._call = call self._depth = depth self._function_name = function_name - self._evaluate_output_names = evaluate_output_names # pyright: ignore[reportAttributeAccessIssue] + self._evaluate_output_names = evaluate_output_names self._alias_output_names = alias_output_names self._call_kwargs = call_kwargs or {} diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index 5d765a3a9d..f0c6ce5250 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -55,7 +55,7 @@ def __init__( self._call = call self._depth = depth self._function_name = function_name - self._evaluate_output_names = evaluate_output_names # pyright: ignore[reportAttributeAccessIssue] + self._evaluate_output_names = evaluate_output_names self._alias_output_names = alias_output_names self._backend_version = backend_version self._version = version diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 9453d15b9d..b6a179ea7c 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -412,7 +412,7 @@ def __init__( self._call = call self._depth = depth self._function_name = function_name - self._evaluate_output_names = evaluate_output_names # pyright: ignore[reportAttributeAccessIssue] + self._evaluate_output_names = evaluate_output_names self._alias_output_names = alias_output_names self._call_kwargs = call_kwargs or {} diff --git a/narwhals/_duckdb/expr.py b/narwhals/_duckdb/expr.py index 45f614489a..b098b7be65 100644 --- a/narwhals/_duckdb/expr.py +++ b/narwhals/_duckdb/expr.py @@ -49,7 +49,7 @@ def __init__( ) -> None: self._call = call self._function_name = function_name - self._evaluate_output_names = evaluate_output_names # pyright: ignore[reportAttributeAccessIssue] + self._evaluate_output_names = evaluate_output_names self._alias_output_names = alias_output_names self._backend_version = backend_version self._version = version diff --git a/narwhals/_duckdb/namespace.py b/narwhals/_duckdb/namespace.py index 39e39242ca..11fbd89384 100644 --- a/narwhals/_duckdb/namespace.py +++ b/narwhals/_duckdb/namespace.py @@ -336,7 +336,7 @@ def __init__( self._version = version self._call = call self._function_name = function_name - self._evaluate_output_names = evaluate_output_names # pyright: ignore[reportAttributeAccessIssue] + self._evaluate_output_names = evaluate_output_names self._alias_output_names = alias_output_names def otherwise(self: Self, value: DuckDBExpr | Any) -> DuckDBExpr: diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index c3862b3a0f..32ab404dfb 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -29,7 +29,7 @@ from narwhals.expr import Expr from narwhals.typing import CompliantDataFrame from narwhals.typing import CompliantExpr - from narwhals.typing import CompliantFrameT_contra + from narwhals.typing import CompliantFrameT from narwhals.typing import CompliantLazyFrame from narwhals.typing import CompliantNamespace from narwhals.typing import CompliantSeries @@ -51,8 +51,7 @@ def is_expr(obj: Any) -> TypeIs[Expr]: def evaluate_into_expr( - df: CompliantFrameT_contra, - expr: CompliantExpr[CompliantFrameT_contra, CompliantSeriesT_co], + df: CompliantFrameT, expr: CompliantExpr[CompliantFrameT, CompliantSeriesT_co] ) -> Sequence[CompliantSeriesT_co]: """Return list of raw columns. @@ -72,9 +71,9 @@ def evaluate_into_expr( def evaluate_into_exprs( - df: CompliantFrameT_contra, + df: CompliantFrameT, /, - *exprs: CompliantExpr[CompliantFrameT_contra, CompliantSeriesT_co], + *exprs: CompliantExpr[CompliantFrameT, CompliantSeriesT_co], ) -> list[CompliantSeriesT_co]: """Evaluate each expr into Series.""" return [ @@ -86,8 +85,7 @@ def evaluate_into_exprs( @overload def maybe_evaluate_expr( - df: CompliantFrameT_contra, - expr: CompliantExpr[CompliantFrameT_contra, CompliantSeriesT_co], + df: CompliantFrameT, expr: CompliantExpr[CompliantFrameT, CompliantSeriesT_co] ) -> CompliantSeriesT_co: ... @@ -257,15 +255,15 @@ def is_simple_aggregation(expr: CompliantExpr[Any, Any]) -> bool: def combine_evaluate_output_names( - *exprs: CompliantExpr[CompliantFrameT_contra, Any], -) -> Callable[[CompliantFrameT_contra], Sequence[str]]: + *exprs: CompliantExpr[CompliantFrameT, Any], +) -> Callable[[CompliantFrameT], Sequence[str]]: # Follow left-hand-rule for naming. E.g. `nw.sum_horizontal(expr1, expr2)` takes the # first name of `expr1`. if not is_compliant_expr(exprs[0]): # pragma: no cover msg = f"Safety assertion failed, expected expression, got: {type(exprs[0])}. Please report a bug." raise AssertionError(msg) - def evaluate_output_names(df: CompliantFrameT_contra) -> Sequence[str]: + def evaluate_output_names(df: CompliantFrameT) -> Sequence[str]: return exprs[0]._evaluate_output_names(df)[:1] return evaluate_output_names @@ -286,11 +284,11 @@ def alias_output_names(names: Sequence[str]) -> Sequence[str]: def extract_compliant( - plx: CompliantNamespace[CompliantFrameT_contra, CompliantSeriesT_co], + plx: CompliantNamespace[CompliantFrameT, CompliantSeriesT_co], other: Any, *, str_as_lit: bool, -) -> CompliantExpr[CompliantFrameT_contra, CompliantSeriesT_co] | object: +) -> CompliantExpr[CompliantFrameT, CompliantSeriesT_co] | object: if is_expr(other): return other._to_compliant_expr(plx) if isinstance(other, str) and not str_as_lit: diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 20077af8a2..3cd875e8d6 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -63,7 +63,7 @@ def __init__( self._call = call self._depth = depth self._function_name = function_name - self._evaluate_output_names = evaluate_output_names # pyright: ignore[reportAttributeAccessIssue] + self._evaluate_output_names = evaluate_output_names self._alias_output_names = alias_output_names self._implementation = implementation self._backend_version = backend_version diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 3aa74fdd8f..206d923969 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -496,7 +496,7 @@ def __init__( self._call = call self._depth = depth self._function_name = function_name - self._evaluate_output_names = evaluate_output_names # pyright: ignore[reportAttributeAccessIssue] + self._evaluate_output_names = evaluate_output_names self._alias_output_names = alias_output_names self._call_kwargs = call_kwargs or {} diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index c52f35efe5..68342bf95d 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -42,7 +42,9 @@ SeriesT = TypeVar("SeriesT", bound="CompliantSeries") FrameT = TypeVar("FrameT", bound="CompliantDataFrame | CompliantLazyFrame") -SelectorOrExpr: TypeAlias = "CompliantSelector[FrameT, SeriesT] | CompliantExpr[SeriesT]" +SelectorOrExpr: TypeAlias = ( + "CompliantSelector[FrameT, SeriesT] | CompliantExpr[FrameT, SeriesT]" +) EvalSeries: TypeAlias = Callable[[FrameT], Sequence[SeriesT]] EvalNames: TypeAlias = Callable[[FrameT], Sequence[str]] @@ -174,22 +176,26 @@ def _iter_columns_dtypes(self, df: FrameT, /) -> Iterator[tuple[SeriesT, DType]] yield from zip(self._iter_columns(df), df.schema.values()) -class CompliantSelector(CompliantExpr[SeriesT], Generic[FrameT, SeriesT], Protocol): +class CompliantSelector( + CompliantExpr[FrameT, SeriesT], Generic[FrameT, SeriesT], Protocol +): @property def selectors(self) -> CompliantSelectorNamespace[FrameT, SeriesT]: return self.__narwhals_namespace__().selectors - def _to_expr(self: Self) -> CompliantExpr[SeriesT]: ... + def _to_expr(self: Self) -> CompliantExpr[FrameT, SeriesT]: ... def _is_selector( - self: Self, other: Self | CompliantExpr[SeriesT] + self: Self, other: Self | CompliantExpr[FrameT, SeriesT] ) -> TypeIs[CompliantSelector[FrameT, SeriesT]]: return isinstance(other, type(self)) @overload def __sub__(self: Self, other: Self) -> Self: ... @overload - def __sub__(self: Self, other: CompliantExpr[SeriesT]) -> CompliantExpr[SeriesT]: ... + def __sub__( + self: Self, other: CompliantExpr[FrameT, SeriesT] + ) -> CompliantExpr[FrameT, SeriesT]: ... def __sub__( self: Self, other: SelectorOrExpr[FrameT, SeriesT] ) -> SelectorOrExpr[FrameT, SeriesT]: @@ -212,7 +218,9 @@ def names(df: FrameT) -> Sequence[str]: @overload def __or__(self: Self, other: Self) -> Self: ... @overload - def __or__(self: Self, other: CompliantExpr[SeriesT]) -> CompliantExpr[SeriesT]: ... + def __or__( + self: Self, other: CompliantExpr[FrameT, SeriesT] + ) -> CompliantExpr[FrameT, SeriesT]: ... def __or__( self: Self, other: SelectorOrExpr[FrameT, SeriesT] ) -> SelectorOrExpr[FrameT, SeriesT]: @@ -236,7 +244,9 @@ def series(df: FrameT) -> Sequence[str]: @overload def __and__(self: Self, other: Self) -> Self: ... @overload - def __and__(self: Self, other: CompliantExpr[SeriesT]) -> CompliantExpr[SeriesT]: ... + def __and__( + self: Self, other: CompliantExpr[FrameT, SeriesT] + ) -> CompliantExpr[FrameT, SeriesT]: ... def __and__( self: Self, other: SelectorOrExpr[FrameT, SeriesT] ) -> SelectorOrExpr[FrameT, SeriesT]: diff --git a/narwhals/_spark_like/expr.py b/narwhals/_spark_like/expr.py index 599b96059a..57daaf517d 100644 --- a/narwhals/_spark_like/expr.py +++ b/narwhals/_spark_like/expr.py @@ -43,7 +43,7 @@ def __init__( ) -> None: self._call = call self._function_name = function_name - self._evaluate_output_names = evaluate_output_names # pyright: ignore[reportAttributeAccessIssue] + self._evaluate_output_names = evaluate_output_names self._alias_output_names = alias_output_names self._backend_version = backend_version self._version = version diff --git a/narwhals/_spark_like/namespace.py b/narwhals/_spark_like/namespace.py index 7a21701599..74d0841431 100644 --- a/narwhals/_spark_like/namespace.py +++ b/narwhals/_spark_like/namespace.py @@ -380,7 +380,7 @@ def __init__( self._version = version self._call = call self._function_name = function_name - self._evaluate_output_names = evaluate_output_names # pyright: ignore[reportAttributeAccessIssue] + self._evaluate_output_names = evaluate_output_names self._alias_output_names = alias_output_names self._implementation = implementation diff --git a/narwhals/typing.py b/narwhals/typing.py index aa9243d479..9c4cb8bea4 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -92,30 +92,25 @@ def columns(self) -> Sequence[str]: ... def schema(self) -> Mapping[str, DType]: ... -CompliantFrameT_contra = TypeVar( - "CompliantFrameT_contra", - bound="CompliantDataFrame | CompliantLazyFrame", - contravariant=True, -) -CompliantSeriesT_co = TypeVar( - "CompliantSeriesT_co", bound=CompliantSeries, covariant=True +CompliantFrameT = TypeVar( + "CompliantFrameT", bound="CompliantDataFrame | CompliantLazyFrame" ) -class CompliantExpr(Protocol, Generic[CompliantFrameT_contra, CompliantSeriesT_co]): +class CompliantExpr(Protocol, Generic[CompliantFrameT, CompliantSeriesT_co]): _implementation: Implementation _backend_version: tuple[int, ...] _version: Version - _evaluate_output_names: Callable[[CompliantFrameT_contra], Sequence[str]] + _evaluate_output_names: Callable[[CompliantFrameT], Sequence[str]] _alias_output_names: Callable[[Sequence[str]], Sequence[str]] | None _depth: int _function_name: str - def __call__(self, df: Any) -> Sequence[CompliantSeriesT_co]: ... + def __call__(self, df: CompliantFrameT) -> Sequence[CompliantSeriesT_co]: ... def __narwhals_expr__(self) -> None: ... def __narwhals_namespace__( self, - ) -> CompliantNamespace[CompliantFrameT_contra, CompliantSeriesT_co]: ... + ) -> CompliantNamespace[CompliantFrameT, CompliantSeriesT_co]: ... def is_null(self) -> Self: ... def alias(self, name: str) -> Self: ... def cast(self, dtype: DType) -> Self: ... @@ -137,13 +132,13 @@ def broadcast( ) -> Self: ... -class CompliantNamespace(Protocol, Generic[CompliantFrameT_contra, CompliantSeriesT_co]): +class CompliantNamespace(Protocol, Generic[CompliantFrameT, CompliantSeriesT_co]): def col( self, *column_names: str - ) -> CompliantExpr[CompliantFrameT_contra, CompliantSeriesT_co]: ... + ) -> CompliantExpr[CompliantFrameT, CompliantSeriesT_co]: ... def lit( self, value: Any, dtype: DType | None - ) -> CompliantExpr[CompliantFrameT_contra, CompliantSeriesT_co]: ... + ) -> CompliantExpr[CompliantFrameT, CompliantSeriesT_co]: ... @property def selectors(self) -> CompliantSelectorNamespace[Any, Any]: ... @@ -345,7 +340,7 @@ class DTypes: # This one needs to be in TYPE_CHECKING to pass on 3.9, # and can only be defined after CompliantExpr has been defined IntoCompliantExpr: TypeAlias = ( - CompliantExpr[CompliantFrameT_contra, CompliantSeriesT_co] | CompliantSeriesT_co + CompliantExpr[CompliantFrameT, CompliantSeriesT_co] | CompliantSeriesT_co ) diff --git a/narwhals/utils.py b/narwhals/utils.py index 6338fa3ebc..e4d23f5b8a 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -55,7 +55,7 @@ from narwhals.series import Series from narwhals.typing import CompliantDataFrame from narwhals.typing import CompliantExpr - from narwhals.typing import CompliantFrameT_contra + from narwhals.typing import CompliantFrameT from narwhals.typing import CompliantLazyFrame from narwhals.typing import CompliantSeries from narwhals.typing import CompliantSeriesT_co @@ -1333,8 +1333,8 @@ def is_compliant_series(obj: Any) -> TypeIs[CompliantSeries]: def is_compliant_expr( - obj: CompliantExpr[CompliantFrameT_contra, CompliantSeriesT_co] | Any, -) -> TypeIs[CompliantExpr[CompliantFrameT_contra, CompliantSeriesT_co]]: + obj: CompliantExpr[CompliantFrameT, CompliantSeriesT_co] | Any, +) -> TypeIs[CompliantExpr[CompliantFrameT, CompliantSeriesT_co]]: return hasattr(obj, "__narwhals_expr__") From c11dc9566c5097a3be4675a36565128aef0986f3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 23 Feb 2025 13:17:20 +0000 Subject: [PATCH 35/55] fix(typing): resolve more `mypy` errors https://github.com/narwhals-dev/narwhals/actions/runs/13483136123/job/37670892891?pr=2064 --- narwhals/_arrow/dataframe.py | 22 ++++++++++++---------- narwhals/_pandas_like/dataframe.py | 18 +++++++++++------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 1d7e3d1e53..98ba71810d 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -343,24 +343,24 @@ def simple_select(self, *column_names: str) -> Self: self._native_frame.select(list(column_names)), validate_column_names=False ) - def aggregate(self: Self, *exprs: ArrowExpr) -> Self: + def aggregate(self: ArrowDataFrame, *exprs: ArrowExpr) -> ArrowDataFrame: return self.select(*exprs) - def select(self: Self, *exprs: ArrowExpr) -> Self: - new_series: Sequence[ArrowSeries] = evaluate_into_exprs(self, *exprs) + def select(self: ArrowDataFrame, *exprs: ArrowExpr) -> ArrowDataFrame: + new_series = evaluate_into_exprs(self, *exprs) if not new_series: # return empty dataframe, like Polars does return self._from_native_frame( self._native_frame.__class__.from_arrays([]), validate_column_names=False ) names = [s.name for s in new_series] - new_series = align_series_full_broadcast(*new_series) - df = pa.Table.from_arrays([s._native_series for s in new_series], names=names) + reshaped = align_series_full_broadcast(*new_series) + df = pa.Table.from_arrays([s._native_series for s in reshaped], names=names) return self._from_native_frame(df, validate_column_names=False) - def with_columns(self: Self, *exprs: ArrowExpr) -> Self: + def with_columns(self: ArrowDataFrame, *exprs: ArrowExpr) -> ArrowDataFrame: native_frame = self._native_frame - new_columns: list[ArrowSeries] = evaluate_into_exprs(self, *exprs) + new_columns = evaluate_into_exprs(self, *exprs) length = len(self) columns = self.columns @@ -458,7 +458,7 @@ def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 self._native_frame.drop(to_drop), validate_column_names=False ) - def drop_nulls(self: Self, subset: list[str] | None) -> Self: + def drop_nulls(self: Self, subset: list[str] | None) -> ArrowDataFrame: if subset is None: return self._from_native_frame( self._native_frame.drop_null(), validate_column_names=False @@ -540,7 +540,9 @@ def with_row_index(self: Self, name: str) -> Self: df.append_column(name, row_indices).select([name, *cols]) ) - def filter(self: Self, predicate: ArrowExpr | list[bool | None]) -> Self: + def filter( + self: ArrowDataFrame, predicate: ArrowExpr | list[bool | None] + ) -> ArrowDataFrame: if isinstance(predicate, list): mask_native: Mask | ArrowChunkedArray = predicate else: @@ -737,7 +739,7 @@ def unique( *, keep: Literal["any", "first", "last", "none"], maintain_order: bool | None = None, - ) -> Self: + ) -> ArrowDataFrame: # The param `maintain_order` is only here for compatibility with the Polars API # and has no effect on the output. import numpy as np # ignore-banned-import diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index e235fe27ee..f29a34db7b 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -388,11 +388,11 @@ def simple_select(self: Self, *column_names: str) -> Self: validate_column_names=False, ) - def aggregate(self: Self, *exprs: PandasLikeExpr) -> Self: + def aggregate(self: Self, *exprs: PandasLikeExpr) -> PandasLikeDataFrame: return self.select(*exprs) - def select(self: Self, *exprs: PandasLikeExpr) -> Self: - new_series: list[PandasLikeSeries] = evaluate_into_exprs(self, *exprs) + def select(self: PandasLikeDataFrame, *exprs: PandasLikeExpr) -> PandasLikeDataFrame: + new_series = evaluate_into_exprs(self, *exprs) if not new_series: # return empty dataframe, like Polars does return self._from_native_frame( @@ -406,7 +406,7 @@ def select(self: Self, *exprs: PandasLikeExpr) -> Self: ) return self._from_native_frame(df, validate_column_names=False) - def drop_nulls(self: Self, subset: list[str] | None) -> Self: + def drop_nulls(self: Self, subset: list[str] | None) -> PandasLikeDataFrame: if subset is None: return self._from_native_frame( self._native_frame.dropna(axis=0), validate_column_names=False @@ -437,7 +437,9 @@ def with_row_index(self: Self, name: str) -> Self: def row(self: Self, row: int) -> tuple[Any, ...]: return tuple(x for x in self._native_frame.iloc[row]) - def filter(self: Self, predicate: PandasLikeExpr | list[bool]) -> Self: + def filter( + self: PandasLikeDataFrame, predicate: PandasLikeExpr | list[bool] + ) -> PandasLikeDataFrame: if isinstance(predicate, list): mask_native: pd.Series[Any] | list[bool] = predicate else: @@ -449,9 +451,11 @@ def filter(self: Self, predicate: PandasLikeExpr | list[bool]) -> Self: self._native_frame.loc[mask_native], validate_column_names=False ) - def with_columns(self: Self, *exprs: PandasLikeExpr) -> Self: + def with_columns( + self: PandasLikeDataFrame, *exprs: PandasLikeExpr + ) -> PandasLikeDataFrame: index = self._native_frame.index - new_columns: list[PandasLikeSeries] = evaluate_into_exprs(self, *exprs) + new_columns = evaluate_into_exprs(self, *exprs) if not new_columns and len(self) == 0: return self From ef85eda6599d0a0fce261c35416983756fa456e5 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 23 Feb 2025 13:40:09 +0000 Subject: [PATCH 36/55] fix(typing): resolve `when()` related issues These were untyped before, but now `Expr.__call__` has the right typevar https://github.com/narwhals-dev/narwhals/actions/runs/13483136123/job/37670892891?pr=2064 --- narwhals/_arrow/namespace.py | 15 +++++++-------- narwhals/_dask/namespace.py | 19 ++++++++----------- narwhals/_pandas_like/namespace.py | 15 +++++++-------- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/narwhals/_arrow/namespace.py b/narwhals/_arrow/namespace.py index ed1b0c383a..c15913f347 100644 --- a/narwhals/_arrow/namespace.py +++ b/narwhals/_arrow/namespace.py @@ -28,7 +28,6 @@ from narwhals.utils import Implementation from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module -from narwhals.utils import is_compliant_expr if TYPE_CHECKING: from typing import Callable @@ -392,9 +391,9 @@ def __init__( version: Version, ) -> None: self._backend_version = backend_version - self._condition = condition - self._then_value = then_value - self._otherwise_value = otherwise_value + self._condition: ArrowExpr = condition + self._then_value: ArrowExpr | Any = then_value + self._otherwise_value: ArrowExpr | Any = otherwise_value self._version = version def __call__(self: Self, df: ArrowDataFrame) -> Sequence[ArrowSeries]: @@ -402,8 +401,8 @@ def __call__(self: Self, df: ArrowDataFrame) -> Sequence[ArrowSeries]: condition = self._condition(df)[0] condition_native = condition._native_series - if is_compliant_expr(self._then_value): - value_series: ArrowSeries = self._then_value(df)[0] + if isinstance(self._then_value, ArrowExpr): + value_series = self._then_value(df)[0] else: # `self._then_value` is a scalar value_series = plx._create_series_from_scalar( @@ -421,8 +420,8 @@ def __call__(self: Self, df: ArrowDataFrame) -> Sequence[ArrowSeries]: pc.if_else(condition_native, value_series_native, otherwise_null) ) ] - if is_compliant_expr(self._otherwise_value): - otherwise_series: ArrowSeries = self._otherwise_value(df)[0] + if isinstance(self._otherwise_value, ArrowExpr): + otherwise_series = self._otherwise_value(df)[0] else: # `self._otherwise_value` is a scalar otherwise_series = plx._create_series_from_scalar( diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index b6a179ea7c..5cd828adfe 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -8,7 +8,6 @@ from typing import Iterable from typing import Literal from typing import Sequence -from typing import cast import dask.dataframe as dd import pandas as pd @@ -26,7 +25,6 @@ from narwhals.typing import CompliantNamespace from narwhals.utils import Implementation from narwhals.utils import get_column_names -from narwhals.utils import is_compliant_expr if TYPE_CHECKING: from typing_extensions import Self @@ -351,17 +349,16 @@ def __init__( version: Version, ) -> None: self._backend_version = backend_version - self._condition = condition - self._then_value = then_value - self._otherwise_value = otherwise_value + self._condition: DaskExpr = condition + self._then_value: DaskExpr | Any = then_value + self._otherwise_value: DaskExpr | Any = otherwise_value self._version = version def __call__(self: Self, df: DaskLazyFrame) -> Sequence[dx.Series]: condition = self._condition(df)[0] - condition = cast("dx.Series", condition) - if is_compliant_expr(self._then_value): - then_value: dx.Series | object = self._then_value(df)[0] + if isinstance(self._then_value, DaskExpr): + then_value = self._then_value(df)[0] else: then_value = self._then_value (then_series,) = align_series_full_broadcast(df, then_value) @@ -370,13 +367,13 @@ def __call__(self: Self, df: DaskLazyFrame) -> Sequence[dx.Series]: if self._otherwise_value is None: return [then_series.where(condition)] - if is_compliant_expr(self._otherwise_value): - otherwise_value: dx.Series | object = self._otherwise_value(df)[0] + if isinstance(self._otherwise_value, DaskExpr): + otherwise_value = self._otherwise_value(df)[0] else: otherwise_value = self._otherwise_value (otherwise_series,) = align_series_full_broadcast(df, otherwise_value) validate_comparand(condition, otherwise_series) - return [then_series.where(condition, otherwise_series)] + return [then_series.where(condition, otherwise_series)] # pyright: ignore[reportArgumentType] def then(self: Self, value: DaskExpr | Any) -> DaskThen: self._then_value = value diff --git a/narwhals/_pandas_like/namespace.py b/narwhals/_pandas_like/namespace.py index 206d923969..21ae784a66 100644 --- a/narwhals/_pandas_like/namespace.py +++ b/narwhals/_pandas_like/namespace.py @@ -24,7 +24,6 @@ from narwhals.typing import CompliantNamespace from narwhals.utils import get_column_names from narwhals.utils import import_dtypes_module -from narwhals.utils import is_compliant_expr if TYPE_CHECKING: from typing_extensions import Self @@ -413,9 +412,9 @@ def __init__( ) -> None: self._implementation = implementation self._backend_version = backend_version - self._condition = condition - self._then_value = then_value - self._otherwise_value = otherwise_value + self._condition: PandasLikeExpr = condition + self._then_value: PandasLikeExpr | Any = then_value + self._otherwise_value: PandasLikeExpr | Any = otherwise_value self._version = version def __call__(self: Self, df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: @@ -423,8 +422,8 @@ def __call__(self: Self, df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: condition = self._condition(df)[0] condition_native = condition._native_series - if is_compliant_expr(self._then_value): - value_series: PandasLikeSeries = self._then_value(df)[0] + if isinstance(self._then_value, PandasLikeExpr): + value_series = self._then_value(df)[0] else: # `self._then_value` is a scalar value_series = plx._create_series_from_scalar( @@ -442,8 +441,8 @@ def __call__(self: Self, df: PandasLikeDataFrame) -> Sequence[PandasLikeSeries]: ) ] - if is_compliant_expr(self._otherwise_value): - otherwise_series: PandasLikeSeries = self._otherwise_value(df)[0] + if isinstance(self._otherwise_value, PandasLikeExpr): + otherwise_series = self._otherwise_value(df)[0] else: # `self._then_value` is a scalar otherwise_series = plx._create_series_from_scalar( From 0301a4b9c9f902b558e9b6124c1647530a1174e5 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 23 Feb 2025 13:46:05 +0000 Subject: [PATCH 37/55] chore(typing): Ignore all `_dask` warnings **SPLIT INTO A PR** - Only doing this here to reduce the noise while I work out the rest --- narwhals/_dask/expr.py | 8 ++++---- narwhals/_dask/expr_dt.py | 6 +++--- narwhals/_dask/group_by.py | 2 +- narwhals/_dask/namespace.py | 2 +- narwhals/_dask/utils.py | 6 +++--- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index f0c6ce5250..b19272d56e 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -35,7 +35,7 @@ from narwhals.utils import Version -class DaskExpr(CompliantExpr["DaskLazyFrame", "dx.Series"]): +class DaskExpr(CompliantExpr["DaskLazyFrame", "dx.Series"]): # pyright: ignore[reportInvalidTypeArguments] (#2044) _implementation: Implementation = Implementation.DASK def __init__( @@ -454,7 +454,7 @@ def func(_input: dx.Series) -> dx.Series: _input.dtype, self._version, self._implementation ) if dtype.is_numeric(): - return _input != _input # noqa: PLR0124 + return _input != _input # pyright: ignore[reportReturnType] # noqa: PLR0124 msg = f"`.is_nan` only supported for numeric dtypes and not {dtype}, did you mean `.is_null`?" raise InvalidOperationError(msg) @@ -487,7 +487,7 @@ def is_first_distinct(self: Self) -> Self: def func(_input: dx.Series) -> dx.Series: _name = _input.name col_token = generate_temporary_column_name(n_bytes=8, columns=[_name]) - _input = add_row_index( + _input = add_row_index( # pyright: ignore[reportAssignmentType] _input.to_frame(), col_token, backend_version=self._backend_version, @@ -504,7 +504,7 @@ def is_last_distinct(self: Self) -> Self: def func(_input: dx.Series) -> dx.Series: _name = _input.name col_token = generate_temporary_column_name(n_bytes=8, columns=[_name]) - _input = add_row_index( + _input = add_row_index( # pyright: ignore[reportAssignmentType] _input.to_frame(), col_token, backend_version=self._backend_version, diff --git a/narwhals/_dask/expr_dt.py b/narwhals/_dask/expr_dt.py index c569d7dc5b..b7355dad4b 100644 --- a/narwhals/_dask/expr_dt.py +++ b/narwhals/_dask/expr_dt.py @@ -96,9 +96,9 @@ def func(s: dx.Series, time_zone: str) -> dx.Series: s.dtype, self._compliant_expr._version, Implementation.DASK ) if dtype.time_zone is None: # type: ignore[attr-defined] - return s.dt.tz_localize("UTC").dt.tz_convert(time_zone) + return s.dt.tz_localize("UTC").dt.tz_convert(time_zone) # pyright: ignore[reportAttributeAccessIssue] else: - return s.dt.tz_convert(time_zone) + return s.dt.tz_convert(time_zone) # pyright: ignore[reportAttributeAccessIssue] return self._compliant_expr._from_call(func, "tz_convert", time_zone=time_zone) @@ -125,7 +125,7 @@ def func(s: dx.Series, time_unit: TimeUnit) -> dx.Series: else: msg = "Input should be either of Date or Datetime type" raise TypeError(msg) - return result.where(~mask_na) + return result.where(~mask_na) # pyright: ignore[reportReturnType] return self._compliant_expr._from_call(func, "datetime", time_unit=time_unit) diff --git a/narwhals/_dask/group_by.py b/narwhals/_dask/group_by.py index 13b57796a2..7cae261407 100644 --- a/narwhals/_dask/group_by.py +++ b/narwhals/_dask/group_by.py @@ -107,7 +107,7 @@ def _from_native_frame(self: Self, df: DaskLazyFrame) -> DaskLazyFrame: def agg_dask( df: DaskLazyFrame, grouped: Any, - exprs: Sequence[CompliantExpr[DaskLazyFrame, dx.Series]], + exprs: Sequence[CompliantExpr[DaskLazyFrame, dx.Series]], # pyright: ignore[reportInvalidTypeArguments] keys: list[str], from_dataframe: Callable[[Any], DaskLazyFrame], ) -> DaskLazyFrame: diff --git a/narwhals/_dask/namespace.py b/narwhals/_dask/namespace.py index 5cd828adfe..cf46590086 100644 --- a/narwhals/_dask/namespace.py +++ b/narwhals/_dask/namespace.py @@ -38,7 +38,7 @@ import dask_expr as dx -class DaskNamespace(CompliantNamespace[DaskLazyFrame, "dx.Series"]): +class DaskNamespace(CompliantNamespace[DaskLazyFrame, "dx.Series"]): # pyright: ignore[reportInvalidTypeArguments] _implementation: Implementation = Implementation.DASK @property diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index 5c1c4c0742..a98241ecf2 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -62,7 +62,7 @@ def align_series_full_broadcast( return [ s if isinstance(s, dx.Series) else df._native_frame.assign(_tmp=s)["_tmp"] for s in series - ] + ] # pyright: ignore[reportReturnType] def add_row_index( @@ -155,8 +155,8 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> An def name_preserving_sum(s1: dx.Series, s2: dx.Series) -> dx.Series: - return (s1 + s2).rename(s1.name) + return (s1 + s2).rename(s1.name) # pyright: ignore[reportOperatorIssue] def name_preserving_div(s1: dx.Series, s2: dx.Series) -> dx.Series: - return (s1 / s2).rename(s1.name) + return (s1 / s2).rename(s1.name) # pyright: ignore[reportOperatorIssue] From 72014548b6f49b864164167c2372fa43778556ad Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 24 Feb 2025 12:24:00 +0000 Subject: [PATCH 38/55] chore(typing): resolve `mypy` strict errors Following #2077 --- narwhals/_arrow/dataframe.py | 2 +- narwhals/_expression_parsing.py | 6 +++--- narwhals/_selectors.py | 9 ++++++--- narwhals/_spark_like/dataframe.py | 2 +- narwhals/typing.py | 2 +- narwhals/utils.py | 6 ++++-- 6 files changed, 16 insertions(+), 11 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index da07078c34..d3af796227 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -618,7 +618,7 @@ def collect( self: Self, backend: Implementation | None, **kwargs: Any, - ) -> CompliantDataFrame: + ) -> CompliantDataFrame[Any]: if backend is Implementation.PYARROW or backend is None: from narwhals._arrow.dataframe import ArrowDataFrame diff --git a/narwhals/_expression_parsing.py b/narwhals/_expression_parsing.py index 89264da2d8..b6c68dbac7 100644 --- a/narwhals/_expression_parsing.py +++ b/narwhals/_expression_parsing.py @@ -90,7 +90,7 @@ def maybe_evaluate_expr( @overload -def maybe_evaluate_expr(df: CompliantDataFrame, expr: T) -> T: ... +def maybe_evaluate_expr(df: CompliantDataFrame[Any], expr: T) -> T: ... def maybe_evaluate_expr( @@ -152,7 +152,7 @@ def reuse_series_implementation( """ plx = expr.__narwhals_namespace__() - def func(df: CompliantDataFrame) -> Sequence[CompliantSeries]: + def func(df: CompliantDataFrame[Any]) -> Sequence[CompliantSeries]: _kwargs = { **(call_kwargs or {}), **{ @@ -303,7 +303,7 @@ def extract_compliant( def evaluate_output_names_and_aliases( expr: CompliantExpr[Any, Any], - df: CompliantDataFrame | CompliantLazyFrame, + df: CompliantDataFrame[Any] | CompliantLazyFrame, exclude: Sequence[str], ) -> tuple[Sequence[str], Sequence[str]]: output_names = expr._evaluate_output_names(df) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 68342bf95d..4c1a138114 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -5,6 +5,7 @@ import re from functools import partial from typing import TYPE_CHECKING +from typing import Any from typing import Callable from typing import Collection from typing import Generic @@ -41,7 +42,7 @@ SeriesT = TypeVar("SeriesT", bound="CompliantSeries") -FrameT = TypeVar("FrameT", bound="CompliantDataFrame | CompliantLazyFrame") +FrameT = TypeVar("FrameT", bound="CompliantDataFrame[Any] | CompliantLazyFrame") SelectorOrExpr: TypeAlias = ( "CompliantSelector[FrameT, SeriesT] | CompliantExpr[FrameT, SeriesT]" ) @@ -265,7 +266,7 @@ def names(df: FrameT) -> Sequence[str]: return self._to_expr() & other def __invert__(self: Self) -> CompliantSelector[FrameT, SeriesT]: - return self.selectors.all() - self + return self.selectors.all() - self # type: ignore[no-any-return] def __repr__(self: Self) -> str: # pragma: no cover s = f"depth={self._depth}, " if is_tracks_depth(self._implementation) else "" @@ -273,6 +274,8 @@ def __repr__(self: Self) -> str: # pragma: no cover def _eval_lhs_rhs( - df: CompliantDataFrame | CompliantLazyFrame, lhs: CompliantExpr, rhs: CompliantExpr + df: CompliantDataFrame[Any] | CompliantLazyFrame, + lhs: CompliantExpr[Any, Any], + rhs: CompliantExpr[Any, Any], ) -> tuple[Sequence[str], Sequence[str]]: return lhs._evaluate_output_names(df), rhs._evaluate_output_names(df) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index 1739be09bb..cc2ac9bc87 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -163,7 +163,7 @@ def collect( self: Self, backend: ModuleType | Implementation | str | None, **kwargs: Any, - ) -> CompliantDataFrame: + ) -> CompliantDataFrame[Any]: if backend is Implementation.PANDAS: import pandas as pd # ignore-banned-import diff --git a/narwhals/typing.py b/narwhals/typing.py index 9c4cb8bea4..8f83cce8ec 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -93,7 +93,7 @@ def schema(self) -> Mapping[str, DType]: ... CompliantFrameT = TypeVar( - "CompliantFrameT", bound="CompliantDataFrame | CompliantLazyFrame" + "CompliantFrameT", bound="CompliantDataFrame[Any] | CompliantLazyFrame" ) diff --git a/narwhals/utils.py b/narwhals/utils.py index d24d74a5a9..5d2fe3e77d 100644 --- a/narwhals/utils.py +++ b/narwhals/utils.py @@ -1309,7 +1309,7 @@ def dtype_matches_time_unit_and_time_zone( def get_column_names( - df: NativeFrame | CompliantDataFrame | CompliantLazyFrame, + df: NativeFrame | CompliantDataFrame[Any] | CompliantLazyFrame, ) -> Sequence[str]: return df.columns @@ -1319,7 +1319,9 @@ def _hasattr_static(obj: Any, attr: str) -> bool: return getattr_static(obj, attr, sentinel) is not sentinel -def is_compliant_dataframe(obj: Any) -> TypeIs[CompliantDataFrame]: +def is_compliant_dataframe( + obj: Any | CompliantDataFrame[CompliantSeriesT_co], +) -> TypeIs[CompliantDataFrame[CompliantSeriesT_co]]: return _hasattr_static(obj, "__narwhals_dataframe__") From a4edcb767a935492dfd5ab9eb9f6e8b2bdc35505 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 24 Feb 2025 12:42:33 +0000 Subject: [PATCH 39/55] fix(DRAFT): try giving a `super(...)` hints for `3.8` If this works, the errors in CI should only be for `ArrowSelector` https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1965980715 --- narwhals/_pandas_like/selectors.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index ed1947d530..c3e332dfb7 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -1,7 +1,9 @@ from __future__ import annotations +import sys from functools import partial from typing import TYPE_CHECKING +from typing import Any from typing import Iterator from narwhals._pandas_like.dataframe import PandasLikeDataFrame @@ -47,7 +49,7 @@ def _selector( function_name="selector", evaluate_output_names=evaluate_output_names, alias_output_names=None, - implementation=self._implementation, + implementation=self._implementation, # AttributeError: 'PandasSelector' object has no attribute '_implementation' backend_version=self._backend_version, version=self._version, ) @@ -58,17 +60,24 @@ def __init__(self: Self, context: _FullContext, /) -> None: self._version = context._version +# BUG: `3.8` Protocol? +# https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1965980715 class PandasSelector( # type: ignore[misc] CompliantSelector["PandasLikeDataFrame", "PandasLikeSeries"], PandasLikeExpr ): + if sys.version_info < (3, 9): + + def __init__(self, *args: Any, **kwds: Any) -> None: + super(PandasLikeExpr).__init__(*args, **kwds) + def _to_expr(self: Self) -> PandasLikeExpr: return PandasLikeExpr( - self._call, + self._call, # AttributeError: 'PandasSelector' object has no attribute '_call' depth=self._depth, function_name=self._function_name, - evaluate_output_names=self._evaluate_output_names, + evaluate_output_names=self._evaluate_output_names, # AttributeError: 'PandasSelector' object has no attribute '_evaluate_output_names' alias_output_names=self._alias_output_names, - implementation=self._implementation, + implementation=self._implementation, # AttributeError: 'PandasSelector' object has no attribute '_implementation' backend_version=self._backend_version, version=self._version, ) From cc00fdc076c4518ad5639c1288cc1c57e793a634 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 24 Feb 2025 12:47:02 +0000 Subject: [PATCH 40/55] don't forget `self` https://github.com/narwhals-dev/narwhals/actions/runs/13498306238/job/37710441415?pr=2064#step:7:2647 --- narwhals/_pandas_like/selectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index c3e332dfb7..d4eb8273cb 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -68,7 +68,7 @@ class PandasSelector( # type: ignore[misc] if sys.version_info < (3, 9): def __init__(self, *args: Any, **kwds: Any) -> None: - super(PandasLikeExpr).__init__(*args, **kwds) + super(PandasLikeExpr, self).__init__(*args, **kwds) def _to_expr(self: Self) -> PandasLikeExpr: return PandasLikeExpr( From b650c7817063c0c9fc65c78239fb9a37b7c7c3f6 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 24 Feb 2025 13:10:21 +0000 Subject: [PATCH 41/55] fix(DRAFT): try removing `CompliantSelectorNamespace.__init__` Aiming to indirectly fix https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1965980715 --- narwhals/_selectors.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 4c1a138114..a1f131c4e2 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -38,7 +38,6 @@ from narwhals.typing import TimeUnit from narwhals.utils import Implementation from narwhals.utils import Version - from narwhals.utils import _FullContext SeriesT = TypeVar("SeriesT", bound="CompliantSeries") @@ -158,14 +157,6 @@ def names(df: FrameT) -> Sequence[str]: return self._selector(series, names) - # NOTE: Can't reuse for `<3.11` - # - https://github.com/python/cpython/issues/88970 - # - https://github.com/python/cpython/pull/31628 - def __init__(self: Self, context: _FullContext, /) -> None: # pragma: no cover - self._implementation = context._implementation - self._backend_version = context._backend_version - self._version = context._version - class LazySelectorNamespace( CompliantSelectorNamespace[FrameT, SeriesT], Generic[FrameT, SeriesT], Protocol From acd8cbf0d1fc33af096fd69e795b214f2353ec15 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 24 Feb 2025 13:17:11 +0000 Subject: [PATCH 42/55] revert (72014548b6f49b864164167c2372fa43778556ad) --- narwhals/_pandas_like/selectors.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index d4eb8273cb..097016ffa8 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -1,9 +1,7 @@ from __future__ import annotations -import sys from functools import partial from typing import TYPE_CHECKING -from typing import Any from typing import Iterator from narwhals._pandas_like.dataframe import PandasLikeDataFrame @@ -65,11 +63,6 @@ def __init__(self: Self, context: _FullContext, /) -> None: class PandasSelector( # type: ignore[misc] CompliantSelector["PandasLikeDataFrame", "PandasLikeSeries"], PandasLikeExpr ): - if sys.version_info < (3, 9): - - def __init__(self, *args: Any, **kwds: Any) -> None: - super(PandasLikeExpr, self).__init__(*args, **kwds) - def _to_expr(self: Self) -> PandasLikeExpr: return PandasLikeExpr( self._call, # AttributeError: 'PandasSelector' object has no attribute '_call' From 3be58e607250beced85698f5e6d3ed197b082c9b Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 24 Feb 2025 13:22:29 +0000 Subject: [PATCH 43/55] refactor: try reusing `Protocol`'s `Generic - Didn't realise `3.8` had a `.__class_getitem__` - Possibly can replace `Protocol` w/ `Generic` in a `3.8` compat block - Provided this change doesn't break anything else --- narwhals/_selectors.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index a1f131c4e2..d4a891574e 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -8,7 +8,6 @@ from typing import Any from typing import Callable from typing import Collection -from typing import Generic from typing import Iterable from typing import Iterator from typing import Protocol @@ -49,7 +48,7 @@ EvalNames: TypeAlias = Callable[[FrameT], Sequence[str]] -class CompliantSelectorNamespace(Generic[FrameT, SeriesT], Protocol): +class CompliantSelectorNamespace(Protocol[FrameT, SeriesT]): _implementation: Implementation _backend_version: tuple[int, ...] _version: Version @@ -159,7 +158,7 @@ def names(df: FrameT) -> Sequence[str]: class LazySelectorNamespace( - CompliantSelectorNamespace[FrameT, SeriesT], Generic[FrameT, SeriesT], Protocol + CompliantSelectorNamespace[FrameT, SeriesT], Protocol[FrameT, SeriesT] ): def _iter_schema(self, df: FrameT) -> Iterator[tuple[str, DType]]: yield from df.schema.items() @@ -168,9 +167,7 @@ def _iter_columns_dtypes(self, df: FrameT, /) -> Iterator[tuple[SeriesT, DType]] yield from zip(self._iter_columns(df), df.schema.values()) -class CompliantSelector( - CompliantExpr[FrameT, SeriesT], Generic[FrameT, SeriesT], Protocol -): +class CompliantSelector(CompliantExpr[FrameT, SeriesT], Protocol[FrameT, SeriesT]): @property def selectors(self) -> CompliantSelectorNamespace[FrameT, SeriesT]: return self.__narwhals_namespace__().selectors From d57c877aea98bc18677437aa1f8eebc22a1f7ca7 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 24 Feb 2025 13:34:28 +0000 Subject: [PATCH 44/55] refactor: try the same for `CompliantExpr` - Related to https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1967646113 - Also in the bases of `CompliantSelector` --- narwhals/typing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/typing.py b/narwhals/typing.py index 8f83cce8ec..2a662484cf 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -97,7 +97,7 @@ def schema(self) -> Mapping[str, DType]: ... ) -class CompliantExpr(Protocol, Generic[CompliantFrameT, CompliantSeriesT_co]): +class CompliantExpr(Protocol[CompliantFrameT, CompliantSeriesT_co]): _implementation: Implementation _backend_version: tuple[int, ...] _version: Version From b192deb9863bd862a1dde359cb5492352a06ed4c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 24 Feb 2025 13:45:59 +0000 Subject: [PATCH 45/55] fix(DRAFT): try removing `Protocol` altogether? https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1967646113 --- narwhals/_selectors.py | 13 ++++++++++++- narwhals/typing.py | 13 ++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index d4a891574e..53c040274c 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -10,7 +10,6 @@ from typing import Collection from typing import Iterable from typing import Iterator -from typing import Protocol from typing import Sequence from typing import TypeVar from typing import overload @@ -23,6 +22,18 @@ from narwhals.utils import is_compliant_dataframe from narwhals.utils import is_tracks_depth +if not TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 9): + from typing import Protocol + else: + from typing import Generic + + Protocol = Generic +else: + from typing import Protocol + if TYPE_CHECKING: from datetime import timezone diff --git a/narwhals/typing.py b/narwhals/typing.py index 2a662484cf..be2f2f1831 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -5,11 +5,22 @@ from typing import Callable from typing import Generic from typing import Literal -from typing import Protocol from typing import Sequence from typing import TypeVar from typing import Union +if not TYPE_CHECKING: + import sys + + if sys.version_info >= (3, 9): + from typing import Protocol + else: + from typing import Generic + + Protocol = Generic +else: + from typing import Protocol + if TYPE_CHECKING: from types import ModuleType from typing import Mapping From 75c5a81d8fd6c7ae9d11059309fc7957931f677e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 24 Feb 2025 13:50:45 +0000 Subject: [PATCH 46/55] fix: only replace `Protocol` for `CompliantExpr` https://github.com/narwhals-dev/narwhals/actions/runs/13499519966/job/37714261755?pr=2064 --- narwhals/typing.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/narwhals/typing.py b/narwhals/typing.py index be2f2f1831..1efc12d926 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -5,6 +5,7 @@ from typing import Callable from typing import Generic from typing import Literal +from typing import Protocol from typing import Sequence from typing import TypeVar from typing import Union @@ -13,13 +14,11 @@ import sys if sys.version_info >= (3, 9): - from typing import Protocol + from typing import Protocol as Protocol38 else: - from typing import Generic - - Protocol = Generic + from typing import Generic as Protocol38 else: - from typing import Protocol + from typing import Protocol as Protocol38 if TYPE_CHECKING: from types import ModuleType @@ -108,7 +107,7 @@ def schema(self) -> Mapping[str, DType]: ... ) -class CompliantExpr(Protocol[CompliantFrameT, CompliantSeriesT_co]): +class CompliantExpr(Protocol38[CompliantFrameT, CompliantSeriesT_co]): _implementation: Implementation _backend_version: tuple[int, ...] _version: Version From 372f1eb17ff05457b7b8fb9caa1f46da1ae3cd7d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Mon, 24 Feb 2025 13:59:38 +0000 Subject: [PATCH 47/55] ignore coverage, remove comments https://github.com/narwhals-dev/narwhals/actions/runs/13499608446/job/37714553903?pr=2064 --- narwhals/_pandas_like/selectors.py | 10 ++++------ narwhals/_selectors.py | 4 ++-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index 097016ffa8..ed1947d530 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -47,7 +47,7 @@ def _selector( function_name="selector", evaluate_output_names=evaluate_output_names, alias_output_names=None, - implementation=self._implementation, # AttributeError: 'PandasSelector' object has no attribute '_implementation' + implementation=self._implementation, backend_version=self._backend_version, version=self._version, ) @@ -58,19 +58,17 @@ def __init__(self: Self, context: _FullContext, /) -> None: self._version = context._version -# BUG: `3.8` Protocol? -# https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1965980715 class PandasSelector( # type: ignore[misc] CompliantSelector["PandasLikeDataFrame", "PandasLikeSeries"], PandasLikeExpr ): def _to_expr(self: Self) -> PandasLikeExpr: return PandasLikeExpr( - self._call, # AttributeError: 'PandasSelector' object has no attribute '_call' + self._call, depth=self._depth, function_name=self._function_name, - evaluate_output_names=self._evaluate_output_names, # AttributeError: 'PandasSelector' object has no attribute '_evaluate_output_names' + evaluate_output_names=self._evaluate_output_names, alias_output_names=self._alias_output_names, - implementation=self._implementation, # AttributeError: 'PandasSelector' object has no attribute '_implementation' + implementation=self._implementation, backend_version=self._backend_version, version=self._version, ) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 53c040274c..09632f1ecc 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -22,7 +22,7 @@ from narwhals.utils import is_compliant_dataframe from narwhals.utils import is_tracks_depth -if not TYPE_CHECKING: +if not TYPE_CHECKING: # pragma: no cover import sys if sys.version_info >= (3, 9): @@ -31,7 +31,7 @@ from typing import Generic Protocol = Generic -else: +else: # pragma: no cover from typing import Protocol if TYPE_CHECKING: From c26d7ff4809d642f709f5920dd664aa1d07f34ca Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 1 Mar 2025 18:47:07 +0000 Subject: [PATCH 48/55] chore(typing): ignore spark issues Revisit in #2044 --- narwhals/_spark_like/selectors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py index 5c9e77a3d0..116d8c8bd5 100644 --- a/narwhals/_spark_like/selectors.py +++ b/narwhals/_spark_like/selectors.py @@ -17,14 +17,14 @@ from narwhals.utils import _FullContext -class SparkLikeSelectorNamespace(LazySelectorNamespace["SparkLikeLazyFrame", "Column"]): +class SparkLikeSelectorNamespace(LazySelectorNamespace["SparkLikeLazyFrame", "Column"]): # type: ignore[type-var] (#2044) def _iter_columns(self, df: SparkLikeLazyFrame) -> Iterator[Column]: for col in df.columns: yield df._F.col(col) def _selector( self, - call: EvalSeries[SparkLikeLazyFrame, Column], + call: EvalSeries[SparkLikeLazyFrame, Column], # type: ignore[type-var] (#2044) evaluate_output_names: EvalNames[SparkLikeLazyFrame], /, ) -> SparkLikeSelector: @@ -44,7 +44,7 @@ def __init__(self: Self, context: _FullContext, /) -> None: self._implementation = context._implementation -class SparkLikeSelector(CompliantSelector["SparkLikeLazyFrame", "Column"], SparkLikeExpr): # type: ignore[misc] +class SparkLikeSelector(CompliantSelector["SparkLikeLazyFrame", "Column"], SparkLikeExpr): # type: ignore[type-var, misc] (#2044) def _to_expr(self: Self) -> SparkLikeExpr: return SparkLikeExpr( self._call, From b4f6a5134ac93adf199fadd30efb6de79a353bfa Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 1 Mar 2025 18:51:53 +0000 Subject: [PATCH 49/55] :unamused: https://github.com/narwhals-dev/narwhals/actions/runs/13607778818/job/38041286499?pr=2064 --- narwhals/_spark_like/selectors.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py index 116d8c8bd5..d29cbf82ed 100644 --- a/narwhals/_spark_like/selectors.py +++ b/narwhals/_spark_like/selectors.py @@ -17,14 +17,15 @@ from narwhals.utils import _FullContext -class SparkLikeSelectorNamespace(LazySelectorNamespace["SparkLikeLazyFrame", "Column"]): # type: ignore[type-var] (#2044) +# NOTE: See issue regarding ignores (#2044) +class SparkLikeSelectorNamespace(LazySelectorNamespace["SparkLikeLazyFrame", "Column"]): # type: ignore[type-var] def _iter_columns(self, df: SparkLikeLazyFrame) -> Iterator[Column]: for col in df.columns: yield df._F.col(col) def _selector( self, - call: EvalSeries[SparkLikeLazyFrame, Column], # type: ignore[type-var] (#2044) + call: EvalSeries[SparkLikeLazyFrame, Column], # type: ignore[type-var] evaluate_output_names: EvalNames[SparkLikeLazyFrame], /, ) -> SparkLikeSelector: @@ -44,7 +45,7 @@ def __init__(self: Self, context: _FullContext, /) -> None: self._implementation = context._implementation -class SparkLikeSelector(CompliantSelector["SparkLikeLazyFrame", "Column"], SparkLikeExpr): # type: ignore[type-var, misc] (#2044) +class SparkLikeSelector(CompliantSelector["SparkLikeLazyFrame", "Column"], SparkLikeExpr): # type: ignore[type-var, misc] def _to_expr(self: Self) -> SparkLikeExpr: return SparkLikeExpr( self._call, From ec5dd0ca48415b7418034a3583ee6fc634172b7d Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 1 Mar 2025 19:10:09 +0000 Subject: [PATCH 50/55] refactor: Reuse eager-backend `iter_columns` Utilizing (#2115) and (#2104) --- narwhals/_arrow/selectors.py | 13 ++----------- narwhals/_pandas_like/selectors.py | 18 ++---------------- narwhals/_selectors.py | 8 ++++++++ narwhals/typing.py | 2 ++ 4 files changed, 14 insertions(+), 27 deletions(-) diff --git a/narwhals/_arrow/selectors.py b/narwhals/_arrow/selectors.py index 8cdc379b2d..d9c74be112 100644 --- a/narwhals/_arrow/selectors.py +++ b/narwhals/_arrow/selectors.py @@ -1,11 +1,10 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Iterator from narwhals._arrow.expr import ArrowExpr from narwhals._selectors import CompliantSelector -from narwhals._selectors import CompliantSelectorNamespace +from narwhals._selectors import EagerSelectorNamespace if TYPE_CHECKING: from typing_extensions import Self @@ -17,15 +16,7 @@ from narwhals.utils import _FullContext -class ArrowSelectorNamespace(CompliantSelectorNamespace["ArrowDataFrame", "ArrowSeries"]): - def _iter_columns(self, df: ArrowDataFrame) -> Iterator[ArrowSeries]: - from narwhals._arrow.series import ArrowSeries - - for col, ser in zip(df.columns, df._native_frame.itercolumns()): - yield ArrowSeries( - ser, name=col, backend_version=df._backend_version, version=df._version - ) - +class ArrowSelectorNamespace(EagerSelectorNamespace["ArrowDataFrame", "ArrowSeries"]): def _selector( self, call: EvalSeries[ArrowDataFrame, ArrowSeries], diff --git a/narwhals/_pandas_like/selectors.py b/narwhals/_pandas_like/selectors.py index ed1947d530..bdf5cf33cd 100644 --- a/narwhals/_pandas_like/selectors.py +++ b/narwhals/_pandas_like/selectors.py @@ -1,14 +1,12 @@ from __future__ import annotations -from functools import partial from typing import TYPE_CHECKING -from typing import Iterator from narwhals._pandas_like.dataframe import PandasLikeDataFrame from narwhals._pandas_like.expr import PandasLikeExpr from narwhals._pandas_like.series import PandasLikeSeries from narwhals._selectors import CompliantSelector -from narwhals._selectors import CompliantSelectorNamespace +from narwhals._selectors import EagerSelectorNamespace if TYPE_CHECKING: from typing_extensions import Self @@ -21,20 +19,8 @@ class PandasSelectorNamespace( - CompliantSelectorNamespace["PandasLikeDataFrame", "PandasLikeSeries"] + EagerSelectorNamespace["PandasLikeDataFrame", "PandasLikeSeries"] ): - def _iter_columns(self, df: PandasLikeDataFrame) -> Iterator[PandasLikeSeries]: - from narwhals._pandas_like.series import PandasLikeSeries - - series = partial( - PandasLikeSeries, - implementation=df._implementation, - backend_version=df._backend_version, - version=df._version, - ) - for _col, ser in df._native_frame.items(): # noqa: PERF102 - yield series(ser) - def _selector( self, call: EvalSeries[PandasLikeDataFrame, PandasLikeSeries], diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 09632f1ecc..9ef7b20c9c 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -52,6 +52,7 @@ SeriesT = TypeVar("SeriesT", bound="CompliantSeries") FrameT = TypeVar("FrameT", bound="CompliantDataFrame[Any] | CompliantLazyFrame") +DataFrameT = TypeVar("DataFrameT", bound="CompliantDataFrame[Any]") SelectorOrExpr: TypeAlias = ( "CompliantSelector[FrameT, SeriesT] | CompliantExpr[FrameT, SeriesT]" ) @@ -168,6 +169,13 @@ def names(df: FrameT) -> Sequence[str]: return self._selector(series, names) +class EagerSelectorNamespace( + CompliantSelectorNamespace[DataFrameT, SeriesT], Protocol[DataFrameT, SeriesT] +): + def _iter_columns(self, df: DataFrameT, /) -> Iterator[SeriesT]: + yield from df.iter_columns() + + class LazySelectorNamespace( CompliantSelectorNamespace[FrameT, SeriesT], Protocol[FrameT, SeriesT] ): diff --git a/narwhals/typing.py b/narwhals/typing.py index c1d256ea72..e409568430 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -3,6 +3,7 @@ from typing import TYPE_CHECKING from typing import Any from typing import Callable +from typing import Iterator from typing import Literal from typing import Protocol from typing import Sequence @@ -83,6 +84,7 @@ def columns(self) -> Sequence[str]: ... @property def schema(self) -> Mapping[str, DType]: ... def get_column(self, name: str) -> CompliantSeriesT_co: ... + def iter_columns(self) -> Iterator[CompliantSeriesT_co]: ... class CompliantLazyFrame(Protocol): From c180c7efbd85ed78810b04625fc9ee87681e89d3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 2 Mar 2025 18:21:18 +0000 Subject: [PATCH 51/55] refactor: define `CompliantLazyFrame._iter_columns` - Required some compatibility aliasing for `pandas`, `pyarrow` - They're faux-lazy https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1976486623 --- narwhals/_arrow/dataframe.py | 2 ++ narwhals/_dask/dataframe.py | 6 ++++++ narwhals/_dask/selectors.py | 5 ----- narwhals/_duckdb/dataframe.py | 5 +++++ narwhals/_duckdb/selectors.py | 7 ------- narwhals/_pandas_like/dataframe.py | 2 ++ narwhals/_polars/dataframe.py | 3 +++ narwhals/_selectors.py | 10 +++++++--- narwhals/_spark_like/dataframe.py | 5 +++++ narwhals/_spark_like/selectors.py | 5 ----- narwhals/typing.py | 1 + 11 files changed, 31 insertions(+), 20 deletions(-) diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 2cb84475d0..e0709a84ff 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -164,6 +164,8 @@ def iter_columns(self) -> Iterator[ArrowSeries]: version=self._version, ) + _iter_columns = iter_columns + def iter_rows( self: Self, *, named: bool, buffer_size: int ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]: diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index e77d05d742..e8dde409a3 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING from typing import Any +from typing import Iterator from typing import Literal from typing import Sequence @@ -24,6 +25,7 @@ if TYPE_CHECKING: from types import ModuleType + import dask.dataframe.dask_expr as dx from typing_extensions import Self from narwhals._dask.expr import DaskExpr @@ -79,6 +81,10 @@ def _from_native_frame(self: Self, df: Any) -> Self: version=self._version, ) + def _iter_columns(self) -> Iterator[dx.Series]: + for _col, ser in self._native_frame.items(): # noqa: PERF102 + yield ser + def with_columns(self: Self, *exprs: DaskExpr) -> Self: df = self._native_frame new_series = evaluate_exprs(self, *exprs) diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index bd42fc76c7..59c8dba474 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Iterator from narwhals._dask.expr import DaskExpr from narwhals._selectors import CompliantSelector @@ -27,10 +26,6 @@ class DaskSelectorNamespace(LazySelectorNamespace["DaskLazyFrame", "dx.Series"]): # pyright: ignore[reportInvalidTypeArguments] - def _iter_columns(self, df: DaskLazyFrame) -> Iterator[dx.Series]: - for _col, ser in df._native_frame.items(): # noqa: PERF102 - yield ser - def _selector( self, call: EvalSeries[DaskLazyFrame, dx.Series], # pyright: ignore[reportInvalidTypeForm] diff --git a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index dc74eae824..e1fa303bd9 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -2,6 +2,7 @@ from typing import TYPE_CHECKING from typing import Any +from typing import Iterator from typing import Literal from typing import Sequence @@ -85,6 +86,10 @@ def __getitem__(self: Self, item: str) -> DuckDBInterchangeSeries: self._native_frame.select(item), version=self._version ) + def _iter_columns(self) -> Iterator[duckdb.Expression]: + for col in self.columns: + yield ColumnExpression(col) + def collect( self: Self, backend: ModuleType | Implementation | str | None, diff --git a/narwhals/_duckdb/selectors.py b/narwhals/_duckdb/selectors.py index 9e99f0e78f..0e54fd3c76 100644 --- a/narwhals/_duckdb/selectors.py +++ b/narwhals/_duckdb/selectors.py @@ -1,9 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Iterator - -from duckdb import ColumnExpression from narwhals._duckdb.expr import DuckDBExpr from narwhals._selectors import CompliantSelector @@ -22,10 +19,6 @@ class DuckDBSelectorNamespace( LazySelectorNamespace["DuckDBLazyFrame", "duckdb.Expression"] # type: ignore[type-var] ): - def _iter_columns(self, df: DuckDBLazyFrame) -> Iterator[duckdb.Expression]: - for col in df.columns: - yield ColumnExpression(col) - def _selector( self, call: EvalSeries[DuckDBLazyFrame, duckdb.Expression], # type: ignore[type-var] diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index f2a336146e..e958575461 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -350,6 +350,8 @@ def iter_columns(self) -> Iterator[PandasLikeSeries]: version=self._version, ) + _iter_columns = iter_columns + def iter_rows( self: Self, *, diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index bc25184e11..4683d59e7a 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -438,6 +438,9 @@ def func(*args: Any, **kwargs: Any) -> Any: return func + def _iter_columns(self) -> Iterator[PolarsSeries]: # pragma: no cover + yield from self.collect(self._implementation).iter_columns() + @property def columns(self: Self) -> list[str]: return self._native_frame.columns diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 9ef7b20c9c..ac4f3cc0fa 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -53,6 +53,7 @@ SeriesT = TypeVar("SeriesT", bound="CompliantSeries") FrameT = TypeVar("FrameT", bound="CompliantDataFrame[Any] | CompliantLazyFrame") DataFrameT = TypeVar("DataFrameT", bound="CompliantDataFrame[Any]") +LazyFrameT = TypeVar("LazyFrameT", bound="CompliantLazyFrame") SelectorOrExpr: TypeAlias = ( "CompliantSelector[FrameT, SeriesT] | CompliantExpr[FrameT, SeriesT]" ) @@ -177,12 +178,15 @@ def _iter_columns(self, df: DataFrameT, /) -> Iterator[SeriesT]: class LazySelectorNamespace( - CompliantSelectorNamespace[FrameT, SeriesT], Protocol[FrameT, SeriesT] + CompliantSelectorNamespace[LazyFrameT, SeriesT], Protocol[LazyFrameT, SeriesT] ): - def _iter_schema(self, df: FrameT) -> Iterator[tuple[str, DType]]: + def _iter_schema(self, df: LazyFrameT) -> Iterator[tuple[str, DType]]: yield from df.schema.items() - def _iter_columns_dtypes(self, df: FrameT, /) -> Iterator[tuple[SeriesT, DType]]: + def _iter_columns(self, df: LazyFrameT) -> Iterator[SeriesT]: + yield from df._iter_columns() + + def _iter_columns_dtypes(self, df: LazyFrameT, /) -> Iterator[tuple[SeriesT, DType]]: yield from zip(self._iter_columns(df), df.schema.values()) diff --git a/narwhals/_spark_like/dataframe.py b/narwhals/_spark_like/dataframe.py index d4a792e678..ec5d7aafd7 100644 --- a/narwhals/_spark_like/dataframe.py +++ b/narwhals/_spark_like/dataframe.py @@ -4,6 +4,7 @@ from importlib import import_module from typing import TYPE_CHECKING from typing import Any +from typing import Iterator from typing import Literal from typing import Sequence from typing import cast @@ -195,6 +196,10 @@ def _collect_to_arrow(self) -> pa.Table: to_arrow: Incomplete = self._native_frame.toArrow return to_arrow() + def _iter_columns(self) -> Iterator[Column]: + for col in self.columns: + yield self._F.col(col) + @property def columns(self: Self) -> list[str]: return list(self.schema) diff --git a/narwhals/_spark_like/selectors.py b/narwhals/_spark_like/selectors.py index d29cbf82ed..eb7ab72fae 100644 --- a/narwhals/_spark_like/selectors.py +++ b/narwhals/_spark_like/selectors.py @@ -1,7 +1,6 @@ from __future__ import annotations from typing import TYPE_CHECKING -from typing import Iterator from narwhals._selectors import CompliantSelector from narwhals._selectors import LazySelectorNamespace @@ -19,10 +18,6 @@ # NOTE: See issue regarding ignores (#2044) class SparkLikeSelectorNamespace(LazySelectorNamespace["SparkLikeLazyFrame", "Column"]): # type: ignore[type-var] - def _iter_columns(self, df: SparkLikeLazyFrame) -> Iterator[Column]: - for col in df.columns: - yield df._F.col(col) - def _selector( self, call: EvalSeries[SparkLikeLazyFrame, Column], # type: ignore[type-var] diff --git a/narwhals/typing.py b/narwhals/typing.py index e409568430..30123513f4 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -101,6 +101,7 @@ def aggregate(self, *exprs: Any) -> Self: def columns(self) -> Sequence[str]: ... @property def schema(self) -> Mapping[str, DType]: ... + def _iter_columns(self) -> Iterator[Any]: ... CompliantFrameT = TypeVar( From e85c934bec1bc3908aeab558b2bd75de64554041 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 2 Mar 2025 18:22:12 +0000 Subject: [PATCH 52/55] docs: move note back into code https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1976671296 --- narwhals/_selectors.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index ac4f3cc0fa..0b7055fe18 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -80,6 +80,10 @@ def _iter_schema(self, df: FrameT, /) -> Iterator[tuple[str, DType]]: yield ser.name, ser.dtype def _iter_columns_dtypes(self, df: FrameT, /) -> Iterator[tuple[SeriesT, DType]]: + # NOTE: Defined to be overriden for lazy + # - Their `SeriesT` is a **native** object + # - `.dtype` won't return a `nw.DType` (or maybe anything) for lazy backends + # - See (https://github.com/narwhals-dev/narwhals/issues/2044) for ser in self._iter_columns(df): yield ser, ser.dtype From a43d5e5732b69a49bdc880a675b3e01ba46e81a9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 2 Mar 2025 18:23:28 +0000 Subject: [PATCH 53/55] refactor: remove duplicate import --- narwhals/_dask/selectors.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/narwhals/_dask/selectors.py b/narwhals/_dask/selectors.py index 59c8dba474..9533721d49 100644 --- a/narwhals/_dask/selectors.py +++ b/narwhals/_dask/selectors.py @@ -19,11 +19,6 @@ from narwhals._selectors import EvalSeries from narwhals.utils import _FullContext - try: - import dask.dataframe.dask_expr as dx - except ModuleNotFoundError: - import dask_expr as dx - class DaskSelectorNamespace(LazySelectorNamespace["DaskLazyFrame", "dx.Series"]): # pyright: ignore[reportInvalidTypeArguments] def _selector( From f5765ffcc92cc49fc2fb1967ab13ea4d7ab79635 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 2 Mar 2025 18:26:26 +0000 Subject: [PATCH 54/55] typo https://results.pre-commit.ci/run/github/760058710/1740939895.mH7-kxPiRby66pjYWonI9A --- narwhals/_selectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 0b7055fe18..41b6248e21 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -80,7 +80,7 @@ def _iter_schema(self, df: FrameT, /) -> Iterator[tuple[str, DType]]: yield ser.name, ser.dtype def _iter_columns_dtypes(self, df: FrameT, /) -> Iterator[tuple[SeriesT, DType]]: - # NOTE: Defined to be overriden for lazy + # NOTE: Defined to be overridden for lazy # - Their `SeriesT` is a **native** object # - `.dtype` won't return a `nw.DType` (or maybe anything) for lazy backends # - See (https://github.com/narwhals-dev/narwhals/issues/2044) From cd31cbe243bdc150ae1e851dbce18b32ec3301f0 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sun, 2 Mar 2025 18:33:58 +0000 Subject: [PATCH 55/55] docs: add notes on `3.8` compat code https://github.com/narwhals-dev/narwhals/pull/2064#pullrequestreview-2652771711 --- narwhals/_selectors.py | 2 ++ narwhals/typing.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/narwhals/_selectors.py b/narwhals/_selectors.py index 41b6248e21..639b0eb740 100644 --- a/narwhals/_selectors.py +++ b/narwhals/_selectors.py @@ -23,6 +23,8 @@ from narwhals.utils import is_tracks_depth if not TYPE_CHECKING: # pragma: no cover + # TODO @dangotbanned: Remove after dropping `3.8` (#2084) + # - https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1965921386 import sys if sys.version_info >= (3, 9): diff --git a/narwhals/typing.py b/narwhals/typing.py index 30123513f4..d131a32ec5 100644 --- a/narwhals/typing.py +++ b/narwhals/typing.py @@ -18,6 +18,8 @@ else: from typing import Generic as Protocol38 else: + # TODO @dangotbanned: Remove after dropping `3.8` (#2084) + # - https://github.com/narwhals-dev/narwhals/pull/2064#discussion_r1965921386 from typing import Protocol as Protocol38 if TYPE_CHECKING: