diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 0d4206fa4e..eb72fa3f08 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -19,6 +19,7 @@ from narwhals._expression_parsing import evaluate_into_exprs from narwhals.dependencies import is_numpy_array from narwhals.utils import Implementation +from narwhals.utils import Version from narwhals.utils import check_column_exists from narwhals.utils import generate_temporary_column_name from narwhals.utils import is_sequence_but_not_str @@ -519,8 +520,44 @@ def tail(self: Self, n: int) -> Self: else: return self._from_native_frame(df.slice(abs(n))) - def lazy(self: Self) -> Self: - return self + def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyFrame: + from narwhals.utils import parse_version + + if backend is None: + return self + elif backend is Implementation.DUCKDB: + import duckdb # ignore-banned-import + + from narwhals._duckdb.dataframe import DuckDBLazyFrame + + df = self._native_frame # noqa: F841 + return DuckDBLazyFrame( + df=duckdb.table("df"), + backend_version=parse_version(duckdb.__version__), + version=self._version, + ) + elif backend is Implementation.POLARS: + import polars as pl # ignore-banned-import + + from narwhals._polars.dataframe import PolarsLazyFrame + + return PolarsLazyFrame( + df=pl.from_arrow(self._native_frame).lazy(), # type: ignore[union-attr] + backend_version=parse_version(pl.__version__), + version=self._version, + ) + elif backend is Implementation.DASK: + import dask # ignore-banned-import + import dask.dataframe as dd # ignore-banned-import + + from narwhals._dask.dataframe import DaskLazyFrame + + return DaskLazyFrame( + native_dataframe=dd.from_pandas(self._native_frame.to_pandas()), + backend_version=parse_version(dask.__version__), + version=self._version, + ) + raise AssertionError # pragma: no cover def collect(self: Self) -> ArrowDataFrame: return ArrowDataFrame( diff --git 
a/narwhals/_duckdb/dataframe.py b/narwhals/_duckdb/dataframe.py index f17c8073a5..6e3699b065 100644 --- a/narwhals/_duckdb/dataframe.py +++ b/narwhals/_duckdb/dataframe.py @@ -142,7 +142,14 @@ def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001 selection = (col for col in self.columns if col not in columns_to_drop) return self._from_native_frame(self._native_frame.select(*selection)) - def lazy(self: Self) -> Self: + def lazy(self: Self, *, backend: Implementation | None = None) -> Self: + # The `backend` argument is not supported here, but we keep it in the signature for + # backwards compatibility because in `narwhals.stable.v1` + # function `.from_native()` will return a DataFrame for DuckDB. + + if backend is not None: # pragma: no cover + msg = "`backend` argument is not supported for DuckDB" + raise ValueError(msg) return self def with_columns( diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index f07247371c..8972deb704 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -709,8 +709,44 @@ def unique( ) # --- lazy-only --- - def lazy(self: Self) -> Self: - return self + def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyFrame: + from narwhals.utils import parse_version + + if backend is None: + return self + elif backend is Implementation.DUCKDB: + import duckdb # ignore-banned-import + + from narwhals._duckdb.dataframe import DuckDBLazyFrame + + df = self._native_frame # noqa: F841 + return DuckDBLazyFrame( + df=duckdb.table("df"), + backend_version=parse_version(duckdb.__version__), + version=self._version, + ) + elif backend is Implementation.POLARS: + import polars as pl # ignore-banned-import + + from narwhals._polars.dataframe import PolarsLazyFrame + + return PolarsLazyFrame( + df=pl.from_pandas(self._native_frame).lazy(), + backend_version=parse_version(pl.__version__), + version=self._version, + ) + elif backend is Implementation.DASK: + 
import dask # ignore-banned-import + import dask.dataframe as dd # ignore-banned-import + + from narwhals._dask.dataframe import DaskLazyFrame + + return DaskLazyFrame( + native_dataframe=dd.from_pandas(self._native_frame), + backend_version=parse_version(dask.__version__), + version=self._version, + ) + raise AssertionError # pragma: no cover @property def shape(self: Self) -> tuple[int, int]: diff --git a/narwhals/_polars/dataframe.py b/narwhals/_polars/dataframe.py index 727912f11a..b01894b391 100644 --- a/narwhals/_polars/dataframe.py +++ b/narwhals/_polars/dataframe.py @@ -29,6 +29,7 @@ from narwhals._polars.group_by import PolarsLazyGroupBy from narwhals._polars.series import PolarsSeries from narwhals.dtypes import DType + from narwhals.typing import CompliantLazyFrame from narwhals.utils import Version T = TypeVar("T") @@ -231,12 +232,40 @@ def schema(self: Self) -> dict[str, DType]: for name, dtype in schema.items() } - def lazy(self: Self) -> PolarsLazyFrame: - return PolarsLazyFrame( - self._native_frame.lazy(), - backend_version=self._backend_version, - version=self._version, - ) + def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyFrame: + from narwhals.utils import parse_version + + if backend is None or backend is Implementation.POLARS: + from narwhals._polars.dataframe import PolarsLazyFrame + + return PolarsLazyFrame( + self._native_frame.lazy(), + backend_version=self._backend_version, + version=self._version, + ) + elif backend is Implementation.DUCKDB: + import duckdb # ignore-banned-import + + from narwhals._duckdb.dataframe import DuckDBLazyFrame + + df = self._native_frame # noqa: F841 + return DuckDBLazyFrame( + df=duckdb.table("df"), + backend_version=parse_version(duckdb.__version__), + version=self._version, + ) + elif backend is Implementation.DASK: + import dask # ignore-banned-import + import dask.dataframe as dd # ignore-banned-import + + from narwhals._dask.dataframe import DaskLazyFrame + + return 
DaskLazyFrame( + native_dataframe=dd.from_pandas(self._native_frame.to_pandas()), + backend_version=parse_version(dask.__version__), + version=self._version, + ) + raise AssertionError # pragma: no cover @overload def to_dict(self: Self, *, as_series: Literal[True]) -> dict[str, PolarsSeries]: ... diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index d02d589c2d..805c8cd6f7 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -23,6 +23,7 @@ from narwhals.exceptions import ShapeError from narwhals.schema import Schema from narwhals.translate import to_native +from narwhals.utils import Implementation from narwhals.utils import find_stacklevel from narwhals.utils import flatten from narwhals.utils import generate_repr @@ -50,7 +51,6 @@ from narwhals.typing import IntoExpr from narwhals.typing import IntoFrame from narwhals.typing import SizeUnit - from narwhals.utils import Implementation PS = ParamSpec("PS") @@ -498,54 +498,72 @@ def __arrow_c_stream__(self: Self, requested_schema: object | None = None) -> ob pa_table = self.to_arrow() return pa_table.__arrow_c_stream__(requested_schema=requested_schema) - def lazy(self: Self) -> LazyFrame[Any]: - """Lazify the DataFrame (if possible). + def lazy(self: Self, *, backend: Implementation | None = None) -> LazyFrame[Any]: + """Restrict available API methods to lazy-only ones. - If a library does not support lazy execution, then this is a no-op. + If `backend` is specified, then a conversion between different backends + might be triggered. + If a library does not support lazy execution and `backend` is not specified, + then this will only restrict the API to lazy-only operations. This is useful + if you want to ensure that you write dataframe-agnostic code which all has + the possibility of running entirely lazily. + + Arguments: + backend: The (lazy) implementation to convert to. 
If not specified, and the + given library does not support lazy execution, then this will restrict + the API to lazy-only operations. Returns: A new LazyFrame. Examples: - Construct pandas, Polars and PyArrow DataFrames: - - >>> import pandas as pd >>> import polars as pl >>> import pyarrow as pa >>> import narwhals as nw - >>> from narwhals.typing import IntoFrame - >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) - >>> df_pa = pa.table(data) - - We define a library agnostic function: + >>> df_native = pl.DataFrame({"a": [1, 2], "b": [4, 6]}) + >>> df = nw.from_native(df_native) - >>> def agnostic_lazy(df_native: IntoFrame) -> IntoFrame: - ... df = nw.from_native(df_native) - ... return df.lazy().to_native() + If we call `df.lazy`, we get a `narwhals.LazyFrame` backed by a Polars + LazyFrame. - Note that then, pandas and pyarrow dataframe stay eager, but Polars DataFrame - becomes a Polars LazyFrame: + >>> df.lazy() # doctest: +SKIP + ┌─────────────────────────────┐ + | Narwhals LazyFrame | + |-----------------------------| + || + └─────────────────────────────┘ - >>> agnostic_lazy(df_pd) - foo bar ham - 0 1 6.0 a - 1 2 7.0 b - 2 3 8.0 c - >>> agnostic_lazy(df_pl) - - >>> agnostic_lazy(df_pa) - pyarrow.Table - foo: int64 - bar: double - ham: string - ---- - foo: [[1,2,3]] - bar: [[6,7,8]] - ham: [["a","b","c"]] + We can also pass DuckDB as the backend, and then we'll get a + `narwhals.LazyFrame` backed by a `duckdb.DuckDBPyRelation`. 
+ + >>> df.lazy(backend=nw.Implementation.DUCKDB) + ┌──────────────────┐ + |Narwhals LazyFrame| + |------------------| + |┌───────┬───────┐ | + |│ a │ b │ | + |│ int64 │ int64 │ | + |├───────┼───────┤ | + |│ 1 │ 4 │ | + |│ 2 │ 6 │ | + |└───────┴───────┘ | + └──────────────────┘ """ - return self._lazyframe(self._compliant_frame.lazy(), level="lazy") + supported_lazy_backends = ( + Implementation.DASK, + Implementation.DUCKDB, + Implementation.POLARS, + ) + if backend is not None and backend not in supported_lazy_backends: + msg = ( + "Not-supported backend." + f"\n\nExpected one of {supported_lazy_backends} or `None`, got {backend}" + ) + raise ValueError(msg) + return self._lazyframe( + self._compliant_frame.lazy(backend=backend), + level="lazy", + ) def to_native(self: Self) -> DataFrameT: """Convert Narwhals DataFrame to native one. diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index ffd6fc4342..cba00013b4 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -167,15 +167,25 @@ def __getitem__(self: Self, item: tuple[slice, slice]) -> Self: ... def __getitem__(self: Self, item: Any) -> Any: return super().__getitem__(item) - def lazy(self: Self) -> LazyFrame[Any]: - """Lazify the DataFrame (if possible). + def lazy(self: Self, *, backend: Implementation | None = None) -> LazyFrame[Any]: + """Restrict available API methods to lazy-only ones. - If a library does not support lazy execution, then this is a no-op. + If `backend` is specified, then a conversion between different backends + might be triggered. + If a library does not support lazy execution and `backend` is not specified, + then this will only restrict the API to lazy-only operations. This is useful + if you want to ensure that you write dataframe-agnostic code which all has + the possibility of running entirely lazily. + + Arguments: + backend: The (lazy) implementation to convert to. 
If not specified, and the + given library does not support lazy execution, then this will restrict + the API to lazy-only operations. Returns: A new LazyFrame. """ - return super().lazy() # type: ignore[return-value] + return super().lazy(backend=backend) # type: ignore[return-value] # Not sure what mypy is complaining about, probably some fancy # thing that I need to understand category theory for diff --git a/tests/frame/lazy_test.py b/tests/frame/lazy_test.py index df27a4cc9a..fa9d871dfc 100644 --- a/tests/frame/lazy_test.py +++ b/tests/frame/lazy_test.py @@ -2,17 +2,49 @@ from typing import TYPE_CHECKING +import pytest + import narwhals as nw import narwhals.stable.v1 as nw_v1 +from narwhals.utils import Implementation if TYPE_CHECKING: from tests.utils import ConstructorEager +data = {"a": [1, 2, 3]} + + def test_lazy(constructor_eager: ConstructorEager) -> None: - df = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True) + df = nw.from_native(constructor_eager(data), eager_only=True) result = df.lazy() assert isinstance(result, nw.LazyFrame) - df = nw_v1.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True) + df = nw_v1.from_native(constructor_eager(data), eager_only=True) result = df.lazy() assert isinstance(result, nw_v1.LazyFrame) + + +@pytest.mark.parametrize( + "backend", [Implementation.POLARS, Implementation.DUCKDB, Implementation.DASK] +) +def test_lazy_backend( + request: pytest.FixtureRequest, + constructor_eager: ConstructorEager, + backend: Implementation, +) -> None: + if "modin" in str(constructor_eager): + request.applymarker(pytest.mark.xfail) + if backend is Implementation.DASK: + pytest.importorskip("dask") + if backend is Implementation.DUCKDB: + pytest.importorskip("duckdb") + df = nw.from_native(constructor_eager(data), eager_only=True) + result = df.lazy(backend=backend) + assert isinstance(result, nw.LazyFrame) + assert result.implementation == backend + + +def test_lazy_backend_invalid(constructor_eager: 
ConstructorEager) -> None: + df = nw.from_native(constructor_eager(data), eager_only=True) + with pytest.raises(ValueError, match="Not-supported backend"): + df.lazy(backend=Implementation.PANDAS)