Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 39 additions & 2 deletions narwhals/_arrow/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from narwhals._expression_parsing import evaluate_into_exprs
from narwhals.dependencies import is_numpy_array
from narwhals.utils import Implementation
from narwhals.utils import Version
from narwhals.utils import check_column_exists
from narwhals.utils import generate_temporary_column_name
from narwhals.utils import is_sequence_but_not_str
Expand Down Expand Up @@ -519,8 +520,44 @@ def tail(self: Self, n: int) -> Self:
else:
return self._from_native_frame(df.slice(abs(n)))

def lazy(self: Self) -> Self:
return self
def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyFrame:
    """Expose this Arrow-backed frame through the lazy API, optionally switching backend.

    Arguments:
        backend: Target lazy implementation. `None` hands back this object
            unchanged; `Implementation.DUCKDB`, `Implementation.POLARS` and
            `Implementation.DASK` build the matching lazy-frame wrapper from
            the native frame.

    Returns:
        `self` when `backend` is `None`, otherwise a `DuckDBLazyFrame`,
        `PolarsLazyFrame` or `DaskLazyFrame`.

    Raises:
        AssertionError: If `backend` is any other value.
    """
    from narwhals.utils import parse_version

    if backend is None:
        return self

    if backend is Implementation.DUCKDB:
        import duckdb  # ignore-banned-import

        from narwhals._duckdb.dataframe import DuckDBLazyFrame

        # This local must stay named `df`: `duckdb.table("df")` refers to it by
        # name (presumably through DuckDB's lookup of Python variables - confirm).
        df = self._native_frame  # noqa: F841
        relation = duckdb.table("df")
        return DuckDBLazyFrame(
            df=relation,
            backend_version=parse_version(duckdb.__version__),
            version=self._version,
        )

    if backend is Implementation.POLARS:
        import polars as pl  # ignore-banned-import

        from narwhals._polars.dataframe import PolarsLazyFrame

        lazy_native = pl.from_arrow(self._native_frame).lazy()  # type: ignore[union-attr]
        return PolarsLazyFrame(
            df=lazy_native,
            backend_version=parse_version(pl.__version__),
            version=self._version,
        )

    if backend is Implementation.DASK:
        import dask  # ignore-banned-import
        import dask.dataframe as dd  # ignore-banned-import

        from narwhals._dask.dataframe import DaskLazyFrame

        # The native frame is converted to pandas first, then handed to Dask.
        pandas_frame = self._native_frame.to_pandas()
        return DaskLazyFrame(
            native_dataframe=dd.from_pandas(pandas_frame),
            backend_version=parse_version(dask.__version__),
            version=self._version,
        )

    raise AssertionError  # pragma: no cover

def collect(self: Self) -> ArrowDataFrame:
return ArrowDataFrame(
Expand Down
9 changes: 8 additions & 1 deletion narwhals/_duckdb/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,14 @@ def drop(self: Self, columns: list[str], strict: bool) -> Self: # noqa: FBT001
selection = (col for col in self.columns if col not in columns_to_drop)
return self._from_native_frame(self._native_frame.select(*selection))

def lazy(self: Self) -> Self:
def lazy(self: Self, *, backend: Implementation | None = None) -> Self:
# The `backend` argument has no effect but we keep it here for
# backwards compatibility because in `narwhals.stable.v1`
# function `.from_native()` will return a DataFrame for DuckDB.

if backend is not None: # pragma: no cover
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think duckdb should be allowed?

Suggested change
if backend is not None: # pragma: no cover
if backend not in (None, Implementation.DUCKDB): # pragma: no cover

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i don't think it needs to be - if this path only exists for v1 backwards-compatibility, then #1895 (comment) is something somebody could have written, but they wouldn't have passed a backend, so raising for backend is not None wouldn't break anyone's code

msg = "`backend` argument is not supported for DuckDB"
raise ValueError(msg)
return self

def with_columns(
Expand Down
40 changes: 38 additions & 2 deletions narwhals/_pandas_like/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -709,8 +709,44 @@ def unique(
)

# --- lazy-only ---
def lazy(self: Self) -> Self:
return self
def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyFrame:
    """Expose this pandas-like frame through the lazy API, optionally switching backend.

    Arguments:
        backend: Target lazy implementation. `None` hands back this object
            unchanged; `Implementation.DUCKDB`, `Implementation.POLARS` and
            `Implementation.DASK` build the matching lazy-frame wrapper from
            the native frame.

    Returns:
        `self` when `backend` is `None`, otherwise a `DuckDBLazyFrame`,
        `PolarsLazyFrame` or `DaskLazyFrame`.

    Raises:
        AssertionError: If `backend` is any other value.
    """
    from narwhals.utils import parse_version

    if backend is None:
        return self

    if backend is Implementation.DUCKDB:
        import duckdb  # ignore-banned-import

        from narwhals._duckdb.dataframe import DuckDBLazyFrame

        # This local must stay named `df`: `duckdb.table("df")` refers to it by
        # name (presumably through DuckDB's lookup of Python variables - confirm).
        df = self._native_frame  # noqa: F841
        relation = duckdb.table("df")
        return DuckDBLazyFrame(
            df=relation,
            backend_version=parse_version(duckdb.__version__),
            version=self._version,
        )

    if backend is Implementation.POLARS:
        import polars as pl  # ignore-banned-import

        from narwhals._polars.dataframe import PolarsLazyFrame

        lazy_native = pl.from_pandas(self._native_frame).lazy()
        return PolarsLazyFrame(
            df=lazy_native,
            backend_version=parse_version(pl.__version__),
            version=self._version,
        )

    if backend is Implementation.DASK:
        import dask  # ignore-banned-import
        import dask.dataframe as dd  # ignore-banned-import

        from narwhals._dask.dataframe import DaskLazyFrame

        dask_native = dd.from_pandas(self._native_frame)
        return DaskLazyFrame(
            native_dataframe=dask_native,
            backend_version=parse_version(dask.__version__),
            version=self._version,
        )

    raise AssertionError  # pragma: no cover

@property
def shape(self: Self) -> tuple[int, int]:
Expand Down
41 changes: 35 additions & 6 deletions narwhals/_polars/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from narwhals._polars.group_by import PolarsLazyGroupBy
from narwhals._polars.series import PolarsSeries
from narwhals.dtypes import DType
from narwhals.typing import CompliantLazyFrame
from narwhals.utils import Version

T = TypeVar("T")
Expand Down Expand Up @@ -231,12 +232,40 @@ def schema(self: Self) -> dict[str, DType]:
for name, dtype in schema.items()
}

def lazy(self: Self) -> PolarsLazyFrame:
return PolarsLazyFrame(
self._native_frame.lazy(),
backend_version=self._backend_version,
version=self._version,
)
def lazy(self: Self, *, backend: Implementation | None = None) -> CompliantLazyFrame:
    """Convert this Polars-backed frame into a lazy frame, optionally switching backend.

    Arguments:
        backend: Target lazy implementation. `None` and `Implementation.POLARS`
            both lazify the native Polars frame; `Implementation.DUCKDB` and
            `Implementation.DASK` build the matching lazy-frame wrapper from
            the native frame.

    Returns:
        A `PolarsLazyFrame`, `DuckDBLazyFrame` or `DaskLazyFrame`.

    Raises:
        AssertionError: If `backend` is any other value.
    """
    from narwhals.utils import parse_version

    if backend is None or backend is Implementation.POLARS:
        # `PolarsLazyFrame` lives in this same module, so it is used directly
        # instead of being re-imported from `narwhals._polars.dataframe`.
        return PolarsLazyFrame(
            self._native_frame.lazy(),
            backend_version=self._backend_version,
            version=self._version,
        )

    if backend is Implementation.DUCKDB:
        import duckdb  # ignore-banned-import

        from narwhals._duckdb.dataframe import DuckDBLazyFrame

        # This local must stay named `df`: `duckdb.table("df")` refers to it by
        # name (presumably through DuckDB's lookup of Python variables - confirm).
        df = self._native_frame  # noqa: F841
        return DuckDBLazyFrame(
            df=duckdb.table("df"),
            backend_version=parse_version(duckdb.__version__),
            version=self._version,
        )

    if backend is Implementation.DASK:
        import dask  # ignore-banned-import
        import dask.dataframe as dd  # ignore-banned-import

        from narwhals._dask.dataframe import DaskLazyFrame

        # The native frame is converted to pandas first, then handed to Dask.
        return DaskLazyFrame(
            native_dataframe=dd.from_pandas(self._native_frame.to_pandas()),
            backend_version=parse_version(dask.__version__),
            version=self._version,
        )

    raise AssertionError  # pragma: no cover

@overload
def to_dict(self: Self, *, as_series: Literal[True]) -> dict[str, PolarsSeries]: ...
Expand Down
90 changes: 54 additions & 36 deletions narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from narwhals.exceptions import ShapeError
from narwhals.schema import Schema
from narwhals.translate import to_native
from narwhals.utils import Implementation
from narwhals.utils import find_stacklevel
from narwhals.utils import flatten
from narwhals.utils import generate_repr
Expand Down Expand Up @@ -50,7 +51,6 @@
from narwhals.typing import IntoExpr
from narwhals.typing import IntoFrame
from narwhals.typing import SizeUnit
from narwhals.utils import Implementation

PS = ParamSpec("PS")

Expand Down Expand Up @@ -498,54 +498,72 @@ def __arrow_c_stream__(self: Self, requested_schema: object | None = None) -> ob
pa_table = self.to_arrow()
return pa_table.__arrow_c_stream__(requested_schema=requested_schema)

def lazy(self: Self) -> LazyFrame[Any]:
"""Lazify the DataFrame (if possible).
def lazy(self: Self, *, backend: Implementation | None = None) -> LazyFrame[Any]:
"""Restrict available API methods to lazy-only ones.

If a library does not support lazy execution, then this is a no-op.
If `backend` is specified, then a conversion between different backends
might be triggered.
If a library does not support lazy execution and `backend` is not specified,
then this will only restrict the API to lazy-only operations. This is useful
if you want to ensure that you write dataframe-agnostic code which all has
the possibility of running entirely lazily.

Arguments:
backend: The (lazy) implementation to convert to. If not specified, and the
given library does not support lazy execution, then this will restrict
the API to lazy-only operations.

Returns:
A new LazyFrame.

Examples:
Construct pandas, Polars and PyArrow DataFrames:

>>> import pandas as pd
>>> import polars as pl
>>> import pyarrow as pa
>>> import narwhals as nw
>>> from narwhals.typing import IntoFrame
>>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]}
>>> df_pd = pd.DataFrame(data)
>>> df_pl = pl.DataFrame(data)
>>> df_pa = pa.table(data)

We define a library agnostic function:
>>> df_native = pl.DataFrame({"a": [1, 2], "b": [4, 6]})
>>> df = nw.from_native(df_native)

>>> def agnostic_lazy(df_native: IntoFrame) -> IntoFrame:
... df = nw.from_native(df_native)
... return df.lazy().to_native()
If we call `df.lazy`, we get a `narwhals.LazyFrame` backed by a Polars
LazyFrame.

Note that then, pandas and pyarrow dataframe stay eager, but Polars DataFrame
becomes a Polars LazyFrame:
>>> df.lazy() # doctest: +SKIP
┌─────────────────────────────┐
| Narwhals LazyFrame |
|-----------------------------|
|<LazyFrame at 0x7F52B9937230>|
└─────────────────────────────┘

>>> agnostic_lazy(df_pd)
foo bar ham
0 1 6.0 a
1 2 7.0 b
2 3 8.0 c
>>> agnostic_lazy(df_pl)
<LazyFrame ...>
>>> agnostic_lazy(df_pa)
pyarrow.Table
foo: int64
bar: double
ham: string
----
foo: [[1,2,3]]
bar: [[6,7,8]]
ham: [["a","b","c"]]
We can also pass DuckDB as the backend, and then we'll get a
`narwhals.LazyFrame` backed by a `duckdb.DuckDBPyRelation`.

>>> df.lazy(backend=nw.Implementation.DUCKDB)
┌──────────────────┐
|Narwhals LazyFrame|
|------------------|
|┌───────┬───────┐ |
|│   a   │   b   │ |
|│ int64 │ int64 │ |
|├───────┼───────┤ |
|│     1 │     4 │ |
|│     2 │     6 │ |
|└───────┴───────┘ |
└──────────────────┘
"""
return self._lazyframe(self._compliant_frame.lazy(), level="lazy")
supported_lazy_backends = (
Implementation.DASK,
Implementation.DUCKDB,
Implementation.POLARS,
)
if backend is not None and backend not in supported_lazy_backends:
msg = (
"Not-supported backend."
f"\n\nExpected one of {supported_lazy_backends} or `None`, got {backend}"
)
raise ValueError(msg)
return self._lazyframe(
self._compliant_frame.lazy(backend=backend),
level="lazy",
)

def to_native(self: Self) -> DataFrameT:
"""Convert Narwhals DataFrame to native one.
Expand Down
18 changes: 14 additions & 4 deletions narwhals/stable/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,15 +167,25 @@ def __getitem__(self: Self, item: tuple[slice, slice]) -> Self: ...
def __getitem__(self: Self, item: Any) -> Any:
return super().__getitem__(item)

def lazy(self: Self) -> LazyFrame[Any]:
"""Lazify the DataFrame (if possible).
def lazy(self: Self, *, backend: Implementation | None = None) -> LazyFrame[Any]:
"""Restrict available API methods to lazy-only ones.

If a library does not support lazy execution, then this is a no-op.
If `backend` is specified, then a conversion between different backends
might be triggered.
If a library does not support lazy execution and `backend` is not specified,
then this will only restrict the API to lazy-only operations. This is useful
if you want to ensure that you write dataframe-agnostic code which all has
the possibility of running entirely lazily.

Arguments:
backend: The (lazy) implementation to convert to. If not specified, and the
given library does not support lazy execution, then this will restrict
the API to lazy-only operations.

Returns:
A new LazyFrame.
"""
return super().lazy() # type: ignore[return-value]
return super().lazy(backend=backend) # type: ignore[return-value]

# Not sure what mypy is complaining about, probably some fancy
# thing that I need to understand category theory for
Expand Down
36 changes: 34 additions & 2 deletions tests/frame/lazy_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,49 @@

from typing import TYPE_CHECKING

import pytest

import narwhals as nw
import narwhals.stable.v1 as nw_v1
from narwhals.utils import Implementation

if TYPE_CHECKING:
from tests.utils import ConstructorEager


data = {"a": [1, 2, 3]}


def test_lazy(constructor_eager: ConstructorEager) -> None:
df = nw.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)
df = nw.from_native(constructor_eager(data), eager_only=True)
result = df.lazy()
assert isinstance(result, nw.LazyFrame)
df = nw_v1.from_native(constructor_eager({"a": [1, 2, 3]}), eager_only=True)
df = nw_v1.from_native(constructor_eager(data), eager_only=True)
result = df.lazy()
assert isinstance(result, nw_v1.LazyFrame)


@pytest.mark.parametrize(
    "backend", [Implementation.POLARS, Implementation.DUCKDB, Implementation.DASK]
)
def test_lazy_backend(
    request: pytest.FixtureRequest,
    constructor_eager: ConstructorEager,
    backend: Implementation,
) -> None:
    """`lazy(backend=...)` yields a LazyFrame whose implementation is the requested one."""
    # Backends that need an optional package; the test is skipped when it is missing.
    optional_modules = {
        Implementation.DASK: "dask",
        Implementation.DUCKDB: "duckdb",
    }

    # modin-constructed frames are marked as expected failures for every backend.
    if "modin" in str(constructor_eager):
        request.applymarker(pytest.mark.xfail)

    required_module = optional_modules.get(backend)
    if required_module is not None:
        pytest.importorskip(required_module)

    eager_frame = nw.from_native(constructor_eager(data), eager_only=True)
    lazy_frame = eager_frame.lazy(backend=backend)

    assert isinstance(lazy_frame, nw.LazyFrame)
    assert lazy_frame.implementation == backend


def test_lazy_backend_invalid(constructor_eager: ConstructorEager) -> None:
    """`lazy` raises `ValueError` when given `Implementation.PANDAS` as the backend."""
    eager_frame = nw.from_native(constructor_eager(data), eager_only=True)
    unsupported_backend = Implementation.PANDAS
    with pytest.raises(ValueError, match="Not-supported backend"):
        eager_frame.lazy(backend=unsupported_backend)
Loading