Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api-reference/dataframe.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
- estimated_size
- explode
- filter
- from_arrow
- from_dict
- gather_every
- get_column
Expand Down
59 changes: 58 additions & 1 deletion narwhals/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
issue_performance_warning,
supports_arrow_c_stream,
)
from narwhals.dependencies import get_polars, is_numpy_array
from narwhals.dependencies import get_polars, is_numpy_array, is_pyarrow_table
from narwhals.exceptions import InvalidIntoExprError, InvalidOperationError
from narwhals.functions import _from_dict_no_backend
from narwhals.schema import Schema
Expand All @@ -58,6 +58,7 @@

from narwhals._compliant import CompliantDataFrame, CompliantLazyFrame
from narwhals._compliant.typing import CompliantExprAny, EagerNamespaceAny
from narwhals._translate import IntoArrowTable
from narwhals.dtypes import DType
from narwhals.group_by import GroupBy, LazyGroupBy
from narwhals.typing import (
Expand Down Expand Up @@ -458,6 +459,62 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) ->
msg = f"Expected an object which implements `__narwhals_dataframe__`, got: {type(df)}"
raise AssertionError(msg)

@classmethod
def from_arrow(
    cls, native_frame: IntoArrowTable, *, backend: ModuleType | Implementation | str
) -> DataFrame[Any]:
    """Construct a DataFrame from an object which supports the PyCapsule Interface.

    Arguments:
        native_frame: Object which implements `__arrow_c_stream__`. A
            `pyarrow.Table` is accepted as well.
        backend: Specifies which eager backend to instantiate.

            `backend` can be specified in various ways:

            - As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`,
                `POLARS`, `MODIN` or `CUDF`.
            - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`.
            - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`.

    Returns:
        A new DataFrame.

    Raises:
        TypeError: If `native_frame` neither implements `__arrow_c_stream__`
            nor is a `pyarrow.Table`.
        ValueError: If `backend` resolves to an implementation that is not
            allowed for eager construction.

    Examples:
        >>> import pandas as pd
        >>> import polars as pl
        >>> import narwhals as nw
        >>>
        >>> df_native = pd.DataFrame({"a": [1, 2], "b": [4.2, 5.1]})
        >>> nw.DataFrame.from_arrow(df_native, backend="polars")
        ┌──────────────────┐
        |Narwhals DataFrame|
        |------------------|
        |  shape: (2, 2)   |
        |  ┌─────┬─────┐   |
        |  │ a   ┆ b   │   |
        |  │ --- ┆ --- │   |
        |  │ i64 ┆ f64 │   |
        |  ╞═════╪═════╡   |
        |  │ 1   ┆ 4.2 │   |
        |  │ 2   ┆ 5.1 │   |
        |  └─────┴─────┘   |
        └──────────────────┘
    """
    # Validate the input before resolving the backend, so an unsupported object
    # is reported as a `TypeError` regardless of which backend was requested.
    if not (supports_arrow_c_stream(native_frame) or is_pyarrow_table(native_frame)):
        msg = f"Given object of type {type(native_frame)} does not support PyCapsule interface"
        raise TypeError(msg)
    implementation = Implementation.from_backend(backend)
    if is_eager_allowed(implementation):
        ns = cls._version.namespace.from_backend(implementation).compliant
        compliant = ns._dataframe.from_arrow(native_frame, context=ns)
        return cls(compliant, level="full")
    # Anything that reaches this point is not an eager backend: point the user
    # at building eagerly first and converting with `.lazy` afterwards.
    msg = (
        f"{implementation} support in Narwhals is lazy-only, but `DataFrame.from_arrow` is an eager-only function.\n\n"
        "Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n"
        f"    nw.DataFrame.from_arrow(df, backend='pyarrow').lazy('{implementation}')"
    )
    raise ValueError(msg)

@classmethod
def from_dict(
cls,
Expand Down
7 changes: 7 additions & 0 deletions narwhals/stable/v1/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,13 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) ->
# We need to override any method which don't return Self so that type
# annotations are correct.

@classmethod
def from_arrow(
    cls, native_frame: IntoArrowTable, *, backend: ModuleType | Implementation | str
) -> DataFrame[Any]:
    # Pure delegation to the parent constructor; the `cast` only re-labels the
    # result for type checkers so the annotation refers to this `DataFrame`.
    return cast(
        "DataFrame[Any]", super().from_arrow(native_frame, backend=backend)
    )

@classmethod
def from_dict(
cls,
Expand Down
84 changes: 84 additions & 0 deletions tests/frame/from_arrow_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from __future__ import annotations

import sys
from typing import TYPE_CHECKING, Any

import pytest

pytest.importorskip("pyarrow")
import pyarrow as pa

import narwhals as nw
from narwhals._utils import Implementation
from tests.utils import PYARROW_VERSION, assert_equal_data

if TYPE_CHECKING:
from narwhals._namespace import EagerAllowed


@pytest.fixture
def data() -> dict[str, Any]:
    """Column-oriented sample data: an integer column and a string column with a trailing null."""
    integers = [1, 2, 3]
    strings = ["four", "five", None]
    return {"ab": integers, "ba": strings}


@pytest.fixture
def table(data: dict[str, Any]) -> pa.Table:
    """Build a `pyarrow.Table` from the `data` fixture."""
    arrow_table: pa.Table = pa.table(data)
    return arrow_table


def is_native(native: Any, backend: EagerAllowed) -> bool:
    """Report whether `native` is the dataframe type that belongs to `backend`.

    Only the pyarrow, polars and pandas backends are recognised; any other
    value raises `TypeError`. polars and pandas are imported lazily, only when
    their branch is taken.
    """
    if backend in {Implementation.PYARROW, "pyarrow"}:
        expected_type = pa.Table
    elif backend in {Implementation.POLARS, "polars"}:
        import polars as pl

        expected_type = pl.DataFrame
    elif backend in {Implementation.PANDAS, "pandas"}:
        import pandas as pd

        expected_type = pd.DataFrame
    else:  # pragma: no cover
        msg = f"Unexpected backend {backend!r}"
        raise TypeError(msg)
    return isinstance(native, expected_type)


def test_dataframe_from_arrow_table(
    eager_backend: EagerAllowed, table: pa.Table, data: dict[str, Any]
) -> None:
    """A plain `pyarrow.Table` is converted to the requested eager backend."""
    # NOTE: PyCapsule support requires `pyarrow>=14`, but this path should work in all cases
    frame = nw.DataFrame.from_arrow(table, backend=eager_backend)
    assert_equal_data(frame, data)
    native = frame.to_native()
    assert is_native(native, eager_backend)


@pytest.mark.xfail(PYARROW_VERSION < (14,), reason="too old")
def test_dataframe_from_arrow_pycapsule(
    eager_backend: EagerAllowed, table: pa.Table, data: dict[str, Any]
) -> None:
    """A Narwhals-wrapped table is converted to the requested eager backend.

    Expected to fail on `pyarrow<14` (see the `xfail` marker). Converting the
    raw `pyarrow.Table` is covered by `test_dataframe_from_arrow_table`, so it
    is not repeated here.
    """
    # The input under test is the Narwhals wrapper, not the raw table; per its
    # name it is expected to expose `__arrow_c_stream__`.
    supports_arrow_c_stream = nw.from_native(table)
    result = nw.DataFrame.from_arrow(supports_arrow_c_stream, backend=eager_backend)
    assert_equal_data(result, data)
    assert is_native(result.to_native(), eager_backend)


def test_dataframe_from_arrow_to_polars_no_pandas(
    monkeypatch: pytest.MonkeyPatch, table: pa.Table, data: dict[str, Any]
) -> None:
    """Converting to polars must not cause `pandas` to be imported."""
    pytest.importorskip("polars")
    # Drop any already-imported pandas so the final check detects a fresh import.
    monkeypatch.delitem(sys.modules, "pandas", raising=False)
    # Old pyarrow gets the raw table; newer versions get the Narwhals wrapper.
    source = table if PYARROW_VERSION < (14,) else nw.from_native(table)
    frame = nw.DataFrame.from_arrow(source, backend="polars")
    assert is_native(frame.to_native(), "polars")
    assert_equal_data(frame, data)
    assert "pandas" not in sys.modules


def test_dataframe_from_arrow_invalid(table: pa.Table, data: dict[str, Any]) -> None:
    """Unsupported input raises `TypeError`; the sqlframe backend raises `ValueError`."""
    # The `data` fixture is a plain dict, which `from_arrow` is expected to reject.
    rejects_non_arrow = pytest.raises(TypeError, match="PyCapsule")
    with rejects_non_arrow:
        nw.DataFrame.from_arrow(data, backend=pa)  # type: ignore[arg-type]

    # The second check only runs when sqlframe is installed; otherwise the
    # remainder of the test is skipped.
    pytest.importorskip("sqlframe")
    rejects_lazy_backend = pytest.raises(ValueError, match="lazy")
    with rejects_lazy_backend:
        nw.DataFrame.from_arrow(table, backend="sqlframe")
39 changes: 38 additions & 1 deletion tests/v1_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import narwhals as nw
import narwhals.stable.v1 as nw_v1
from narwhals._utils import Implementation
from narwhals.exceptions import InvalidOperationError
from narwhals.stable.v1.dependencies import (
is_cudf_dataframe,
Expand All @@ -30,7 +31,13 @@
is_pyarrow_table,
)
from narwhals.utils import Version
from tests.utils import PANDAS_VERSION, Constructor, ConstructorEager, assert_equal_data
from tests.utils import (
PANDAS_VERSION,
PYARROW_VERSION,
Constructor,
ConstructorEager,
assert_equal_data,
)

if TYPE_CHECKING:
from typing_extensions import assert_type
Expand Down Expand Up @@ -1000,3 +1007,33 @@ def test_dataframe_from_dict(eager_backend: EagerAllowed) -> None:
assert result.collect_schema() == schema
assert result._version is Version.V1
assert isinstance(result, nw_v1.DataFrame)


def test_dataframe_from_arrow(eager_backend: EagerAllowed) -> None:
    """`nw_v1.DataFrame.from_arrow` returns a V1 DataFrame backed by the requested backend.

    The first half converts a raw `pyarrow.Table`. The second half, run only
    on `pyarrow>=14`, feeds the resulting Narwhals DataFrame back in.
    """
    pytest.importorskip("pyarrow")
    import pyarrow as pa

    data: dict[str, Any] = {"ab": [1, 2, 3], "ba": ["four", "five", None]}
    # Only the pyarrow backend should hand back a `pa.Table` as its native object.
    expects_arrow_native = eager_backend in {Implementation.PYARROW, "pyarrow"}

    supports_arrow_c_stream = nw_v1.DataFrame.from_arrow(
        pa.table(data), backend=eager_backend
    )
    assert_equal_data(supports_arrow_c_stream, data)
    assert isinstance(supports_arrow_c_stream, nw_v1.DataFrame)
    assert supports_arrow_c_stream._version is Version.V1
    first_native = supports_arrow_c_stream.to_native()
    assert isinstance(first_native, pa.Table) is expects_arrow_native

    if PYARROW_VERSION < (14,):  # pragma: no cover
        return

    round_tripped = nw_v1.DataFrame.from_arrow(
        supports_arrow_c_stream, backend=eager_backend
    )
    assert_equal_data(round_tripped, data)
    assert round_tripped._version is Version.V1
    assert isinstance(round_tripped, nw_v1.DataFrame)
    second_native = round_tripped.to_native()
    assert isinstance(second_native, pa.Table) is expects_arrow_native
Loading