From 2b40eded6d8ddd8af3aa9ba3cbbf44aaca6dda7c Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 25 Jul 2025 17:20:52 +0000 Subject: [PATCH 01/13] feat: Add `DataFrame.from_dict` Towards #2116 --- narwhals/dataframe.py | 31 ++++++++++++++++++++++++++++++- narwhals/stable/v1/__init__.py | 2 ++ tests/v1_test.py | 3 ++- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 5ac80cbcd8..acbef3ee6b 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Any, Callable, + ClassVar, Generic, Literal, NoReturn, @@ -28,6 +29,7 @@ generate_repr, is_compliant_dataframe, is_compliant_lazyframe, + is_eager_allowed, is_index_selector, is_list_of, is_sequence_like, @@ -38,12 +40,13 @@ ) from narwhals.dependencies import get_polars, is_numpy_array from narwhals.exceptions import InvalidIntoExprError, InvalidOperationError +from narwhals.functions import _from_dict_no_backend from narwhals.schema import Schema from narwhals.series import Series from narwhals.translate import to_native if TYPE_CHECKING: - from collections.abc import Iterable, Iterator, Sequence + from collections.abc import Iterable, Iterator, Mapping, Sequence from io import BytesIO from pathlib import Path from types import ModuleType @@ -55,6 +58,7 @@ from narwhals._compliant import CompliantDataFrame, CompliantLazyFrame from narwhals._compliant.typing import CompliantExprAny, EagerNamespaceAny + from narwhals.dtypes import DType from narwhals.group_by import GroupBy, LazyGroupBy from narwhals.typing import ( AsofJoinStrategy, @@ -409,6 +413,8 @@ class DataFrame(BaseFrame[DataFrameT]): ``` """ + _version: ClassVar[Version] = Version.MAIN + def _extract_compliant(self, arg: Any) -> Any: from narwhals.expr import Expr from narwhals.series import Series @@ -452,6 +458,29 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> msg = 
f"Expected an object which implements `__narwhals_dataframe__`, got: {type(df)}" raise AssertionError(msg) + @classmethod + def from_dict( + cls, + data: Mapping[str, Any], + schema: Mapping[str, DType] | Schema | None = None, + *, + backend: ModuleType | Implementation | str | None = None, + ) -> Self: + if backend is None: + data, backend = _from_dict_no_backend(data) + implementation = Implementation.from_backend(backend) + if is_eager_allowed(implementation): + ns = cls._version.namespace.from_backend(implementation).compliant + compliant = ns._dataframe.from_dict(data, schema=schema, context=ns) + return cls(compliant, level="full") + # NOTE: (#2786) needs resolving for extensions + msg = ( + f"{implementation} support in Narwhals is lazy-only, but `DataFrame.from_dict` is an eager-only function.\n\n" + "Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n" + f" nw.DataFrame.from_dict({{'a': [1, 2]}}, backend='pyarrow').lazy('{implementation}')" + ) + raise ValueError(msg) + @property def implementation(self) -> Implementation: """Return implementation of native frame. 
diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 76f76fc319..1b0b2ba2a9 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -96,6 +96,8 @@ class DataFrame(NwDataFrame[IntoDataFrameT]): + _version = Version.V1 + @inherit_doc(NwDataFrame) def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> None: assert df._version is Version.V1 # noqa: S101 diff --git a/tests/v1_test.py b/tests/v1_test.py index 130c974b27..e2336ca500 100644 --- a/tests/v1_test.py +++ b/tests/v1_test.py @@ -570,7 +570,8 @@ def test_dataframe_recursive_v1() -> None: pl_frame = pl.DataFrame({"a": [1, 2, 3]}) nw_frame = nw_v1.from_native(pl_frame) - with pytest.raises(AttributeError): + # NOTE: (#2629) combined with passing in `nw_v1.DataFrame` (w/ a `_version`) into itself changes the error + with pytest.raises(AssertionError): nw_v1.DataFrame(nw_frame, level="full") nw_frame_early_return = nw_v1.from_native(nw_frame) From 5562b001ccf585f65e7cf282443d13efda575f45 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 25 Jul 2025 17:45:01 +0000 Subject: [PATCH 02/13] docs: Adapt `from_dict` docstring --- narwhals/dataframe.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index acbef3ee6b..ce0404be61 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -466,6 +466,45 @@ def from_dict( *, backend: ModuleType | Implementation | str | None = None, ) -> Self: + """Instantiate DataFrame from dictionary. + + Indexes (if present, for pandas-like backends) are aligned following + the [left-hand-rule](../concepts/pandas_index.md/). + + Notes: + For pandas-like dataframes, conversion to schema is applied after dataframe + creation. + + Arguments: + data: Dictionary to create DataFrame from. + schema: The DataFrame schema as Schema or dict of {name: type}. 
If not + specified, the schema will be inferred by the native library. + backend: specifies which eager backend to instantiate to. Only + necessary if inputs are not Narwhals Series. + + `backend` can be specified in various ways + + - As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`, + `POLARS`, `MODIN` or `CUDF`. + - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`. + - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`. + + Returns: + A new DataFrame. + + Examples: + >>> import pandas as pd + >>> import narwhals as nw + >>> data = {"c": [5, 2], "d": [1, 4]} + >>> nw.DataFrame.from_dict(data, backend="pandas") + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | c d | + | 0 5 1 | + | 1 2 4 | + └──────────────────┘ + """ if backend is None: data, backend = _from_dict_no_backend(data) implementation = Implementation.from_backend(backend) From 24997d2f5f313d38f6e9c040e58c1c7555eb9c39 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 25 Jul 2025 18:07:59 +0000 Subject: [PATCH 03/13] test: Adapt `from_dict` tests --- tests/conftest.py | 21 +++++++- tests/frame/from_dict_test.py | 94 +++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 tests/frame/from_dict_test.py diff --git a/tests/conftest.py b/tests/conftest.py index b9094b13d5..a55fa01270 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,11 +4,12 @@ import uuid from copy import deepcopy from functools import lru_cache +from importlib.util import find_spec from typing import TYPE_CHECKING, Any, Callable, cast import pytest -from narwhals._utils import generate_temporary_column_name +from narwhals._utils import Implementation, generate_temporary_column_name from tests.utils import PANDAS_VERSION if TYPE_CHECKING: @@ -23,6 +24,7 @@ from pyspark.sql import DataFrame as PySparkDataFrame from typing_extensions import TypeAlias + from 
narwhals._namespace import EagerAllowed from narwhals._spark_like.dataframe import SQLFrameDataFrame from narwhals.typing import NativeFrame, NativeLazyFrame from tests.utils import Constructor, ConstructorEager, ConstructorLazy @@ -308,3 +310,20 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None: ) elif "constructor" in metafunc.fixturenames: metafunc.parametrize("constructor", constructors, ids=constructors_ids) + + +TEST_EAGER_BACKENDS: list[EagerAllowed] = [] +TEST_EAGER_BACKENDS.extend( + (Implementation.POLARS, "polars") if find_spec("polars") is not None else () +) +TEST_EAGER_BACKENDS.extend( + (Implementation.PANDAS, "pandas") if find_spec("pandas") is not None else () +) +TEST_EAGER_BACKENDS.extend( + (Implementation.PYARROW, "pyarrow") if find_spec("pyarrow") is not None else () +) + + +@pytest.fixture(params=TEST_EAGER_BACKENDS) +def eager_backend(request: pytest.FixtureRequest) -> EagerAllowed: + return request.param # type: ignore[no-any-return] diff --git a/tests/frame/from_dict_test.py b/tests/frame/from_dict_test.py new file mode 100644 index 0000000000..0b952077df --- /dev/null +++ b/tests/frame/from_dict_test.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pytest + +import narwhals as nw +from narwhals._utils import Implementation +from tests.utils import Constructor, assert_equal_data + +if TYPE_CHECKING: + from narwhals._namespace import EagerAllowed + + +def test_from_dict(eager_backend: EagerAllowed) -> None: + result = nw.DataFrame.from_dict({"c": [1, 2], "d": [5, 6]}, backend=eager_backend) + expected = {"c": [1, 2], "d": [5, 6]} + assert_equal_data(result, expected) + assert isinstance(result, nw.DataFrame) + + +def test_from_dict_schema(eager_backend: EagerAllowed) -> None: + schema = {"c": nw.Int16(), "d": nw.Float32()} + result = nw.DataFrame.from_dict( + {"c": [1, 2], "d": [5, 6]}, backend=eager_backend, schema=schema + ) + assert result.collect_schema() == schema + + 
+@pytest.mark.parametrize("backend", [Implementation.POLARS, "polars"]) +def test_from_dict_without_backend( + constructor: Constructor, backend: EagerAllowed +) -> None: + pytest.importorskip("polars") + + df = ( + nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) + .lazy() + .collect(backend=backend) + ) + result = nw.DataFrame.from_dict({"c": df["a"], "d": df["b"]}) + assert_equal_data(result, {"c": [1, 2, 3], "d": [4, 5, 6]}) + + +def test_from_dict_without_backend_invalid(constructor: Constructor) -> None: + df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy().collect() + with pytest.raises(TypeError, match="backend"): + nw.DataFrame.from_dict({"c": nw.to_native(df["a"]), "d": nw.to_native(df["b"])}) + + +def test_from_dict_with_backend_invalid() -> None: + pytest.importorskip("duckdb") + with pytest.raises(ValueError, match="lazy-only"): + nw.DataFrame.from_dict({"c": [1, 2], "d": [5, 6]}, backend="duckdb") + + +@pytest.mark.parametrize("backend", [Implementation.POLARS, "polars"]) +def test_from_dict_one_native_one_narwhals( + constructor: Constructor, backend: EagerAllowed +) -> None: + pytest.importorskip("polars") + + df = ( + nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})) + .lazy() + .collect(backend=backend) + ) + result = nw.DataFrame.from_dict({"c": nw.to_native(df["a"]), "d": df["b"]}) + expected = {"c": [1, 2, 3], "d": [4, 5, 6]} + assert_equal_data(result, expected) + + +def test_from_dict_empty(eager_backend: EagerAllowed) -> None: + result = nw.DataFrame.from_dict({}, backend=eager_backend) + assert result.shape == (0, 0) + + +def test_from_dict_empty_with_schema(eager_backend: EagerAllowed) -> None: + schema = nw.Schema({"a": nw.String(), "b": nw.Int8()}) + result = nw.DataFrame.from_dict({}, schema, backend=eager_backend) + assert result.schema == schema + + +def test_alignment() -> None: + pytest.importorskip("pandas") + import pandas as pd + + # 
https://github.com/narwhals-dev/narwhals/issues/1474 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = nw.DataFrame.from_dict( + {"a": df["a"], "b": df["a"].sort_values(ascending=False)}, backend=pd + ).to_native() + expected = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]}) + pd.testing.assert_frame_equal(result, expected) From b523edcae9908c9829d4f5461ef4f4daa4add51a Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 25 Jul 2025 18:11:30 +0000 Subject: [PATCH 04/13] chore(typing): Ignore `var-annotated` for now Will fix after running ci --- tests/frame/from_dict_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/frame/from_dict_test.py b/tests/frame/from_dict_test.py index 0b952077df..6a2a4d64b8 100644 --- a/tests/frame/from_dict_test.py +++ b/tests/frame/from_dict_test.py @@ -13,7 +13,7 @@ def test_from_dict(eager_backend: EagerAllowed) -> None: - result = nw.DataFrame.from_dict({"c": [1, 2], "d": [5, 6]}, backend=eager_backend) + result = nw.DataFrame.from_dict({"c": [1, 2], "d": [5, 6]}, backend=eager_backend) # type: ignore[var-annotated] expected = {"c": [1, 2], "d": [5, 6]} assert_equal_data(result, expected) assert isinstance(result, nw.DataFrame) @@ -21,7 +21,7 @@ def test_from_dict(eager_backend: EagerAllowed) -> None: def test_from_dict_schema(eager_backend: EagerAllowed) -> None: schema = {"c": nw.Int16(), "d": nw.Float32()} - result = nw.DataFrame.from_dict( + result = nw.DataFrame.from_dict( # type: ignore[var-annotated] {"c": [1, 2], "d": [5, 6]}, backend=eager_backend, schema=schema ) assert result.collect_schema() == schema @@ -38,7 +38,7 @@ def test_from_dict_without_backend( .lazy() .collect(backend=backend) ) - result = nw.DataFrame.from_dict({"c": df["a"], "d": df["b"]}) + result = nw.DataFrame.from_dict({"c": df["a"], "d": df["b"]}) # type: ignore[var-annotated] assert_equal_data(result, {"c": [1, 2, 3], "d": [4, 5, 6]}) @@ -65,19 
+65,19 @@ def test_from_dict_one_native_one_narwhals( .lazy() .collect(backend=backend) ) - result = nw.DataFrame.from_dict({"c": nw.to_native(df["a"]), "d": df["b"]}) + result = nw.DataFrame.from_dict({"c": nw.to_native(df["a"]), "d": df["b"]}) # type: ignore[var-annotated] expected = {"c": [1, 2, 3], "d": [4, 5, 6]} assert_equal_data(result, expected) def test_from_dict_empty(eager_backend: EagerAllowed) -> None: - result = nw.DataFrame.from_dict({}, backend=eager_backend) + result = nw.DataFrame.from_dict({}, backend=eager_backend) # type: ignore[var-annotated] assert result.shape == (0, 0) def test_from_dict_empty_with_schema(eager_backend: EagerAllowed) -> None: schema = nw.Schema({"a": nw.String(), "b": nw.Int8()}) - result = nw.DataFrame.from_dict({}, schema, backend=eager_backend) + result = nw.DataFrame.from_dict({}, schema, backend=eager_backend) # type: ignore[var-annotated] assert result.schema == schema @@ -87,7 +87,7 @@ def test_alignment() -> None: # https://github.com/narwhals-dev/narwhals/issues/1474 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - result = nw.DataFrame.from_dict( + result = nw.DataFrame.from_dict( # type: ignore[var-annotated] {"a": df["a"], "b": df["a"].sort_values(ascending=False)}, backend=pd ).to_native() expected = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]}) From 695f9ac4850e56639aa96fe5ae3a67234621e31f Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 25 Jul 2025 18:23:57 +0000 Subject: [PATCH 05/13] fix(typing): Resolve `[var-annotated]` by not using `Self` --- narwhals/dataframe.py | 2 +- narwhals/stable/v1/__init__.py | 11 +++++++++++ tests/frame/from_dict_test.py | 14 +++++++------- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index ce0404be61..56aaa3ccc6 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -465,7 +465,7 @@ def from_dict( schema: Mapping[str, DType] | Schema | None = 
None, *, backend: ModuleType | Implementation | str | None = None, - ) -> Self: + ) -> DataFrame[Any]: """Instantiate DataFrame from dictionary. Indexes (if present, for pandas-like backends) are aligned following diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 1b0b2ba2a9..d6b9410ce7 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -106,6 +106,17 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> # We need to override any method which don't return Self so that type # annotations are correct. + @classmethod + def from_dict( + cls, + data: Mapping[str, Any], + schema: Mapping[str, DType] | Schema | None = None, + *, + backend: ModuleType | Implementation | str | None = None, + ) -> DataFrame[Any]: + result = super().from_dict(data, schema, backend=backend) + return cast("DataFrame[Any]", result) + @property def _series(self) -> type[Series[Any]]: return cast("type[Series[Any]]", Series) diff --git a/tests/frame/from_dict_test.py b/tests/frame/from_dict_test.py index 6a2a4d64b8..0b952077df 100644 --- a/tests/frame/from_dict_test.py +++ b/tests/frame/from_dict_test.py @@ -13,7 +13,7 @@ def test_from_dict(eager_backend: EagerAllowed) -> None: - result = nw.DataFrame.from_dict({"c": [1, 2], "d": [5, 6]}, backend=eager_backend) # type: ignore[var-annotated] + result = nw.DataFrame.from_dict({"c": [1, 2], "d": [5, 6]}, backend=eager_backend) expected = {"c": [1, 2], "d": [5, 6]} assert_equal_data(result, expected) assert isinstance(result, nw.DataFrame) @@ -21,7 +21,7 @@ def test_from_dict(eager_backend: EagerAllowed) -> None: def test_from_dict_schema(eager_backend: EagerAllowed) -> None: schema = {"c": nw.Int16(), "d": nw.Float32()} - result = nw.DataFrame.from_dict( # type: ignore[var-annotated] + result = nw.DataFrame.from_dict( {"c": [1, 2], "d": [5, 6]}, backend=eager_backend, schema=schema ) assert result.collect_schema() == schema @@ -38,7 +38,7 @@ def 
test_from_dict_without_backend( .lazy() .collect(backend=backend) ) - result = nw.DataFrame.from_dict({"c": df["a"], "d": df["b"]}) # type: ignore[var-annotated] + result = nw.DataFrame.from_dict({"c": df["a"], "d": df["b"]}) assert_equal_data(result, {"c": [1, 2, 3], "d": [4, 5, 6]}) @@ -65,19 +65,19 @@ def test_from_dict_one_native_one_narwhals( .lazy() .collect(backend=backend) ) - result = nw.DataFrame.from_dict({"c": nw.to_native(df["a"]), "d": df["b"]}) # type: ignore[var-annotated] + result = nw.DataFrame.from_dict({"c": nw.to_native(df["a"]), "d": df["b"]}) expected = {"c": [1, 2, 3], "d": [4, 5, 6]} assert_equal_data(result, expected) def test_from_dict_empty(eager_backend: EagerAllowed) -> None: - result = nw.DataFrame.from_dict({}, backend=eager_backend) # type: ignore[var-annotated] + result = nw.DataFrame.from_dict({}, backend=eager_backend) assert result.shape == (0, 0) def test_from_dict_empty_with_schema(eager_backend: EagerAllowed) -> None: schema = nw.Schema({"a": nw.String(), "b": nw.Int8()}) - result = nw.DataFrame.from_dict({}, schema, backend=eager_backend) # type: ignore[var-annotated] + result = nw.DataFrame.from_dict({}, schema, backend=eager_backend) assert result.schema == schema @@ -87,7 +87,7 @@ def test_alignment() -> None: # https://github.com/narwhals-dev/narwhals/issues/1474 df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) - result = nw.DataFrame.from_dict( # type: ignore[var-annotated] + result = nw.DataFrame.from_dict( {"a": df["a"], "b": df["a"].sort_values(ascending=False)}, backend=pd ).to_native() expected = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]}) From bbefe38cc78939eb24b9b316847eb31ce3afa6c3 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Fri, 25 Jul 2025 18:36:31 +0000 Subject: [PATCH 06/13] test: Cover `v1` https://github.com/narwhals-dev/narwhals/pull/2885#discussion_r2231774149 --- tests/v1_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git 
a/tests/v1_test.py b/tests/v1_test.py index e2336ca500..ef4fee7308 100644 --- a/tests/v1_test.py +++ b/tests/v1_test.py @@ -35,6 +35,7 @@ if TYPE_CHECKING: from typing_extensions import assert_type + from narwhals._namespace import EagerAllowed from narwhals.typing import IntoDataFrameT from tests.utils import Constructor, ConstructorEager @@ -975,3 +976,13 @@ def test_dask_order_dependent_ops() -> None: "i": [True, True, True], } assert_equal_data(result, expected) + + +def test_dataframe_from_dict(eager_backend: EagerAllowed) -> None: + schema = {"c": nw_v1.Int16(), "d": nw_v1.Float32()} + result = nw_v1.DataFrame.from_dict( + {"c": [1, 2], "d": [5, 6]}, backend=eager_backend, schema=schema + ) + assert result.collect_schema() == schema + assert result._version is Version.V1 + assert isinstance(result, nw_v1.DataFrame) From 1b5272bd9bdcc8a37b8c1db45ae568f33b118bb8 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 26 Jul 2025 10:22:19 +0000 Subject: [PATCH 07/13] feat: Add `DataFrame.from_arrow` Part of #2116 --- narwhals/dataframe.py | 22 +++++++++++++++++++++- narwhals/stable/v1/__init__.py | 7 +++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 56aaa3ccc6..94d134c62a 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -38,7 +38,7 @@ issue_performance_warning, supports_arrow_c_stream, ) -from narwhals.dependencies import get_polars, is_numpy_array +from narwhals.dependencies import get_polars, is_numpy_array, is_pyarrow_table from narwhals.exceptions import InvalidIntoExprError, InvalidOperationError from narwhals.functions import _from_dict_no_backend from narwhals.schema import Schema @@ -58,6 +58,7 @@ from narwhals._compliant import CompliantDataFrame, CompliantLazyFrame from narwhals._compliant.typing import CompliantExprAny, EagerNamespaceAny + from narwhals._translate import IntoArrowTable from narwhals.dtypes import 
DType from narwhals.group_by import GroupBy, LazyGroupBy from narwhals.typing import ( @@ -458,6 +459,25 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> msg = f"Expected an object which implements `__narwhals_dataframe__`, got: {type(df)}" raise AssertionError(msg) + @classmethod + def from_arrow( + cls, native_frame: IntoArrowTable, *, backend: ModuleType | Implementation | str + ) -> DataFrame[Any]: + if not (supports_arrow_c_stream(native_frame) or is_pyarrow_table(native_frame)): + msg = f"Given object of type {type(native_frame)} does not support PyCapsule interface" + raise TypeError(msg) + implementation = Implementation.from_backend(backend) + if is_eager_allowed(implementation): + ns = cls._version.namespace.from_backend(implementation).compliant + compliant = ns._dataframe.from_arrow(native_frame, context=ns) + return cls(compliant, level="full") + msg = ( + f"{implementation} support in Narwhals is lazy-only, but `DataFrame.from_arrow` is an eager-only function.\n\n" + "Hint: you may want to use an eager backend and then call `.lazy`, e.g.:\n\n" + f" nw.DataFrame.from_arrow(df, backend='pyarrow').lazy('{implementation}')" + ) + raise ValueError(msg) + @classmethod def from_dict( cls, diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index d6b9410ce7..0790c8251f 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -106,6 +106,13 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> # We need to override any method which don't return Self so that type # annotations are correct. 
+ @classmethod + def from_arrow( + cls, native_frame: IntoArrowTable, *, backend: ModuleType | Implementation | str + ) -> DataFrame[Any]: + result = super().from_arrow(native_frame, backend=backend) + return cast("DataFrame[Any]", result) + @classmethod def from_dict( cls, From 4e1d35e56ae6d15e7cd62c6e4458e977210920ec Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 26 Jul 2025 12:04:06 +0000 Subject: [PATCH 08/13] test: Add main tests --- tests/frame/from_arrow_test.py | 81 ++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 tests/frame/from_arrow_test.py diff --git a/tests/frame/from_arrow_test.py b/tests/frame/from_arrow_test.py new file mode 100644 index 0000000000..2297491c24 --- /dev/null +++ b/tests/frame/from_arrow_test.py @@ -0,0 +1,81 @@ +from __future__ import annotations + +import sys +from typing import TYPE_CHECKING, Any + +import pytest + +pytest.importorskip("pyarrow") +import pyarrow as pa + +import narwhals as nw +from narwhals._utils import Implementation +from tests.utils import PYARROW_VERSION, assert_equal_data + +if TYPE_CHECKING: + from narwhals._namespace import EagerAllowed + + +@pytest.fixture +def data() -> dict[str, Any]: + return {"ab": [1, 2, 3], "ba": ["four", "five", None]} + + +@pytest.fixture +def table(data: dict[str, Any]) -> pa.Table: + return pa.table(data) + + +def is_native(native: Any, backend: EagerAllowed) -> bool: + if backend in {Implementation.PYARROW, "pyarrow"}: + return isinstance(native, pa.Table) + if backend in {Implementation.POLARS, "polars"}: + import polars as pl + + return isinstance(native, pl.DataFrame) + if backend in {Implementation.PANDAS, "pandas"}: + import pandas as pd + + return isinstance(native, pd.DataFrame) + msg = f"Unexpected backend {backend!r}" # pragma: no cover + raise TypeError(msg) # pragma: no cover + + +def test_from_arrow_table( + eager_backend: EagerAllowed, table: pa.Table, data: dict[str, 
Any] +) -> None: + # NOTE: PyCapsule support requires `pyarrow>=14`, but this path should work in all cases + result = nw.DataFrame.from_arrow(table, backend=eager_backend) + assert_equal_data(result, data) + assert is_native(result.to_native(), eager_backend) + + +@pytest.mark.xfail(PYARROW_VERSION < (14,), reason="too old") +def test_from_arrow_pycapsule( + eager_backend: EagerAllowed, table: pa.Table, data: dict[str, Any] +) -> None: + result = nw.DataFrame.from_arrow(table, backend=eager_backend) + supports_arrow_c_stream = nw.from_native(table) + result = nw.DataFrame.from_arrow(supports_arrow_c_stream, backend=eager_backend) + assert_equal_data(result, data) + assert is_native(result.to_native(), eager_backend) + + +def test_from_arrow_to_polars_no_pandas( + monkeypatch: pytest.MonkeyPatch, table: pa.Table, data: dict[str, Any] +) -> None: + pytest.importorskip("polars") + monkeypatch.delitem(sys.modules, "pandas", raising=False) + if PYARROW_VERSION < (14,): # pragma: no cover + result = nw.DataFrame.from_arrow(table, backend="polars") + else: + supports_arrow_c_stream = nw.from_native(table) + result = nw.DataFrame.from_arrow(supports_arrow_c_stream, backend="polars") + assert is_native(result.to_native(), "polars") + assert_equal_data(result, data) + assert "pandas" not in sys.modules + + +def test_from_arrow_invalid() -> None: + with pytest.raises(TypeError, match="PyCapsule"): + nw.DataFrame.from_arrow({"a": [1]}, backend=pa) # type: ignore[arg-type] From 0e4b416318ee93f30138ee98de9e74fdbd3fc502 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 26 Jul 2025 12:14:07 +0000 Subject: [PATCH 09/13] test: Add `v1` tests --- tests/v1_test.py | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/tests/v1_test.py b/tests/v1_test.py index ef4fee7308..79c344ddaf 100644 --- a/tests/v1_test.py +++ b/tests/v1_test.py @@ -11,6 +11,7 @@ import narwhals as nw import 
narwhals.stable.v1 as nw_v1 +from narwhals._utils import Implementation from narwhals.exceptions import InvalidOperationError from narwhals.stable.v1.dependencies import ( is_cudf_dataframe, @@ -30,7 +31,13 @@ is_pyarrow_table, ) from narwhals.utils import Version -from tests.utils import PANDAS_VERSION, Constructor, ConstructorEager, assert_equal_data +from tests.utils import ( + PANDAS_VERSION, + PYARROW_VERSION, + Constructor, + ConstructorEager, + assert_equal_data, +) if TYPE_CHECKING: from typing_extensions import assert_type @@ -978,6 +985,34 @@ def test_dask_order_dependent_ops() -> None: assert_equal_data(result, expected) +def test_dataframe_from_arrow(eager_backend: EagerAllowed) -> None: + pytest.importorskip("pyarrow") + import pyarrow as pa + + is_pyarrow = eager_backend in {Implementation.PYARROW, "pyarrow"} + data: dict[str, Any] = {"ab": [1, 2, 3], "ba": ["four", "five", None]} + table = pa.table(data) + supports_arrow_c_stream = nw_v1.DataFrame.from_arrow(table, backend=eager_backend) + assert_equal_data(supports_arrow_c_stream, data) + assert isinstance(supports_arrow_c_stream, nw_v1.DataFrame) + assert supports_arrow_c_stream._version is Version.V1 + if is_pyarrow: + assert isinstance(supports_arrow_c_stream.to_native(), pa.Table) + else: + assert not isinstance(supports_arrow_c_stream.to_native(), pa.Table) + if PYARROW_VERSION >= (14,): + result = nw_v1.DataFrame.from_arrow( + supports_arrow_c_stream, backend=eager_backend + ) + assert_equal_data(result, data) + assert result._version is Version.V1 + assert isinstance(result, nw_v1.DataFrame) + if is_pyarrow: + assert isinstance(result.to_native(), pa.Table) + else: + assert not isinstance(result.to_native(), pa.Table) + + def test_dataframe_from_dict(eager_backend: EagerAllowed) -> None: schema = {"c": nw_v1.Int16(), "d": nw_v1.Float32()} result = nw_v1.DataFrame.from_dict( From d861a55be253d588f92a320a752521d86bdfe193 Mon Sep 17 00:00:00 2001 From: dangotbanned 
<125183946+dangotbanned@users.noreply.github.com> Date: Sat, 26 Jul 2025 12:14:58 +0000 Subject: [PATCH 10/13] test: rename main tests --- tests/frame/from_arrow_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/frame/from_arrow_test.py b/tests/frame/from_arrow_test.py index 2297491c24..bb37d07e3f 100644 --- a/tests/frame/from_arrow_test.py +++ b/tests/frame/from_arrow_test.py @@ -41,7 +41,7 @@ def is_native(native: Any, backend: EagerAllowed) -> bool: raise TypeError(msg) # pragma: no cover -def test_from_arrow_table( +def test_dataframe_from_arrow_table( eager_backend: EagerAllowed, table: pa.Table, data: dict[str, Any] ) -> None: # NOTE: PyCapsule support requires `pyarrow>=14`, but this path should work in all cases @@ -51,7 +51,7 @@ def test_from_arrow_table( @pytest.mark.xfail(PYARROW_VERSION < (14,), reason="too old") -def test_from_arrow_pycapsule( +def test_dataframe_from_arrow_pycapsule( eager_backend: EagerAllowed, table: pa.Table, data: dict[str, Any] ) -> None: result = nw.DataFrame.from_arrow(table, backend=eager_backend) @@ -61,7 +61,7 @@ def test_from_arrow_pycapsule( assert is_native(result.to_native(), eager_backend) -def test_from_arrow_to_polars_no_pandas( +def test_dataframe_from_arrow_to_polars_no_pandas( monkeypatch: pytest.MonkeyPatch, table: pa.Table, data: dict[str, Any] ) -> None: pytest.importorskip("polars") @@ -76,6 +76,6 @@ def test_from_arrow_to_polars_no_pandas( assert "pandas" not in sys.modules -def test_from_arrow_invalid() -> None: +def test_dataframe_from_arrow_invalid() -> None: with pytest.raises(TypeError, match="PyCapsule"): nw.DataFrame.from_arrow({"a": [1]}, backend=pa) # type: ignore[arg-type] From 57aff0f406b550cb65a4c69a37390bbb0a82ab7e Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 26 Jul 2025 12:29:23 +0000 Subject: [PATCH 11/13] coverage https://github.com/narwhals-dev/narwhals/actions/runs/16539720011/job/46779226207 --- 
tests/frame/from_arrow_test.py | 7 +++++-- tests/v1_test.py | 1 + 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/frame/from_arrow_test.py b/tests/frame/from_arrow_test.py index bb37d07e3f..ea9f74590e 100644 --- a/tests/frame/from_arrow_test.py +++ b/tests/frame/from_arrow_test.py @@ -76,6 +76,9 @@ def test_dataframe_from_arrow_to_polars_no_pandas( assert "pandas" not in sys.modules -def test_dataframe_from_arrow_invalid() -> None: +def test_dataframe_from_arrow_invalid(table: pa.Table, data: dict[str, Any]) -> None: with pytest.raises(TypeError, match="PyCapsule"): - nw.DataFrame.from_arrow({"a": [1]}, backend=pa) # type: ignore[arg-type] + nw.DataFrame.from_arrow(data, backend=pa) # type: ignore[arg-type] + pytest.importorskip("sqlframe") + with pytest.raises(ValueError, match="lazy"): + nw.DataFrame.from_arrow(table, backend="sqlframe") diff --git a/tests/v1_test.py b/tests/v1_test.py index 79c344ddaf..22b996c45e 100644 --- a/tests/v1_test.py +++ b/tests/v1_test.py @@ -1011,6 +1011,7 @@ def test_dataframe_from_arrow(eager_backend: EagerAllowed) -> None: assert isinstance(result.to_native(), pa.Table) else: assert not isinstance(result.to_native(), pa.Table) + # pragma: no cover def test_dataframe_from_dict(eager_backend: EagerAllowed) -> None: From 25eeb0fa566ecb7e789494939f3f5673886c6a46 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 26 Jul 2025 12:37:00 +0000 Subject: [PATCH 12/13] cov pls https://github.com/narwhals-dev/narwhals/actions/runs/16539827751/job/46779462046?pr=2891 --- tests/v1_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/v1_test.py b/tests/v1_test.py index 22b996c45e..753c236e75 100644 --- a/tests/v1_test.py +++ b/tests/v1_test.py @@ -1000,7 +1000,9 @@ def test_dataframe_from_arrow(eager_backend: EagerAllowed) -> None: assert isinstance(supports_arrow_c_stream.to_native(), pa.Table) else: assert not 
isinstance(supports_arrow_c_stream.to_native(), pa.Table) - if PYARROW_VERSION >= (14,): + if PYARROW_VERSION < (14,): # pragma: no cover + ... + else: result = nw_v1.DataFrame.from_arrow( supports_arrow_c_stream, backend=eager_backend ) @@ -1011,7 +1013,6 @@ def test_dataframe_from_arrow(eager_backend: EagerAllowed) -> None: assert isinstance(result.to_native(), pa.Table) else: assert not isinstance(result.to_native(), pa.Table) - # pragma: no cover def test_dataframe_from_dict(eager_backend: EagerAllowed) -> None: From c2ecde08bb78d4bfa1daa8b88f7022aeacd9e9b9 Mon Sep 17 00:00:00 2001 From: dangotbanned <125183946+dangotbanned@users.noreply.github.com> Date: Sat, 26 Jul 2025 15:54:19 +0000 Subject: [PATCH 13/13] docs: Add docs, api ref --- docs/api-reference/dataframe.md | 1 + narwhals/dataframe.py | 37 +++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/docs/api-reference/dataframe.md b/docs/api-reference/dataframe.md index fe44cd3e71..4105a376d1 100644 --- a/docs/api-reference/dataframe.md +++ b/docs/api-reference/dataframe.md @@ -14,6 +14,7 @@ - estimated_size - explode - filter + - from_arrow - from_dict - gather_every - get_column diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index 94d134c62a..be923f61f3 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -463,6 +463,43 @@ def __init__(self, df: Any, *, level: Literal["full", "lazy", "interchange"]) -> def from_arrow( cls, native_frame: IntoArrowTable, *, backend: ModuleType | Implementation | str ) -> DataFrame[Any]: + """Construct a DataFrame from an object which supports the PyCapsule Interface. + + Arguments: + native_frame: Object which implements `__arrow_c_stream__`. + backend: specifies which eager backend to instantiate to. + + `backend` can be specified in various ways + + - As `Implementation.<BACKEND>` with `BACKEND` being `PANDAS`, `PYARROW`, + `POLARS`, `MODIN` or `CUDF`. + - As a string: `"pandas"`, `"pyarrow"`, `"polars"`, `"modin"` or `"cudf"`. 
+ - Directly as a module `pandas`, `pyarrow`, `polars`, `modin` or `cudf`. + + Returns: + A new DataFrame. + + Examples: + >>> import pandas as pd + >>> import polars as pl + >>> import narwhals as nw + >>> + >>> df_native = pd.DataFrame({"a": [1, 2], "b": [4.2, 5.1]}) + >>> nw.DataFrame.from_arrow(df_native, backend="polars") + ┌──────────────────┐ + |Narwhals DataFrame| + |------------------| + | shape: (2, 2) | + | ┌─────┬─────┐ | + | │ a ┆ b │ | + | │ --- ┆ --- │ | + | │ i64 ┆ f64 │ | + | ╞═════╪═════╡ | + | │ 1 ┆ 4.2 │ | + | │ 2 ┆ 5.1 │ | + | └─────┴─────┘ | + └──────────────────┘ + """ if not (supports_arrow_c_stream(native_frame) or is_pyarrow_table(native_frame)): msg = f"Given object of type {type(native_frame)} does not support PyCapsule interface" raise TypeError(msg)