diff --git a/docs/backcompat.md b/docs/backcompat.md index e1ffa329eb..185d99a163 100644 --- a/docs/backcompat.md +++ b/docs/backcompat.md @@ -103,6 +103,7 @@ Here are exceptions to our backwards compatibility policy: need to rethink Narwhals. However, we expect such radical changes to be exceedingly unlikely. - We may consider making some type hints more precise. - Anything labelled "unstable". +- We may sometimes need to bump the minimum versions of supported backends. In general, decisions are driven by use-cases, and we conduct a search of public GitHub repositories before making any change. @@ -113,6 +114,11 @@ before making any change. The following are differences between the main Narwhals namespace and `narwhals.stable.v1`: +- Since Narwhals 1.35: + + - pandas' ordered categoricals get mapped to `nw.Enum` instead of `nw.Categorical`. + - `nw.Enum` must be provided `categories` at instantiation. + - Since Narwhals 1.29.0, `LazyFrame.gather_every` has been deprecated from the main namespace. 
- Since Narwhals 1.24.1, an empty or all-null object-dtype pandas Series is inferred to diff --git a/narwhals/_dask/utils.py b/narwhals/_dask/utils.py index 3d4ddbe6eb..ba152bd9b5 100644 --- a/narwhals/_dask/utils.py +++ b/narwhals/_dask/utils.py @@ -9,6 +9,7 @@ from narwhals.dependencies import get_pandas from narwhals.dependencies import get_pyarrow from narwhals.utils import Implementation +from narwhals.utils import Version from narwhals.utils import import_dtypes_module from narwhals.utils import isinstance_or_issubclass from narwhals.utils import parse_version @@ -24,7 +25,6 @@ from narwhals._dask.dataframe import DaskLazyFrame from narwhals._dask.expr import DaskExpr from narwhals.dtypes import DType - from narwhals.utils import Version def maybe_evaluate_expr(df: DaskLazyFrame, obj: DaskExpr | object) -> dx.Series | object: @@ -125,6 +125,20 @@ def narwhals_to_native_dtype(dtype: DType | type[DType], version: Version) -> An return "object" # pragma: no cover if isinstance_or_issubclass(dtype, dtypes.Boolean): return "bool" + if isinstance_or_issubclass(dtype, dtypes.Enum): + if version is Version.V1: + msg = "Converting to Enum is not supported in narwhals.stable.v1" + raise NotImplementedError(msg) + if isinstance(dtype, dtypes.Enum): + import pandas as pd + + # NOTE: `pandas-stubs.core.dtypes.dtypes.CategoricalDtype.categories` is too narrow + # Should be one of the `ListLike*` types + # https://github.com/pandas-dev/pandas-stubs/blob/8434bde95460b996323cc8c0fea7b0a8bb00ea26/pandas-stubs/_typing.pyi#L497-L505 + return pd.CategoricalDtype(dtype.categories, ordered=True) # pyright: ignore[reportArgumentType] + msg = "Can not cast / initialize Enum without categories present" + raise ValueError(msg) + if isinstance_or_issubclass(dtype, dtypes.Categorical): return "category" if isinstance_or_issubclass(dtype, dtypes.Datetime): diff --git a/narwhals/_pandas_like/utils.py b/narwhals/_pandas_like/utils.py index e73369d1cf..7f1a0ea020 100644 --- 
a/narwhals/_pandas_like/utils.py +++ b/narwhals/_pandas_like/utils.py @@ -211,7 +211,9 @@ def rename( @functools.lru_cache(maxsize=16) -def non_object_native_to_narwhals_dtype(dtype: str, version: Version) -> DType: +def non_object_native_to_narwhals_dtype(native_dtype: Any, version: Version) -> DType: + dtype = str(native_dtype) + dtypes = import_dtypes_module(version) if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}: return dtypes.Int64() @@ -249,7 +251,13 @@ def non_object_native_to_narwhals_dtype(dtype: str, version: Version) -> DType: return dtypes.String() if dtype in {"bool", "boolean", "boolean[pyarrow]", "bool[pyarrow]"}: return dtypes.Boolean() - if dtype == "category" or dtype.startswith("dictionary<"): + if dtype.startswith("dictionary<"): + return dtypes.Categorical() + if dtype == "category": + if version is Version.V1: + return dtypes.Categorical() + if native_dtype.ordered: + return dtypes.Enum(native_dtype.categories) return dtypes.Categorical() if (match_ := PATTERN_PD_DATETIME.match(dtype)) or ( match_ := PATTERN_PA_DATETIME.match(dtype) @@ -310,7 +318,7 @@ def native_to_narwhals_dtype( return arrow_native_to_narwhals_dtype(native_dtype.to_arrow(), version) return arrow_native_to_narwhals_dtype(native_dtype.pyarrow_dtype, version) if str_dtype != "object": - return non_object_native_to_narwhals_dtype(str_dtype, version) + return non_object_native_to_narwhals_dtype(native_dtype, version) elif implementation is Implementation.DASK: # Per conversations with their maintainers, they don't support arbitrary # objects, so we can just return String. @@ -471,8 +479,15 @@ def narwhals_to_native_dtype( # noqa: PLR0915 msg = "PyArrow>=11.0.0 is required for `Date` dtype." 
return "date32[pyarrow]" if isinstance_or_issubclass(dtype, dtypes.Enum): - msg = "Converting to Enum is not (yet) supported" - raise NotImplementedError(msg) + if version is Version.V1: + msg = "Converting to Enum is not supported in narwhals.stable.v1" + raise NotImplementedError(msg) + if isinstance(dtype, dtypes.Enum): + ns = implementation.to_native_namespace() + return ns.CategoricalDtype(dtype.categories, ordered=True) + msg = "Can not cast / initialize Enum without categories present" + raise ValueError(msg) + if isinstance_or_issubclass( dtype, (dtypes.Struct, dtypes.Array, dtypes.List, dtypes.Time, dtypes.Binary) ): diff --git a/narwhals/_polars/utils.py b/narwhals/_polars/utils.py index 9452877eac..dd0df4d8a7 100644 --- a/narwhals/_polars/utils.py +++ b/narwhals/_polars/utils.py @@ -17,6 +17,7 @@ from narwhals.exceptions import InvalidOperationError from narwhals.exceptions import NarwhalsError from narwhals.exceptions import ShapeError +from narwhals.utils import Version from narwhals.utils import import_dtypes_module from narwhals.utils import isinstance_or_issubclass @@ -26,7 +27,6 @@ from narwhals._polars.expr import PolarsExpr from narwhals._polars.series import PolarsSeries from narwhals.dtypes import DType - from narwhals.utils import Version T = TypeVar("T") @@ -110,8 +110,10 @@ def native_to_narwhals_dtype( return dtypes.Object() if dtype == pl.Categorical: return dtypes.Categorical() - if dtype == pl.Enum: - return dtypes.Enum() + if isinstance_or_issubclass(dtype, pl.Enum): + if version is Version.V1: + return dtypes.Enum() # type: ignore[call-arg] + return dtypes.Enum(dtype.categories) if dtype == pl.Date: return dtypes.Date() if isinstance_or_issubclass(dtype, pl.Datetime): @@ -185,9 +187,14 @@ def narwhals_to_native_dtype( return pl.Object() if dtype == dtypes.Categorical: return pl.Categorical() - if dtype == dtypes.Enum: - msg = "Converting to Enum is not (yet) supported" - raise NotImplementedError(msg) + if 
isinstance_or_issubclass(dtype, dtypes.Enum): + if version is Version.V1: + msg = "Converting to Enum is not supported in narwhals.stable.v1" + raise NotImplementedError(msg) + if isinstance(dtype, dtypes.Enum): + return pl.Enum(dtype.categories) + msg = "Can not cast / initialize Enum without categories present" + raise ValueError(msg) if dtype == dtypes.Date: return pl.Date() if dtype == dtypes.Time: diff --git a/narwhals/dtypes.py b/narwhals/dtypes.py index b31a0131a0..79da0b2673 100644 --- a/narwhals/dtypes.py +++ b/narwhals/dtypes.py @@ -1,5 +1,6 @@ from __future__ import annotations +import enum from collections import OrderedDict from datetime import timezone from itertools import starmap @@ -9,6 +10,7 @@ from narwhals.utils import isinstance_or_issubclass if TYPE_CHECKING: + from typing import Iterable from typing import Iterator from typing import Sequence @@ -464,14 +466,31 @@ class Enum(DType): Polars has an Enum data type, while pandas and PyArrow do not. Examples: - >>> import polars as pl >>> import narwhals as nw - >>> data = ["beluga", "narwhal", "orca"] - >>> s_native = pl.Series(data, dtype=pl.Enum(data)) - >>> nw.from_native(s_native, series_only=True).dtype - Enum + >>> nw.Enum(["beluga", "narwhal", "orca"]) + Enum(categories=['beluga', 'narwhal', 'orca']) """ + categories: Sequence[str] + + def __init__(self, categories: Iterable[str] | type[enum.Enum]) -> None: + if isinstance(categories, type) and issubclass(categories, enum.Enum): + self.categories = tuple(member.value for member in categories) + else: + self.categories = tuple(categories) + + def __eq__(self: Self, other: object) -> bool: + # allow comparing object instances to class + if type(other) is type: + return other is Enum + return isinstance(other, type(self)) and self.categories == other.categories + + def __hash__(self: Self) -> int: # pragma: no cover + return hash((self.__class__, tuple(self.categories))) + + def __repr__(self: Self) -> str: # pragma: no cover + return 
f"{type(self).__name__}(categories={list(self.categories)!r})" + class Field: """Definition of a single field within a `Struct` DataType. diff --git a/narwhals/stable/v1/_dtypes.py b/narwhals/stable/v1/_dtypes.py index c3905bc1f9..5be99ec36d 100644 --- a/narwhals/stable/v1/_dtypes.py +++ b/narwhals/stable/v1/_dtypes.py @@ -11,7 +11,7 @@ from narwhals.dtypes import Decimal from narwhals.dtypes import DType from narwhals.dtypes import Duration as NwDuration -from narwhals.dtypes import Enum +from narwhals.dtypes import Enum as NwEnum from narwhals.dtypes import Field from narwhals.dtypes import Float32 from narwhals.dtypes import Float64 @@ -72,6 +72,35 @@ def __hash__(self: Self) -> int: return hash(self.__class__) +class Enum(NwEnum): + """A fixed categorical encoding of a unique set of strings. + + Polars has an Enum data type, while pandas and PyArrow do not. + + Examples: + >>> import polars as pl + >>> import narwhals.stable.v1 as nw + >>> data = ["beluga", "narwhal", "orca"] + >>> s_native = pl.Series(data, dtype=pl.Enum(data)) + >>> nw.from_native(s_native, series_only=True).dtype + Enum + """ + + def __init__(self: Self) -> None: + super(NwEnum, self).__init__() + + def __eq__(self, other: DType | type[DType]) -> bool: # type: ignore[override] + if type(other) is type: + return other in {type(self), NwEnum} + return isinstance(other, type(self)) + + def __hash__(self: Self) -> int: # pragma: no cover + return super(NwEnum, self).__hash__() + + def __repr__(self: Self) -> str: # pragma: no cover + return super(NwEnum, self).__repr__() + + __all__ = [ "Array", "Binary", diff --git a/tests/dtypes_test.py b/tests/dtypes_test.py index f7a447c6a7..ca5f007199 100644 --- a/tests/dtypes_test.py +++ b/tests/dtypes_test.py @@ -1,9 +1,12 @@ from __future__ import annotations +import enum from datetime import datetime from datetime import timedelta from datetime import timezone from typing import TYPE_CHECKING +from typing import Any +from typing import Iterable from 
typing import Literal import numpy as np @@ -400,3 +403,35 @@ def test_cast_decimal_to_native() -> None: .with_columns(a=nw.col("a").cast(nw.Decimal())) .to_native() ) + + +@pytest.mark.parametrize( + "categories", + [ + ["a", "b"], + [np.str_("a"), np.str_("b")], + enum.Enum("Test", "a b"), + [1, 2, 3], + ], +) +def test_enum_valid(categories: Iterable[Any] | type[enum.Enum]) -> None: + dtype = nw.Enum(categories) + assert dtype == nw.Enum + assert len(dtype.categories) == len([*categories]) + + +def test_enum_from_series() -> None: + pytest.importorskip("polars") + import polars as pl + + elements = "a", "d", "e", "b", "c" + categories = pl.Series(elements) + categories_nw = nw.from_native(categories, series_only=True) + assert nw.Enum(categories_nw).categories == elements + assert nw.Enum(categories).categories == elements + + +def test_enum_categories_immutable() -> None: + dtype = nw.Enum(["a", "b"]) + with pytest.raises(TypeError, match="does not support item assignment"): + dtype.categories[0] = "c" # type: ignore[index] diff --git a/tests/frame/schema_test.py b/tests/frame/schema_test.py index 964146da7f..a2876eda84 100644 --- a/tests/frame/schema_test.py +++ b/tests/frame/schema_test.py @@ -176,6 +176,19 @@ def test_dtypes() -> None: assert df_from_pd.schema == df_from_pd.collect_schema() == expected assert {name: df_from_pd[name].dtype for name in df_from_pd.columns} == expected + df_from_pd = nw.from_native(df_pl.to_pandas(), eager_only=True) + + pure_pd_expected = { + **expected, + "n": nw.Datetime, + "s": nw.Object, + "u": nw.Object, + } + assert df_from_pd.schema == df_from_pd.collect_schema() == pure_pd_expected + assert { + name: df_from_pd[name].dtype for name in df_from_pd.columns + } == pure_pd_expected + df_from_pa = nw.from_native(df_pl.to_arrow(), eager_only=True) assert df_from_pa.schema == df_from_pa.collect_schema() == expected diff --git a/tests/series_only/cast_test.py b/tests/series_only/cast_test.py index 5d6d2ebd1a..8a4659455e 100644 
--- a/tests/series_only/cast_test.py +++ b/tests/series_only/cast_test.py @@ -10,6 +10,7 @@ from tests.utils import PANDAS_VERSION if TYPE_CHECKING: + from tests.utils import Constructor from tests.utils import ConstructorEager @@ -113,29 +114,23 @@ def test_unknown_to_int() -> None: assert nw.from_native(df).select(nw.col("a").cast(nw.Int64)).schema == {"a": nw.Int64} -def test_cast_to_enum_polars() -> None: - pytest.importorskip("polars") - import polars as pl - - # we don't yet support metadata in dtypes, so for now disallow this - # seems like a very niche use case anyway, and allowing it later wouldn't be - # backwards-incompatible - df_pl = pl.DataFrame({"a": ["a", "b"]}, schema={"a": pl.Categorical}) - with pytest.raises( - NotImplementedError, match=r"Converting to Enum is not \(yet\) supported" +def test_cast_to_enum_vmain( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + # Backends that do not (yet) support Enum dtype + if any( + backend in str(constructor) + for backend in ["pyarrow_table", "duckdb", "sqlframe", "pyspark", "modin"] ): - nw.from_native(df_pl).select(nw.col("a").cast(nw.Enum)) + request.applymarker(pytest.mark.xfail) + df_nw = nw.from_native(constructor({"a": ["a", "b"]})) + col_a = nw.col("a") -def test_cast_to_enum_pandas() -> None: - pytest.importorskip("pandas") - import pandas as pd - - # we don't yet support metadata in dtypes, so for now disallow this - # seems like a very niche use case anyway, and allowing it later wouldn't be - # backwards-incompatible - df_pd = pd.DataFrame({"a": ["a", "b"]}, dtype="category") with pytest.raises( - NotImplementedError, match=r"Converting to Enum is not \(yet\) supported" + ValueError, match="Can not cast / initialize Enum without categories present" ): - nw.from_native(df_pd).select(nw.col("a").cast(nw.Enum)) + df_nw.select(col_a.cast(nw.Enum)) + + df_nw = df_nw.select(col_a.cast(nw.Enum(["a", "b"]))) + assert df_nw.collect_schema() == {"a": nw.Enum(["a", "b"])} diff 
--git a/tests/v1_test.py b/tests/v1_test.py index 8c54624add..d8dc60847c 100644 --- a/tests/v1_test.py +++ b/tests/v1_test.py @@ -7,10 +7,12 @@ import pandas as pd import pytest +import narwhals as nw import narwhals.stable.v1 as nw_v1 from tests.utils import PANDAS_VERSION from tests.utils import POLARS_VERSION from tests.utils import PYARROW_VERSION +from tests.utils import Constructor from tests.utils import ConstructorEager from tests.utils import assert_equal_data @@ -176,3 +178,56 @@ def test_int_select_pandas() -> None: nw_v1.exceptions.InvalidIntoExprError, match="\n\nHint:\n- if you were trying" ): nw_v1.to_native(df.lazy().select(0)) # type: ignore[arg-type] + + +def test_enum_v1_is_enum_unstable() -> None: + enum_v1 = nw_v1.Enum() + enum_unstable = nw.Enum(("a", "b", "c")) + assert isinstance(enum_v1, nw.Enum) + assert issubclass(nw_v1.Enum, nw.Enum) + assert enum_v1 == nw.Enum + assert enum_v1 != enum_unstable + assert enum_unstable != nw_v1.Enum + assert enum_unstable == nw.Enum + + with pytest.raises(TypeError, match=r"takes 1 positional argument"): + nw_v1.Enum(("a", "b")) # type: ignore[call-arg] + + +def test_cast_to_enum_v1( + request: pytest.FixtureRequest, constructor: Constructor +) -> None: + # Backends that do not (yet) support Enum dtype + if ( + any( + backend in str(constructor) + for backend in ["pyarrow_table", "duckdb", "sqlframe", "pyspark"] + ) + or str(constructor) == "modin" + ): + request.applymarker(pytest.mark.xfail) + + df_native = constructor({"a": ["a", "b"]}) + + with pytest.raises( + NotImplementedError, + match="Converting to Enum is not supported in narwhals.stable.v1", + ): + nw_v1.from_native(df_native).select(nw_v1.col("a").cast(nw_v1.Enum)) + + +def test_v1_ordered_categorical_pandas() -> None: + s = nw_v1.from_native( + pd.Series([0, 1], dtype=pd.CategoricalDtype(ordered=True)), series_only=True + ) + assert s.dtype == nw_v1.Categorical + + +def test_v1_enum_polars() -> None: + pytest.importorskip("polars") + import 
polars as pl + + s = nw_v1.from_native( + pl.Series(["a", "b"], dtype=pl.Enum(["a", "b"])), series_only=True + ) + assert s.dtype == nw_v1.Enum