-
Notifications
You must be signed in to change notification settings - Fork 179
feat: let nw.Enum accept categories, map pandas ordered categorical to Enum (only in main namespace, not stable.v1)
#2192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. Weβll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3581985
69e9d88
9e63e68
3465fbf
a570751
6f22771
04b2ee6
1393ac7
6e85c1a
70d5c67
e4f2d87
86eb3a2
3e1db9e
5a996b1
1ce56ca
a8f6e42
342ea83
5dd7c72
1bf98d0
da1a455
a79554a
0712557
4fed6d1
f06df25
a0a8c39
21d03dd
36fd178
105e394
0d5b5cc
02b60e0
07d4a25
5e53281
0018a5b
99ad2b5
000013c
6aed0db
3357c55
3b40237
15cac8e
84ea789
833bc3c
7aca1b8
eb0b328
d2504a4
def83a3
f89e331
3902a29
1e3326f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -211,7 +211,9 @@ def rename( | |
|
|
||
|
|
||
| @functools.lru_cache(maxsize=16) | ||
| def non_object_native_to_narwhals_dtype(dtype: str, version: Version) -> DType: | ||
| def non_object_native_to_narwhals_dtype(native_dtype: Any, version: Version) -> DType: | ||
| dtype = str(native_dtype) | ||
|
|
||
|
Comment on lines
+214
to
+216
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change seems to have been there since the first commit (3581985), but doesn't seem to be documented? It looks like this part is related Which would mean we do the Are all non-object
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think the cost of a repeated call to
Since the tests pass, I am at least confident that all of the datatypes are hashable, however whether that hash is something meaningful or just the default
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @camriddell agreed on the My concern on the hashability though is related to #2051 (comment) Right now we won't get a warning like that because we have: native_dtype: AnyHowever - good news! @functools.lru_cache(maxsize=16)
def non_object_native_to_narwhals_dtype(
native_dtype: pd.api.extensions.ExtensionDtype, version: Version
) -> DType:And followed though to the docs to find:
Looks like we're all good π
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Great find on that one, thanks so much for diving in there! |
||
| dtypes = import_dtypes_module(version) | ||
| if dtype in {"int64", "Int64", "Int64[pyarrow]", "int64[pyarrow]"}: | ||
| return dtypes.Int64() | ||
|
|
@@ -249,7 +251,13 @@ def non_object_native_to_narwhals_dtype(dtype: str, version: Version) -> DType: | |
| return dtypes.String() | ||
| if dtype in {"bool", "boolean", "boolean[pyarrow]", "bool[pyarrow]"}: | ||
| return dtypes.Boolean() | ||
| if dtype == "category" or dtype.startswith("dictionary<"): | ||
| if dtype.startswith("dictionary<"): | ||
| return dtypes.Categorical() | ||
| if dtype == "category": | ||
| if version is Version.V1: | ||
| return dtypes.Categorical() | ||
| if native_dtype.ordered: | ||
| return dtypes.Enum(native_dtype.categories) | ||
| return dtypes.Categorical() | ||
| if (match_ := PATTERN_PD_DATETIME.match(dtype)) or ( | ||
| match_ := PATTERN_PA_DATETIME.match(dtype) | ||
|
|
@@ -310,7 +318,7 @@ def native_to_narwhals_dtype( | |
| return arrow_native_to_narwhals_dtype(native_dtype.to_arrow(), version) | ||
| return arrow_native_to_narwhals_dtype(native_dtype.pyarrow_dtype, version) | ||
| if str_dtype != "object": | ||
| return non_object_native_to_narwhals_dtype(str_dtype, version) | ||
| return non_object_native_to_narwhals_dtype(native_dtype, version) | ||
| elif implementation is Implementation.DASK: | ||
| # Per conversations with their maintainers, they don't support arbitrary | ||
| # objects, so we can just return String. | ||
|
|
@@ -471,8 +479,15 @@ def narwhals_to_native_dtype( # noqa: PLR0915 | |
| msg = "PyArrow>=11.0.0 is required for `Date` dtype." | ||
| return "date32[pyarrow]" | ||
| if isinstance_or_issubclass(dtype, dtypes.Enum): | ||
| msg = "Converting to Enum is not (yet) supported" | ||
| raise NotImplementedError(msg) | ||
| if version is Version.V1: | ||
| msg = "Converting to Enum is not supported in narwhals.stable.v1" | ||
| raise NotImplementedError(msg) | ||
| if isinstance(dtype, dtypes.Enum): | ||
| ns = implementation.to_native_namespace() | ||
| return ns.CategoricalDtype(dtype.categories, ordered=True) | ||
| msg = "Can not cast / initialize Enum without categories present" | ||
| raise ValueError(msg) | ||
|
|
||
| if isinstance_or_issubclass( | ||
| dtype, (dtypes.Struct, dtypes.Array, dtypes.List, dtypes.Time, dtypes.Binary) | ||
| ): | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,7 +11,7 @@ | |
| from narwhals.dtypes import Decimal | ||
| from narwhals.dtypes import DType | ||
| from narwhals.dtypes import Duration as NwDuration | ||
| from narwhals.dtypes import Enum | ||
| from narwhals.dtypes import Enum as NwEnum | ||
| from narwhals.dtypes import Field | ||
| from narwhals.dtypes import Float32 | ||
| from narwhals.dtypes import Float64 | ||
|
|
@@ -72,6 +72,35 @@ def __hash__(self: Self) -> int: | |
| return hash(self.__class__) | ||
|
|
||
|
|
||
| class Enum(NwEnum): | ||
| """A fixed categorical encoding of a unique set of strings. | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @dangotbanned the typing here gets a bit wonky as we currently need If feels like I may have something backwards here though? Would love to hear you thoughts.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the ping @camriddell, will take a look in the morning
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Definitely a tricky one, but I have a few ideas I'm gonna try out today. I did a search of existing usage and looked at what we allow in tests. I think our main concern should be preserving the behavior of I haven't tried out customizing-instance-and-subclass-checks yet - but have thought about it for another
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm wanting to avoid subclassing, since this is a pretty clear Liskov substitution principle violation (not your fault, just how
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think it's ok to allow
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Lol @MarcoGorelli the timing on this π (105e394) |
||
|
|
||
| Polars has an Enum data type, while pandas and PyArrow do not. | ||
|
|
||
| Examples: | ||
| >>> import polars as pl | ||
| >>> import narwhals.stable.v1 as nw | ||
| >>> data = ["beluga", "narwhal", "orca"] | ||
| >>> s_native = pl.Series(data, dtype=pl.Enum(data)) | ||
| >>> nw.from_native(s_native, series_only=True).dtype | ||
| Enum | ||
| """ | ||
|
|
||
| def __init__(self: Self) -> None: | ||
| super(NwEnum, self).__init__() | ||
|
|
||
| def __eq__(self, other: DType | type[DType]) -> bool: # type: ignore[override] | ||
| if type(other) is type: | ||
| return other in {type(self), NwEnum} | ||
| return isinstance(other, type(self)) | ||
|
|
||
| def __hash__(self: Self) -> int: # pragma: no cover | ||
| return super(NwEnum, self).__hash__() | ||
|
|
||
| def __repr__(self: Self) -> str: # pragma: no cover | ||
| return super(NwEnum, self).__repr__() | ||
|
|
||
|
|
||
| __all__ = [ | ||
| "Array", | ||
| "Binary", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@MarcoGorelli does this seem like it could be widened upstream (https://github.com/pandas-dev/pandas-stubs)?
I've traced back the runtime check to
is_list_like:The current annotation only permits
list[Any]as a non-pandasSequenceThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@camriddell ignore this, I only meant to add as a comment - not the review π«£
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@MarcoGorelli gentle nudge on this, in case it was missed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hey - yeah, probably, the pandas stubs definitely don't get all the attention they probably deserve