-
Notifications
You must be signed in to change notification settings - Fork 151
maint: pandas 2.0 forward compatible changes #540
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
49031a6
de00265
28f0d93
2825838
fd97396
9a18d8e
a15f6dc
193deb5
dbbfc0b
54d4f7e
8f8fcb1
81db314
e39d7a2
5b7c41c
7e95822
ea36060
2d65460
ce3edf1
5a5c4a5
f43dcaf
e4be900
07c720b
102d944
ae81daa
6a9363e
0ebbb50
08d8805
c13d6df
bb0bb2f
f751693
c4e1027
97928e1
2cbbd58
5a698e2
bcd58bf
5565ef1
9c5a660
570547b
621ebc9
dcd666e
3f3d938
fe76e8a
94e2f1b
9c3a56c
6f5b914
a9fd475
9b3d88c
580980d
5cc13b7
36331e1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| """ | ||
| Copyright 2023 Man Group Operations Limited | ||
|
|
||
| Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. | ||
|
|
||
| As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. | ||
| """ | ||
| import pandas as pd | ||
| from packaging import version | ||
|
|
||
| PANDAS_VERSION = version.parse(pd.__version__) | ||
| CHECK_FREQ_VERSION = version.Version("1.1") | ||
| IS_PANDAS_ZERO = PANDAS_VERSION < version.Version("1.0") | ||
| IS_PANDAS_TWO = PANDAS_VERSION >= version.Version("2.0") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ | |
| from collections import Counter | ||
| from arcticdb.exceptions import ArcticNativeException, ArcticNativeNotYetImplemented | ||
| from arcticdb.supported_types import DateRangeInput, time_types as supported_time_types | ||
| from arcticdb.util._versions import IS_PANDAS_TWO | ||
| from arcticdb.version_store.read_result import ReadResult | ||
| from arcticdb_ext.version_store import SortedValue as _SortedValue | ||
| from pandas.core.internals import make_block | ||
|
|
@@ -179,6 +180,14 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co | |
| return arr.codes | ||
|
|
||
| obj_tokens = (object, "object", "O") | ||
| if np.issubdtype(arr.dtype, np.datetime64): | ||
| # ArcticDB only operates at nanosecond resolution (i.e. the `datetime64[ns]` type) because Pandas < 2 did too. | ||
| # In Pandas >= 2.0, other resolutions are supported (namely `ms`, `s`, and `us`). | ||
| # See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution # noqa: E501 | ||
| # We want to maintain consistent behaviour, so we convert any other resolution | ||
| # to `datetime64[ns]`. | ||
| arr = arr.astype(DTN64_DTYPE, copy=False) | ||
jjerphan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| if arr.dtype.hasobject is False and not ( | ||
| dynamic_strings and arr.dtype == "float" and coerce_column_type in obj_tokens | ||
| ): | ||
|
|
@@ -188,12 +197,19 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co | |
|
|
||
| if len(arr) == 0: | ||
| if coerce_column_type is None: | ||
| raise ArcticNativeNotYetImplemented( | ||
| "coercing column type is required when empty column of object type, Column type={} for column={}" | ||
| .format(arr.dtype, arr_name) | ||
| ) | ||
| else: | ||
| return arr.astype(coerce_column_type) | ||
| if IS_PANDAS_TWO: | ||
| # Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0, empty series' dtype became object. | ||
| # See: https://github.com/pandas-dev/pandas/issues/17261 | ||
| # We want to maintain consistent behaviour, so we treat empty series as containing floats. | ||
| # val_type = ValueType::FLOAT; | ||
| coerce_column_type = float | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have to consider #224 first. The concern is if the user did not intend for the column to be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a way to change the type of an (empty) column? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given the Otherwise, we would end up having a new behaviour that exists only in a few versions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would wait for #646 to be merged before this PR first. What do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you just get rid of coerce_column_type entirely, and all associated tests, and we'll just record a NoneType now that 646 is merged? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in #804. |
||
| return arr.astype(coerce_column_type) | ||
| else: | ||
| raise ArcticNativeNotYetImplemented( | ||
| "coercing column type is required when empty column of object type, Column type={} for column={}" | ||
| .format(arr.dtype, arr_name) | ||
| ) | ||
| return arr.astype(coerce_column_type) | ||
|
|
||
| # Coerce column allows us to force a column to the given type, which means we can skip expensive iterations in | ||
| # Python with the caveat that if the user gave an invalid type it's going to blow up in the core. | ||
|
|
@@ -277,6 +293,7 @@ def _from_tz_timestamp(ts, tz): | |
| def _normalize_single_index(index, index_names, index_norm, dynamic_strings=None, string_max_len=None): | ||
| # index: pd.Index or np.ndarray -> np.ndarray | ||
| index_tz = None | ||
|
|
||
| if isinstance(index, RangeIndex): | ||
| # skip index since we can reconstruct it, so no need to actually store it | ||
| if index.name: | ||
|
|
@@ -507,6 +524,20 @@ def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len): | |
| df.reset_index(fields, inplace=True) | ||
| index = df.index | ||
| else: | ||
| n_rows = len(index) | ||
| n_categorical_columns = len(df.select_dtypes(include="category").columns) | ||
| if IS_PANDAS_TWO and isinstance(index, RangeIndex) and n_rows == 0 and n_categorical_columns == 0: | ||
| # In Pandas 1.0, an Index is used by default for any empty dataframe or series, except if | ||
| # there are categorical columns in which case a RangeIndex is used. | ||
| # | ||
| # In Pandas 2.0, RangeIndex is used by default for _any_ empty dataframe or series. | ||
| # See: https://github.com/pandas-dev/pandas/issues/49572 | ||
| # Yet internally, ArcticDB uses a DatetimeIndex for empty dataframes and series without categorical | ||
| # columns. | ||
| # | ||
| # The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with Pandas 1.0. | ||
| index = DatetimeIndex([]) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I came up with two tests for safe normalisation:
So the best solution is to introduce something in the C++ layer to allow specifying the index type of an empty DF:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should just save whatever we're given, and then in the case of append and update we will ignore the index mismatch if there weren't any rows in the initial dataframe. |
||
|
|
||
| index_norm = pd_norm.index | ||
| index_norm.is_not_range_index = not isinstance(index, RangeIndex) | ||
|
|
||
|
|
@@ -563,6 +594,13 @@ def denormalize(self, item, norm_meta): | |
| else: | ||
| s.name = None | ||
|
|
||
| if s.empty and IS_PANDAS_TWO: | ||
| # Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0, empty series' dtype became object. | ||
| # See: https://github.com/pandas-dev/pandas/issues/17261 | ||
| # We want to maintain consistent behaviour, so we return empty series as containing objects | ||
| # when the Pandas version is >= 2.0 | ||
| s = s.astype("object") | ||
|
|
||
| return s | ||
|
|
||
|
|
||
|
|
@@ -670,7 +708,23 @@ def denormalize(self, item, norm_meta): | |
| columns, denormed_columns, data = _denormalize_columns(item, norm_meta, idx_type, n_indexes) | ||
|
|
||
| if not self._skip_df_consolidation: | ||
| columns_dtype = {} if data is None else {name: np_array.dtype for name, np_array in data.items()} | ||
| df = DataFrame(data, index=index, columns=columns) | ||
|
|
||
| # Setting the columns' dtype manually, since pandas might just convert the dtype of some | ||
| # (empty) columns to another one and since the `dtype` keyword for `pd.DataFrame` constructor | ||
| # does not accept a mapping such as `columns_dtype`. | ||
| # For instance the following code has been tried but returns a pandas.DataFrame full of NaNs: | ||
| # | ||
| # columns_mapping = {} if data is None else { | ||
| # name: pd.Series(np_array, index=index, dtype=np_array.dtype) | ||
| # for name, np_array in data.items() | ||
| # } | ||
| # df = DataFrame(index=index, columns=columns_mapping, copy=False) | ||
| # | ||
| for column_name, dtype in columns_dtype.items(): | ||
| df[column_name] = df[column_name].astype(dtype, copy=False) | ||
jjerphan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| else: | ||
| if index is not None: | ||
| df = self.df_without_consolidation(columns, index, item, n_indexes, data) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.