-
Notifications
You must be signed in to change notification settings - Fork 151
maint: pandas 2.0 forward compatible changes #540
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
49031a6
de00265
28f0d93
2825838
fd97396
9a18d8e
a15f6dc
193deb5
dbbfc0b
54d4f7e
8f8fcb1
81db314
e39d7a2
5b7c41c
7e95822
ea36060
2d65460
ce3edf1
5a5c4a5
f43dcaf
e4be900
07c720b
102d944
ae81daa
6a9363e
0ebbb50
08d8805
c13d6df
bb0bb2f
f751693
c4e1027
97928e1
2cbbd58
5a698e2
bcd58bf
5565ef1
9c5a660
570547b
621ebc9
dcd666e
3f3d938
fe76e8a
94e2f1b
9c3a56c
6f5b914
a9fd475
9b3d88c
580980d
5cc13b7
36331e1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| """ | ||
| Copyright 2023 Man Group Operations Limited | ||
|
|
||
| Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt. | ||
|
|
||
| As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0. | ||
| """ | ||
| import pandas as pd | ||
| from packaging import version | ||
|
|
||
| PANDAS_VERSION = version.parse(pd.__version__) | ||
| CHECK_FREQ_VERSION = version.Version("1.1") | ||
| IS_PANDAS_ZERO = PANDAS_VERSION < version.Version("1.0") | ||
| IS_PANDAS_TWO = PANDAS_VERSION >= version.Version("2.0") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -25,6 +25,7 @@ | |
| from collections import Counter | ||
| from arcticdb.exceptions import ArcticNativeException, ArcticNativeNotYetImplemented | ||
| from arcticdb.supported_types import DateRangeInput, time_types as supported_time_types | ||
| from arcticdb.util._versions import IS_PANDAS_TWO | ||
| from arcticdb.version_store.read_result import ReadResult | ||
| from arcticdb_ext.version_store import SortedValue as _SortedValue | ||
| from pandas.core.internals import make_block | ||
|
|
@@ -179,6 +180,14 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co | |
| return arr.codes | ||
|
|
||
| obj_tokens = (object, "object", "O") | ||
| if np.issubdtype(arr.dtype, np.datetime64): | ||
| # ArcticDB only operates at nanosecond resolution (i.e. the `datetime64[ns]` type) because Pandas < 2 did too. | ||
| # In Pandas >= 2.0, other resolutions are supported (namely `ms`, `s`, and `us`). | ||
| # See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution # noqa: E501 | ||
| # We want to maintain consistent behaviour, so we convert any other resolution | ||
| # to `datetime64[ns]`. | ||
| arr = arr.astype(DTN64_DTYPE, copy=False) | ||
jjerphan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| if arr.dtype.hasobject is False and not ( | ||
| dynamic_strings and arr.dtype == "float" and coerce_column_type in obj_tokens | ||
| ): | ||
|
|
@@ -188,12 +197,19 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co | |
|
|
||
| if len(arr) == 0: | ||
| if coerce_column_type is None: | ||
| raise ArcticNativeNotYetImplemented( | ||
| "coercing column type is required when empty column of object type, Column type={} for column={}" | ||
| .format(arr.dtype, arr_name) | ||
| ) | ||
| else: | ||
| return arr.astype(coerce_column_type) | ||
| if IS_PANDAS_TWO: | ||
| # Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0, empty series' dtype became object. | ||
| # See: https://github.com/pandas-dev/pandas/issues/17261 | ||
| # We want to maintain consistent behaviour, so we treat empty series as containing floats. | ||
| # val_type = ValueType::FLOAT; | ||
| coerce_column_type = float | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have to consider #224 first. The concern is if the user did not intend for the column to be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a way to change the type of an (empty) column? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given the Otherwise, we would end up having a new behaviour that exists only in a few versions. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would wait for #646 to be merged before this PR first. What do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you just get rid of coerce_column_type entirely, and all associated tests, and we'll just record a NoneType now that 646 is merged? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in #804. |
||
| return arr.astype(coerce_column_type) | ||
| else: | ||
| raise ArcticNativeNotYetImplemented( | ||
| "coercing column type is required when empty column of object type, Column type={} for column={}" | ||
| .format(arr.dtype, arr_name) | ||
| ) | ||
| return arr.astype(coerce_column_type) | ||
|
|
||
| # Coerce column allows us to force a column to the given type, which means we can skip expensive iterations in | ||
| # Python with the caveat that if the user gave an invalid type it's going to blow up in the core. | ||
|
|
@@ -277,6 +293,7 @@ def _from_tz_timestamp(ts, tz): | |
| def _normalize_single_index(index, index_names, index_norm, dynamic_strings=None, string_max_len=None): | ||
| # index: pd.Index or np.ndarray -> np.ndarray | ||
| index_tz = None | ||
|
|
||
| if isinstance(index, RangeIndex): | ||
| # skip index since we can reconstruct it, so no need to actually store it | ||
| if index.name: | ||
|
|
@@ -507,6 +524,20 @@ def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len): | |
| df.reset_index(fields, inplace=True) | ||
| index = df.index | ||
| else: | ||
| n_rows = len(index) | ||
| n_categorical_columns = len(df.select_dtypes(include="category").columns) | ||
| if IS_PANDAS_TWO and isinstance(index, RangeIndex) and n_rows == 0 and n_categorical_columns == 0: | ||
| # In Pandas 1.0, an Index is used by default for any empty dataframe or series, except if | ||
| # there are categorical columns in which case a RangeIndex is used. | ||
| # | ||
| # In Pandas 2.0, RangeIndex is used by default for _any_ empty dataframe or series. | ||
| # See: https://github.com/pandas-dev/pandas/issues/49572 | ||
| # Yet internally, ArcticDB uses a DatetimeIndex for empty dataframes and series without categorical | ||
| # columns. | ||
| # | ||
| # The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with Pandas 1.0. | ||
| index = DatetimeIndex([]) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I came up with two tests for safe normalisation:
So the best solution is to introduce something in the C++ layer to allow specifying the index type of an empty DF:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should just save whatever we're given, and then in the case of append and update we will ignore the index mismatch if there weren't any rows in the initial dataframe. |
||
|
|
||
| index_norm = pd_norm.index | ||
| index_norm.is_not_range_index = not isinstance(index, RangeIndex) | ||
|
|
||
|
|
@@ -563,6 +594,13 @@ def denormalize(self, item, norm_meta): | |
| else: | ||
| s.name = None | ||
|
|
||
| if s.empty and IS_PANDAS_TWO: | ||
| # Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0, empty series' dtype became object. | ||
| # See: https://github.com/pandas-dev/pandas/issues/17261 | ||
| # We want to maintain consistent behaviour, so we return empty series as containing objects | ||
| # when the Pandas version is >= 2.0 | ||
| s = s.astype("object") | ||
|
|
||
| return s | ||
|
|
||
|
|
||
|
|
@@ -670,7 +708,23 @@ def denormalize(self, item, norm_meta): | |
| columns, denormed_columns, data = _denormalize_columns(item, norm_meta, idx_type, n_indexes) | ||
|
|
||
| if not self._skip_df_consolidation: | ||
| columns_dtype = {} if data is None else {name: np_array.dtype for name, np_array in data.items()} | ||
| df = DataFrame(data, index=index, columns=columns) | ||
|
|
||
| # Setting the columns' dtype manually, since pandas might just convert the dtype of some | ||
| # (empty) columns to another one and since the `dtype` keyword for `pd.DataFrame` constructor | ||
| # does not accept a mapping such as `columns_dtype`. | ||
| # For instance the following code has been tried but returns a pandas.DataFrame full of NaNs: | ||
| # | ||
| # columns_mapping = {} if data is None else { | ||
| # name: pd.Series(np_array, index=index, dtype=np_array.dtype) | ||
| # for name, np_array in data.items() | ||
| # } | ||
| # df = DataFrame(index=index, columns=columns_mapping, copy=False) | ||
| # | ||
| for column_name, dtype in columns_dtype.items(): | ||
| df[column_name] = df[column_name].astype(dtype, copy=False) | ||
jjerphan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| else: | ||
| if index is not None: | ||
| df = self.df_without_consolidation(columns, index, item, n_indexes, data) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.