Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
49031a6
build: Remove the upper-bound on pandas
jjerphan Jun 27, 2023
de00265
python-test: Use keyword argument for pd.DataFrame.drop
jjerphan Jun 27, 2023
28f0d93
python-test: Replace pd.DataFrame.append with pd.concat
jjerphan Jun 27, 2023
2825838
maint: Adaptations for pandas 2.0 empty Series
jjerphan Jul 3, 2023
fd97396
test: Adapt tests and normalization for empty Series and DataFrames
jjerphan Jul 3, 2023
9a18d8e
test: Adapt more tests
jjerphan Jul 10, 2023
a15f6dc
test: Handle edge case on py.UTC with Pandas 2.0
jjerphan Jul 10, 2023
193deb5
fix: Make black happy
jjerphan Jul 10, 2023
dbbfc0b
Merge branch 'master' into pandas-2.0-support
jjerphan Jul 10, 2023
54d4f7e
fixup! test: Handle edge case on py.UTC with Pandas 2.0
jjerphan Jul 10, 2023
8f8fcb1
test: Adapt more tests
jjerphan Jul 10, 2023
81db314
test: Adapt behavior for empty pandas.{Series,DataFrame}
jjerphan Jul 11, 2023
e39d7a2
Merge remote-tracking branch 'upstream/master' into pandas-2.0-support
jjerphan Jul 11, 2023
5b7c41c
test: Adapt the CustomTimeseries test fixture
jjerphan Jul 11, 2023
7e95822
test: Adapt tests
jjerphan Jul 12, 2023
ea36060
python: Adapt logic for Pandas 2.0 empty series
jjerphan Jul 12, 2023
2d65460
test: Reintroduce dtype checks for test_categorical_empty
jjerphan Jul 12, 2023
ce3edf1
test: Adaptation due to change of Index.dtype on 32bit platforms
jjerphan Jul 12, 2023
5a5c4a5
Merge remote-tracking branch 'upstream/master' into pandas-2.0-support
jjerphan Jul 12, 2023
f43dcaf
python: Adapt denormalization of empty DataFrames' columns' arrays
jjerphan Jul 12, 2023
e4be900
test: Do not check dtypes for a test on Windows and pandas 2.0
jjerphan Jul 12, 2023
07c720b
Adapt denormalization
jjerphan Jul 13, 2023
102d944
docs: Add comment about change of behavior for `datetime64`
jjerphan Jul 13, 2023
ae81daa
python: Convert all `datetime64` to `datetime64[ns]`
jjerphan Jul 13, 2023
6a9363e
Edge-case for testing categorical variables dtype on 32bit systems
jjerphan Jul 13, 2023
0ebbb50
Update python/tests/integration/arcticdb/version_store/test_categoric…
jjerphan Jul 13, 2023
08d8805
Update python/tests/integration/arcticdb/version_store/test_categoric…
jjerphan Jul 13, 2023
c13d6df
doc: Add BSL header
jjerphan Jul 18, 2023
bb0bb2f
Merge remote-tracking branch 'upstream/master' into pandas-2.0-support
jjerphan Jul 18, 2023
f751693
fix: Remove import for the unused IS_PANDAS_ZERO
jjerphan Jul 18, 2023
c4e1027
Merge remote-tracking branch 'upstream/master' into pandas-2.0-support
jjerphan Jul 26, 2023
97928e1
test: Adapt test for pandas-specific weirdness
jjerphan Jul 26, 2023
2cbbd58
python: Explicitly specify dtypes for DataFrames
jjerphan Jul 26, 2023
5a698e2
test: Remove support for 32bit architectures
jjerphan Jul 26, 2023
bcd58bf
python: Remove redundant class of isinstance check
jjerphan Jul 26, 2023
5565ef1
test: Remove unneeded branch
jjerphan Jul 26, 2023
9c5a660
Merge branch 'master' into pandas-2.0-support
jjerphan Jul 26, 2023
570547b
test: Move tests after the introduction of test_arctic_batch.py
jjerphan Jul 26, 2023
621ebc9
build: Pin pandas < 2.0
jjerphan Jul 27, 2023
dcd666e
Merge branch 'master' into pandas-2.0-support
jjerphan Aug 8, 2023
3f3d938
Use pytz.UTC for timezone on return
jjerphan Aug 10, 2023
fe76e8a
Remove useless imports
jjerphan Aug 10, 2023
94e2f1b
Merge branch 'master' into pandas-2.0-support
jjerphan Aug 21, 2023
9c3a56c
Safely cast and do not copy NumPy arrays when possible
jjerphan Aug 21, 2023
6f5b914
Revert "Safely cast and do not copy NumPy arrays when possible"
jjerphan Aug 21, 2023
a9fd475
Revert "build: Pin pandas < 2.0"
jjerphan Aug 21, 2023
9b3d88c
test: Adapt more tests for Pandas 2.0 breaking changes
jjerphan Aug 21, 2023
580980d
fixup! test: Adapt more tests for Pandas 2.0 breaking changes
jjerphan Aug 21, 2023
5cc13b7
docs: Adapt the comment for consolidation
jjerphan Aug 22, 2023
36331e1
build: Pin pandas < 2.0
jjerphan Jul 27, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions python/arcticdb/util/_versions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
Copyright 2023 Man Group Operations Limited

Use of this software is governed by the Business Source License 1.1 included in the file licenses/BSL.txt.

As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""
import pandas as pd
from packaging import version

# The pandas version installed at runtime, parsed once here so every module can
# share the same comparison flags instead of re-parsing `pd.__version__`.
PANDAS_VERSION = version.parse(pd.__version__)
# pandas >= 1.1 started checking index frequency in equality assertions;
# used by test helpers to decide whether to ignore `freq` (TODO confirm exact cutoff).
CHECK_FREQ_VERSION = version.Version("1.1")
# Feature flags for version-dependent behavior: pre-1.0 pandas and pandas 2.x+.
IS_PANDAS_ZERO = PANDAS_VERSION < version.Version("1.0")
IS_PANDAS_TWO = PANDAS_VERSION >= version.Version("2.0")
7 changes: 1 addition & 6 deletions python/arcticdb/util/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
from six import PY3
from copy import deepcopy
from functools import wraps
from packaging import version

from arcticdb.config import Defaults
from arcticdb.log import configure, logger_by_name
from arcticdb.util._versions import PANDAS_VERSION, CHECK_FREQ_VERSION
from arcticdb.version_store import NativeVersionStore
from arcticdb.version_store._custom_normalizers import CustomNormalizer
from arcticc.pb2.descriptors_pb2 import NormalizationMetadata
Expand All @@ -32,11 +32,6 @@
from arcticdb_ext import set_config_int, get_config_int, unset_config_int


PANDAS_VERSION = version.parse(pd.__version__)
CHECK_FREQ_VERSION = version.Version("1.1")
IS_PANDAS_ZERO = PANDAS_VERSION < version.Version("1.0")


def maybe_not_check_freq(f):
"""Ignore frequency when pandas is newer as starts to check frequency which it did not previously do."""

Expand Down
66 changes: 60 additions & 6 deletions python/arcticdb/version_store/_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from collections import Counter
from arcticdb.exceptions import ArcticNativeException, ArcticNativeNotYetImplemented
from arcticdb.supported_types import DateRangeInput, time_types as supported_time_types
from arcticdb.util._versions import IS_PANDAS_TWO
from arcticdb.version_store.read_result import ReadResult
from arcticdb_ext.version_store import SortedValue as _SortedValue
from pandas.core.internals import make_block
Expand Down Expand Up @@ -179,6 +180,14 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co
return arr.codes

obj_tokens = (object, "object", "O")
if np.issubdtype(arr.dtype, np.datetime64):
# ArcticDB only operates at nanosecond resolution (i.e. `datetime64[ns]`) type because so did Pandas < 2.
# In Pandas >= 2.0, other resolution are supported (namely `ms`, `s`, and `us`).
# See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#construction-with-datetime64-or-timedelta64-dtype-with-unsupported-resolution # noqa: E501
# We want to maintain consistent behaviour, so we convert any other resolution
# to `datetime64[ns]`.
arr = arr.astype(DTN64_DTYPE, copy=False)

if arr.dtype.hasobject is False and not (
dynamic_strings and arr.dtype == "float" and coerce_column_type in obj_tokens
):
Expand All @@ -188,12 +197,19 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co

if len(arr) == 0:
if coerce_column_type is None:
raise ArcticNativeNotYetImplemented(
"coercing column type is required when empty column of object type, Column type={} for column={}"
.format(arr.dtype, arr_name)
)
else:
return arr.astype(coerce_column_type)
if IS_PANDAS_TWO:
# Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0, empty series' dtype became object.
# See: https://github.com/pandas-dev/pandas/issues/17261
# We want to maintain consistent behaviour, so we treat empty series as containing floats.
# val_type = ValueType::FLOAT;
coerce_column_type = float
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have to consider #224 first.

The concern is if the user did not intend for the column to be float but happened to have no data for the initial write, then it wouldn't be possible to append to this symbol subsequently.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to change the type of an (empty) column?

Copy link
Contributor

@qc00 qc00 Jul 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given the NoneType is being reviewed, I would rather we don't introduce this behaviour and temporarily skip the offending tests instead.

Otherwise, we would end up having a new behaviour that exists only in a few versions.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would wait for #646 to be merged before this PR first. What do you think?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you just get rid of coerce_column_type entirely, and all associated tests, and we'll just record a NoneType now that 646 is merged?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in #804.

return arr.astype(coerce_column_type)
else:
raise ArcticNativeNotYetImplemented(
"coercing column type is required when empty column of object type, Column type={} for column={}"
.format(arr.dtype, arr_name)
)
return arr.astype(coerce_column_type)

# Coerce column allows us to force a column to the given type, which means we can skip expensive iterations in
# Python with the caveat that if the user gave an invalid type it's going to blow up in the core.
Expand Down Expand Up @@ -277,6 +293,7 @@ def _from_tz_timestamp(ts, tz):
def _normalize_single_index(index, index_names, index_norm, dynamic_strings=None, string_max_len=None):
# index: pd.Index or np.ndarray -> np.ndarray
index_tz = None

if isinstance(index, RangeIndex):
# skip index since we can reconstruct it, so no need to actually store it
if index.name:
Expand Down Expand Up @@ -507,6 +524,20 @@ def _index_to_records(self, df, pd_norm, dynamic_strings, string_max_len):
df.reset_index(fields, inplace=True)
index = df.index
else:
n_rows = len(index)
n_categorical_columns = len(df.select_dtypes(include="category").columns)
if IS_PANDAS_TWO and isinstance(index, RangeIndex) and n_rows == 0 and n_categorical_columns == 0:
# In Pandas 1.0, an Index is used by default for any empty dataframe or series, except if
# there are categorical columns in which case a RangeIndex is used.
#
# In Pandas 2.0, RangeIndex is used by default for _any_ empty dataframe or series.
# See: https://github.com/pandas-dev/pandas/issues/49572
# Yet internally, ArcticDB uses a DatetimeIndex for empty dataframes and series without categorical
# columns.
#
# The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with Pandas 1.0.
index = DatetimeIndex([])
Copy link
Contributor

@qc00 qc00 Jul 25, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I came up with two tests for safe normalisation:

  • The data frame we read back should be equivalent to the input
  • If the user had a particular (index) type in mind but accidentally generated a zero-row data frame initially, the user should be able to append to such a frame

So the best solution is to introduce something in the C++ to allow the index type of an empty DF:

  • to be accurately saved
  • to change on the first append

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should just save whatever we're given, and then in the case of append and update we will ignore the index mismatch if there weren't any rows in the initial dataframe.


index_norm = pd_norm.index
index_norm.is_not_range_index = not isinstance(index, RangeIndex)

Expand Down Expand Up @@ -563,6 +594,13 @@ def denormalize(self, item, norm_meta):
else:
s.name = None

if s.empty and IS_PANDAS_TWO:
# Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0, empty series' dtype became object.
# See: https://github.com/pandas-dev/pandas/issues/17261
# We want to maintain consistent behaviour, so we return empty series as containing objects
# when the Pandas version is >= 2.0
s = s.astype("object")

return s


Expand Down Expand Up @@ -670,7 +708,23 @@ def denormalize(self, item, norm_meta):
columns, denormed_columns, data = _denormalize_columns(item, norm_meta, idx_type, n_indexes)

if not self._skip_df_consolidation:
columns_dtype = {} if data is None else {name: np_array.dtype for name, np_array in data.items()}
df = DataFrame(data, index=index, columns=columns)

# Setting the columns' dtype manually, since pandas might just convert the dtype of some
# (empty) columns to another one and since the `dtype` keyword for `pd.DataFrame` constructor
# does not accept a mapping such as `columns_dtype`.
# For instance the following code has been tried but returns a pandas.DataFrame full of NaNs:
#
# columns_mapping = {} if data is None else {
# name: pd.Series(np_array, index=index, dtype=np_array.dtype)
# for name, np_array in data.items()
# }
# df = DataFrame(index=index, columns=columns_mapping, copy=False)
#
for column_name, dtype in columns_dtype.items():
df[column_name] = df[column_name].astype(dtype, copy=False)

else:
if index is not None:
df = self.df_without_consolidation(columns, index, item, n_indexes, data)
Expand Down
7 changes: 7 additions & 0 deletions python/arcticdb/version_store/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
"""

import datetime
import pytz
from enum import Enum, auto
from typing import Optional, Any, Tuple, Dict, AnyStr, Union, List, Iterable, NamedTuple
from numpy import datetime64

from arcticdb.supported_types import Timestamp
from arcticdb.util._versions import IS_PANDAS_TWO

from arcticdb.version_store.processing import QueryBuilder
from arcticdb.version_store._store import NativeVersionStore, VersionedItem, VersionQueryInput
Expand Down Expand Up @@ -1503,6 +1505,11 @@ def get_description(self, symbol: str, as_of: Optional[AsOf] = None) -> SymbolDe
"""
info = self._nvs.get_info(symbol, as_of)
last_update_time = pd.to_datetime(info["last_update"], utc=True)
if IS_PANDAS_TWO:
# Pandas 2.0.0 now uses `datetime.timezone.utc` instead of `pytz.UTC`.
# See: https://github.com/pandas-dev/pandas/issues/34916
# We enforce the use of `pytz.UTC` for consistency.
last_update_time = last_update_time.replace(tzinfo=pytz.UTC)
columns = tuple(NameWithDType(n, t) for n, t in zip(info["col_names"]["columns"], info["dtype"]))
index = NameWithDType(info["col_names"]["index"], info["col_names"]["index_dtype"])
date_range = tuple(
Expand Down
16 changes: 14 additions & 2 deletions python/tests/integration/arcticdb/test_arctic_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@
random_strings_of_length,
random_floats,
)
import random
from arcticdb.util._versions import IS_PANDAS_TWO

import random

if AZURE_SUPPORT:
from azure.storage.blob import BlobServiceClient
Expand Down Expand Up @@ -1094,7 +1095,18 @@ def test_read_description_batch_high_amount(arctic_library):
assert results_list[idx].date_range == date_range_comp_with_utc
if version > 0:
assert results_list[idx].last_update_time > results_list[idx - 1].last_update_time
assert results_list[idx].last_update_time.tz == pytz.UTC

result_last_update_time = results_list[idx].last_update_time
tz = result_last_update_time.tz

if IS_PANDAS_TWO:
# Pandas 2.0.0 now uses `datetime.timezone.utc` instead of `pytz.UTC`.
# See: https://github.com/pandas-dev/pandas/issues/34916
# TODO: is there a better way to handle this edge case?
assert tz == timezone.utc
else:
assert isinstance(tz, pytz.BaseTzInfo)
assert tz == pytz.UTC


def test_read_description_batch_empty_nat(arctic_library):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2046,8 +2046,12 @@ def test_dynamic_schema_column_hash_update(lmdb_version_store_column_buckets):

lib.update("symbol", df2)
vit = lib.read("symbol")
# In Pandas < 2.0, updating a `DataFrame` uniquely storing integers with
# another `DataFrame` that is uniquely storing integers changes all the dtypes
# to float64.
df.update(df2)
assert_frame_equal(vit.data.astype("float"), df)
df = df.astype("int64", copy=False)
assert_frame_equal(vit.data, df)


def test_dynamic_schema_column_hash_append(lmdb_version_store_column_buckets):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""
import datetime
import sys

import numpy as np
import pandas as pd
import pytest

from arcticdb.exceptions import ArcticNativeNotYetImplemented
from arcticdb.util._versions import IS_PANDAS_TWO
from arcticdb.util.test import assert_frame_equal


Expand Down Expand Up @@ -81,19 +84,40 @@ def test_categorical_empty(lmdb_version_store, sym):
lib = lmdb_version_store
lib.write(sym, df)
read_df = lib.read(sym).data
# In Pandas 1.0, an Index is used by default when an empty dataframe or series is created,
# except if there are categorical columns in which case a RangeIndex is used.
#
# In Pandas 2.0, a RangeIndex is used by default when _any_ empty dataframe or series is created.
# See: https://github.com/pandas-dev/pandas/issues/49572
assert isinstance(df.index, pd.RangeIndex)
assert isinstance(read_df.index, pd.RangeIndex)
assert_frame_equal(df, read_df)


def test_categorical_with_integers(lmdb_version_store, sym):
c = pd.Categorical(np.arange(6))
df = pd.DataFrame({"int": np.arange(6), "cat": c})
df = pd.DataFrame({"int": np.arange(6), "cat_int": c})
lib = lmdb_version_store
lib.write(sym, df)
read_df = lib.read(sym).data
# Not pickled
assert lib.get_info(sym)["type"] == "pandasdf"
# should be category
assert read_df.cat.dtype == "category"
assert read_df.cat_int.dtype == "category"
if IS_PANDAS_TWO and sys.platform.startswith("win32"):
# Pandas 2.0.0 changed the underlying creation from numpy integral arrays:
# "Instantiating using a numpy numeric array now follows the dtype of the numpy array.
# Previously, all indexes created from numpy numeric arrays were forced to 64-bit.
# Now, for example, Index(np.array([1, 2, 3])) will be int32 on 32-bit systems,
# where it previously would have been int64 even on 32-bit systems.
# Instantiating Index using a list of numbers will still return 64bit dtypes,
# e.g. Index([1, 2, 3]) will have a int64 dtype, which is the same as previously."
# See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#index-can-now-hold-numpy-numeric-dtypes
# We have no control over the underlying integral array storing code for categorical columns
# so we replace the categorical column with its codes to perform the comparison with identical dtypes.
df.cat_int = df.cat_int.cat.codes.astype(np.int32)
read_df.cat_int = read_df.cat_int.cat.codes.astype(np.int32)

assert_frame_equal(df, read_df)


Expand All @@ -109,6 +133,20 @@ def test_categorical_with_integers_and_strings(lmdb_version_store, sym):
# should be category
assert read_df.cat_int.dtype == "category"
assert read_df.cat_str.dtype == "category"
if IS_PANDAS_TWO and sys.platform.startswith("win32"):
# Pandas 2.0.0 changed the underlying creation from numpy integral arrays:
# "Instantiating using a numpy numeric array now follows the dtype of the numpy array.
# Previously, all indexes created from numpy numeric arrays were forced to 64-bit.
# Now, for example, Index(np.array([1, 2, 3])) will be int32 on 32-bit systems,
# where it previously would have been int64 even on 32-bit systems.
# Instantiating Index using a list of numbers will still return 64bit dtypes,
# e.g. Index([1, 2, 3]) will have a int64 dtype, which is the same as previously."
# See: https://pandas.pydata.org/docs/dev/whatsnew/v2.0.0.html#index-can-now-hold-numpy-numeric-dtypes
# We have no control over the underlying integral array storing code for categorical columns
# so we replace the categorical column with its codes to perform the comparison with identical dtypes.
df.cat_int = df.cat_int.cat.codes.astype(np.int32)
read_df.cat_int = read_df.cat_int.cat.codes.astype(np.int32)

assert_frame_equal(df, read_df)


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import sys

from arcticdb.util.test import assert_frame_equal
from arcticdb.util._versions import IS_PANDAS_TWO
from arcticdb_ext.tools import AZURE_SUPPORT


Expand Down Expand Up @@ -81,4 +82,9 @@ def test_stress_multicolumn(lib_type, request):
output_df = lib.read(name).data
print("reading from arctic native: {}".format(pd.Timestamp("now") - now))

if IS_PANDAS_TWO and test_data.empty:
# In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created.
# The index has to be converted to a DatetimeIndex by ArcticDB to perform updates.
test_data.index = test_data.index.astype("datetime64[ns]")

assert_frame_equal(test_data, output_df)
38 changes: 36 additions & 2 deletions python/tests/unit/arcticdb/version_store/test_empty_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from arcticdb.version_store._common import TimeFrame
from arcticdb.util.test import assert_frame_equal, assert_series_equal
from arcticdb.util._versions import IS_PANDAS_TWO


def test_write_no_rows(lmdb_version_store, sym):
Expand Down Expand Up @@ -134,12 +135,45 @@ def test_empty_series(lmdb_version_store_dynamic_schema, sym):
ser = pd.Series([])
lmdb_version_store_dynamic_schema.write(sym, ser)
assert not lmdb_version_store_dynamic_schema.is_symbol_pickled(sym)
if IS_PANDAS_TWO:
# In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created.
# The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with
# Pandas 1.0.
ser.index = ser.index.astype("datetime64[ns]")

assert_series_equal(lmdb_version_store_dynamic_schema.read(sym).data, ser)


def test_fallback_to_pickle(lmdb_version_store, sym):
column_names = ["a", "b", "c"]
df = pd.DataFrame(columns=column_names)
lmdb_version_store.write(sym, df)
assert lmdb_version_store.is_symbol_pickled(sym)
assert_frame_equal(df, lmdb_version_store.read(sym).data)

# In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created.
# The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with Pandas 1.0.
assert isinstance(df.index, pd.RangeIndex if IS_PANDAS_TWO else pd.Index)

if IS_PANDAS_TWO:
# In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created.
# The index has to be converted to a DatetimeIndex by ArcticDB to perform updates.
df.index = df.index.astype("datetime64[ns]")

# Before Pandas 2.0, empty Series' dtype was "float64" and empty DataFrames' Columns' dtype was "object".
# As of Pandas 2.0, empty Series' dtype is "object" and empty DataFrames' Columns' dtype remains "object".
# See: https://github.com/pandas-dev/pandas/issues/17261
# When normalizing in Pandas 2.0, we convert empty Series' dtype to "float64" to be consistent
# with the behavior of ArcticDB with Pandas 1.0.
# The same logic is used to normalize empty DataFrames' columns.

# Hence:
if IS_PANDAS_TWO:
# In Pandas 2.0, empty Dataframes can now be stored without being pickled.
assert not lmdb_version_store.is_symbol_pickled(sym)
# and ArcticDB returns empty DataFrames with float64 for all columns if they have not been specified.
df = df.astype("float64", copy=False)
assert_frame_equal(df, lmdb_version_store.read(sym).data)
else:
# In Pandas 1.0, empty Dataframes are pickled.
assert lmdb_version_store.is_symbol_pickled(sym)
# and ArcticDB simply deserialize empty DataFrames.
assert_frame_equal(df, lmdb_version_store.read(sym).data)
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
from arcticdb_ext.storage import NoDataFoundException
from arcticdb.version_store.processing import QueryBuilder
from arcticdb_ext.exceptions import InternalException, UserInputException
from arcticdb.util.test import assert_frame_equal, PANDAS_VERSION
from arcticdb.util.test import assert_frame_equal
from arcticdb.util._versions import PANDAS_VERSION
from arcticdb.util.hypothesis import (
use_of_function_scoped_fixtures_in_hypothesis_checked,
integral_type_strategies,
Expand Down
Loading