diff --git a/cpp/arcticdb/entity/types.cpp b/cpp/arcticdb/entity/types.cpp
index 5683d0ca20..dfc9a74798 100644
--- a/cpp/arcticdb/entity/types.cpp
+++ b/cpp/arcticdb/entity/types.cpp
@@ -36,9 +36,6 @@ std::string_view datatype_to_str(const DataType dt) {
         TO_STR(UTF_FIXED64)
         TO_STR(UTF_DYNAMIC64)
         TO_STR(EMPTYVAL)
-//        TO_STR(UTF8_STRING)
-//        TO_STR(BYTES)
-//        TO_STR(PICKLE)
 #undef TO_STR
     default:return std::string_view("UNKNOWN");
     }
diff --git a/cpp/arcticdb/entity/types.hpp b/cpp/arcticdb/entity/types.hpp
index aaceb12d0c..d5d3db6afa 100644
--- a/cpp/arcticdb/entity/types.hpp
+++ b/cpp/arcticdb/entity/types.hpp
@@ -203,7 +203,6 @@ enum class DataType : uint8_t {
     ASCII_DYNAMIC64 = detail::combine_val_bits(ValueType::ASCII_DYNAMIC, SizeBits::S64),
     UTF_FIXED64 = detail::combine_val_bits(ValueType::UTF8_FIXED, SizeBits::S64),
     UTF_DYNAMIC64 = detail::combine_val_bits(ValueType::UTF_DYNAMIC, SizeBits::S64),
-    BYTES_DYNAMIC64 = detail::combine_val_bits(ValueType::BYTES, SizeBits::S64),
     EMPTYVAL = detail::combine_val_bits(ValueType::EMPTY, SizeBits::S64),
 #undef DT_COMBINE
     UNKNOWN = 0,
@@ -338,6 +337,7 @@ constexpr char get_dtype_specifier(ValueType vt){
         case ValueType::UTF8_FIXED: return 'U';
         case ValueType::ASCII_FIXED: return 'S';
         case ValueType::BYTES: return 'O';
+        case ValueType::EMPTY: return 'O';
         default:
             return 'x';
     }
diff --git a/cpp/arcticdb/python/python_to_tensor_frame.cpp b/cpp/arcticdb/python/python_to_tensor_frame.cpp
index bb885a924d..9f525b2231 100644
--- a/cpp/arcticdb/python/python_to_tensor_frame.cpp
+++ b/cpp/arcticdb/python/python_to_tensor_frame.cpp
@@ -52,9 +52,9 @@ NativeTensor obj_to_tensor(PyObject *ptr) {
     auto arr = pybind11::detail::array_proxy(ptr);
     auto descr = pybind11::detail::array_descriptor_proxy(arr->descr);
     auto ndim = arr->nd;
-    auto val_type = get_value_type(descr->kind);
-    auto val_bytes = static_cast(descr->elsize);
     ssize_t size = ndim == 1 ? arr->dimensions[0] : arr->dimensions[0] * arr->dimensions[1];
+    auto val_type = size > 0 ? get_value_type(descr->kind) : ValueType::EMPTY;
+    auto val_bytes = static_cast(descr->elsize);
     auto c_style = arr->strides[0] == val_bytes;

     if (is_sequence_type(val_type)) {
@@ -108,7 +108,10 @@ NativeTensor obj_to_tensor(PyObject *ptr) {
         }
     }

-    auto dt = combine_data_type(val_type, get_size_bits(val_bytes));
+    // When processing empty collections, the size bits have to be `SizeBits::S64`,
+    // and we can't use `val_bytes` to get this information since some dtypes have an `elsize` other than 8.
+    SizeBits size_bits = val_type == ValueType::EMPTY ? SizeBits::S64 : get_size_bits(val_bytes);
+    auto dt = combine_data_type(val_type, size_bits);
     ssize_t nbytes = size * descr->elsize;
     return {nbytes, ndim, arr->strides, arr->dimensions, dt, descr->elsize, arr->data};
 }
diff --git a/cpp/arcticdb/stream/index.hpp b/cpp/arcticdb/stream/index.hpp
index 39b11c92f1..ac111f25ec 100644
--- a/cpp/arcticdb/stream/index.hpp
+++ b/cpp/arcticdb/stream/index.hpp
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include

 #include
@@ -80,9 +81,20 @@ class TimeseriesIndex : public BaseIndex {
     }

     void check(const FieldCollection &fields) const {
-        util::check_arg(fields.size() >= int(field_count()), "expected at least {} fields, actual {}",
-                        field_count(), fields.size());
-        util::check_arg(fields[0].type() == this->field(0).type(), "expected field[0]={}, actual {}",
+        const size_t fields_size = fields.size();
+        const int current_fields_size = int(field_count());
+
+        const TypeDescriptor &first_field_type = fields[0].type();
+        const TypeDescriptor &current_first_field_type = this->field(0).type();
+
+        const bool valid_type_promotion = has_valid_type_promotion(first_field_type, current_first_field_type).has_value();
+        const bool trivial_type_compatibility = trivially_compatible_types(first_field_type, current_first_field_type);
+
+        const bool compatible_types = valid_type_promotion || trivial_type_compatibility;
+
+        util::check_arg(fields_size >= current_fields_size, "expected at least {} fields, actual {}",
+                        current_fields_size, fields_size);
+        util::check_arg(compatible_types, "expected field[0]={}, actual {}",
                         this->field(0), fields[0]);
     }
diff --git a/cpp/arcticdb/version/schema_checks.hpp b/cpp/arcticdb/version/schema_checks.hpp
index 0e01b176c6..9349b1f119 100644
--- a/cpp/arcticdb/version/schema_checks.hpp
+++ b/cpp/arcticdb/version/schema_checks.hpp
@@ -63,7 +63,13 @@ inline bool columns_match(const StreamDescriptor &left, const StreamDescriptor &
         if (left.fields(i).name() != right.fields(i).name())
             return false;

-        if (!trivially_compatible_types(left.fields(i).type(), right.fields(i).type()))
+        const TypeDescriptor &left_type = left.fields(i).type();
+        const TypeDescriptor &right_type = right.fields(i).type();
+
+        const bool valid_type_promotion = has_valid_type_promotion(left_type, right_type).has_value();
+        const bool trivial_type_compatibility = trivially_compatible_types(left_type, right_type);
+
+        if (!trivial_type_compatibility and !valid_type_promotion)
             return false;
     }
     return true;
diff --git a/python/arcticdb/version_store/_normalization.py b/python/arcticdb/version_store/_normalization.py
index 56084c3483..dd0148260f 100644
--- a/python/arcticdb/version_store/_normalization.py
+++ b/python/arcticdb/version_store/_normalization.py
@@ -25,7 +25,7 @@
 from collections import Counter
 from arcticdb.exceptions import ArcticNativeException, ArcticDbNotYetImplemented
 from arcticdb.supported_types import DateRangeInput, time_types as supported_time_types
-from arcticdb.util._versions import IS_PANDAS_TWO
+from arcticdb.util._versions import IS_PANDAS_TWO, IS_PANDAS_ZERO
 from arcticdb.version_store.read_result import ReadResult
 from arcticdb_ext.version_store import SortedValue as _SortedValue
 from pandas.core.internals import make_block
@@ -196,19 +196,6 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co
         return arr

     if len(arr) == 0:
-        if coerce_column_type is None:
-            if IS_PANDAS_TWO:
-                # Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0. empty series' dtype became object.
-                # See: https://github.com/pandas-dev/pandas/issues/17261
-                # We want to maintain consistent behaviour, so we treat empty series as containing floats.
-                # val_type = ValueType::FLOAT;
-                coerce_column_type = float
-                return arr.astype(coerce_column_type)
-            else:
-                raise ArcticDbNotYetImplemented(
-                    "coercing column type is required when empty column of object type, Column type={} for column={}"
-                    .format(arr.dtype, arr_name)
-                )
         return arr.astype(coerce_column_type)

     # Coerce column allows us to force a column to the given type, which means we can skip expensive iterations in
@@ -594,12 +581,12 @@ def denormalize(self, item, norm_meta):
         else:
             s.name = None

-        if s.empty and IS_PANDAS_TWO:
+        if s.empty:
             # Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0. empty series' dtype became object.
             # See: https://github.com/pandas-dev/pandas/issues/17261
             # We want to maintain consistent behaviour, so we return empty series as containing objects
             # when the Pandas version is >= 2.0
-            s = s.astype("object")
+            s = s.astype("object") if IS_PANDAS_TWO else s.astype("float")

         return s
@@ -738,8 +725,13 @@ def denormalize(self, item, norm_meta):
             for key in norm_meta.common.categories:
                 if key in data:
                     category_info = list(norm_meta.common.categories[key].category)
-                    res = pd.Categorical.from_codes(codes=data[key], categories=category_info)
-                    df[key] = res
+                    codes = data[key]
+                    # `pd.Categorical.from_codes` from `pandas~=0.25.x` (the pandas version supported for Python 3.6)
+                    # does not support `codes` of `dtype=object`: they must have an integral dtype.
+                    # See: https://github.com/pandas-dev/pandas/blob/0.25.x/pandas/core/arrays/categorical.py#L688-L704
+                    if IS_PANDAS_ZERO:
+                        codes = np.asarray(codes, dtype=int)
+                    df[key] = pd.Categorical.from_codes(codes=codes, categories=category_info)
             for key in norm_meta.common.int_categories:
                 if key in data:
                     category_info = list(norm_meta.common.int_categories[key].category)
diff --git a/python/tests/unit/arcticdb/version_store/test_empty_writes.py b/python/tests/unit/arcticdb/version_store/test_empty_writes.py
index c73f37e9d9..176959c6c3 100644
--- a/python/tests/unit/arcticdb/version_store/test_empty_writes.py
+++ b/python/tests/unit/arcticdb/version_store/test_empty_writes.py
@@ -5,12 +5,12 @@
 As of the Change Date specified in that file, in accordance with the Business Source License, use of this software
 will be governed by the Apache License, version 2.0.
 """
+import pytest
 import pandas as pd
 import numpy as np

 from arcticdb.version_store._common import TimeFrame
 from arcticdb.util.test import assert_frame_equal, assert_series_equal
-from arcticdb.util._versions import IS_PANDAS_TWO


 def test_write_no_rows(lmdb_version_store, sym):
@@ -21,7 +21,10 @@ def test_write_no_rows(lmdb_version_store, sym):
     assert not lmdb_version_store.is_symbol_pickled(sym)
     df.index = df.index.astype("datetime64[ns]")
     df["a"] = df["a"].astype("float64")
-    assert_frame_equal(lmdb_version_store.read(sym).data, df)
+
+    # ArcticDB stores empty columns under a dedicated `EMPTYVAL` type, so the types are not going to match with pandas
+    # until the first append.
+    assert_frame_equal(lmdb_version_store.read(sym).data, df, check_index_type=False, check_dtype=False)

     df2 = pd.DataFrame([[1.3, 6, "test"]], columns=column_names, index=[pd.Timestamp(0)])
     df2 = pd.concat((df, df2))
@@ -89,7 +92,9 @@ def test_write_no_rows_and_columns(lmdb_version_store_dynamic_schema, sym):
     lmdb_version_store_dynamic_schema.write(sym, df)
     assert not lmdb_version_store_dynamic_schema.is_symbol_pickled(sym)
     df.index = df.index.astype("datetime64[ns]")
-    assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df)
+    # ArcticDB stores empty columns under a dedicated `EMPTYVAL` type, so the types are not going to match with pandas
+    # until the first append.
+    assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df, check_index_type=False, check_dtype=False)

     df2 = pd.DataFrame([[1.3, 6, "test"]], columns=column_names, index=[pd.Timestamp(2)])
     lmdb_version_store_dynamic_schema.append(sym, df2)
@@ -135,10 +140,22 @@ def test_empty_series(lmdb_version_store_dynamic_schema, sym):
     ser = pd.Series([])
     lmdb_version_store_dynamic_schema.write(sym, ser)
     assert not lmdb_version_store_dynamic_schema.is_symbol_pickled(sym)
-    if IS_PANDAS_TWO:
-        # In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created.
-        # The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with
-        # Pandas 1.0.
-        ser.index = ser.index.astype("datetime64[ns]")
-    assert_series_equal(lmdb_version_store_dynamic_schema.read(sym).data, ser)
+    # ArcticDB stores empty columns under a dedicated `EMPTYVAL` type, so the types are not going to match with pandas
+    # until the first append.
+    assert_series_equal(lmdb_version_store_dynamic_schema.read(sym).data, ser, check_index_type=False)
+
+
+@pytest.mark.parametrize("dtype", ["int64", "float64"])
+def test_append_empty_series(lmdb_version_store_dynamic_schema, sym, dtype):
+    ser = pd.Series([])
+    lmdb_version_store_dynamic_schema.write(sym, ser)
+    assert not lmdb_version_store_dynamic_schema.is_symbol_pickled(sym)
+
+    # ArcticDB stores empty columns under a dedicated `EMPTYVAL` type, so the types are not going to match with pandas
+    # until the first append.
+    assert_series_equal(lmdb_version_store_dynamic_schema.read(sym).data, ser, check_index_type=False)
+
+    new_ser = pd.Series([1, 2, 3], dtype=dtype)
+    lmdb_version_store_dynamic_schema.append(sym, new_ser)
+    assert_series_equal(lmdb_version_store_dynamic_schema.read(sym).data, new_ser)
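
Note (illustration, not part of the patch): the new tests exercise this behaviour through the internal `lmdb_version_store_dynamic_schema` fixture. The snippet below is a minimal sketch of the same flow under the assumption that it also applies through the public `Arctic` API; the LMDB URI, library name and symbol are placeholders, and dynamic schema is enabled so a later append can settle the column type.

# Sketch only: an empty Series is written first (stored under the dedicated EMPTYVAL type),
# and the first non-empty append decides the concrete column type.
import pandas as pd
from arcticdb import Arctic, LibraryOptions

ac = Arctic("lmdb:///tmp/arcticdb_empty_demo")  # placeholder URI
lib = ac.get_library("demo", create_if_missing=True, library_options=LibraryOptions(dynamic_schema=True))

lib.write("sym", pd.Series([]))                          # empty column, no concrete dtype yet
lib.append("sym", pd.Series([1, 2, 3], dtype="int64"))   # first append fixes the type
print(lib.read("sym").data.dtype)                        # expected: int64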