Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions cpp/arcticdb/entity/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@ std::string_view datatype_to_str(const DataType dt) {
TO_STR(UTF_FIXED64)
TO_STR(UTF_DYNAMIC64)
TO_STR(EMPTYVAL)
// TO_STR(UTF8_STRING)
// TO_STR(BYTES)
// TO_STR(PICKLE)
#undef TO_STR
default:return std::string_view("UNKNOWN");
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/arcticdb/entity/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,6 @@ enum class DataType : uint8_t {
ASCII_DYNAMIC64 = detail::combine_val_bits(ValueType::ASCII_DYNAMIC, SizeBits::S64),
UTF_FIXED64 = detail::combine_val_bits(ValueType::UTF8_FIXED, SizeBits::S64),
UTF_DYNAMIC64 = detail::combine_val_bits(ValueType::UTF_DYNAMIC, SizeBits::S64),
BYTES_DYNAMIC64 = detail::combine_val_bits(ValueType::BYTES, SizeBits::S64),
EMPTYVAL = detail::combine_val_bits(ValueType::EMPTY, SizeBits::S64),
#undef DT_COMBINE
UNKNOWN = 0,
Expand Down Expand Up @@ -338,6 +337,7 @@ constexpr char get_dtype_specifier(ValueType vt){
case ValueType::UTF8_FIXED: return 'U';
case ValueType::ASCII_FIXED: return 'S';
case ValueType::BYTES: return 'O';
case ValueType::EMPTY: return 'O';
default:
return 'x';
}
Expand Down
9 changes: 6 additions & 3 deletions cpp/arcticdb/python/python_to_tensor_frame.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ NativeTensor obj_to_tensor(PyObject *ptr) {
auto arr = pybind11::detail::array_proxy(ptr);
auto descr = pybind11::detail::array_descriptor_proxy(arr->descr);
auto ndim = arr->nd;
auto val_type = get_value_type(descr->kind);
auto val_bytes = static_cast<uint8_t>(descr->elsize);
ssize_t size = ndim == 1 ? arr->dimensions[0] : arr->dimensions[0] * arr->dimensions[1];
auto val_type = size > 0 ? get_value_type(descr->kind) : ValueType::EMPTY;
auto val_bytes = static_cast<uint8_t>(descr->elsize);
auto c_style = arr->strides[0] == val_bytes;

if (is_sequence_type(val_type)) {
Expand Down Expand Up @@ -108,7 +108,10 @@ NativeTensor obj_to_tensor(PyObject *ptr) {
}
}

auto dt = combine_data_type(val_type, get_size_bits(val_bytes));
// When processing empty collections, the size bits have to be `SizeBits::S64`,
// and we can't use `val_bytes` to get this information since some dtype have another `elsize` than 8.
SizeBits size_bits = val_type == ValueType::EMPTY ? SizeBits::S64 : get_size_bits(val_bytes);
auto dt = combine_data_type(val_type, size_bits);
ssize_t nbytes = size * descr->elsize;
return {nbytes, ndim, arr->strides, arr->dimensions, dt, descr->elsize, arr->data};
}
Expand Down
18 changes: 15 additions & 3 deletions cpp/arcticdb/stream/index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <arcticdb/entity/index_range.hpp>
#include <arcticdb/pipeline/index_fields.hpp>
#include <arcticdb/entity/stream_descriptor.hpp>
#include <arcticdb/entity/type_utils.hpp>

#include <folly/Range.h>

Expand Down Expand Up @@ -80,9 +81,20 @@ class TimeseriesIndex : public BaseIndex<TimeseriesIndex> {
}

void check(const FieldCollection &fields) const {
util::check_arg(fields.size() >= int(field_count()), "expected at least {} fields, actual {}",
field_count(), fields.size());
util::check_arg(fields[0].type() == this->field(0).type(), "expected field[0]={}, actual {}",
const size_t fields_size = fields.size();
const int current_fields_size = int(field_count());

const TypeDescriptor &first_field_type = fields[0].type();
const TypeDescriptor &current_first_field_type = this->field(0).type();

const bool valid_type_promotion = has_valid_type_promotion(first_field_type, current_first_field_type).has_value();
const bool trivial_type_compatibility = trivially_compatible_types(first_field_type, current_first_field_type);

const bool compatible_types = valid_type_promotion || trivial_type_compatibility;

util::check_arg(fields_size >= current_fields_size, "expected at least {} fields, actual {}",
current_fields_size, fields_size);
util::check_arg(compatible_types, "expected field[0]={}, actual {}",
this->field(0), fields[0]);
}

Expand Down
8 changes: 7 additions & 1 deletion cpp/arcticdb/version/schema_checks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,13 @@ inline bool columns_match(const StreamDescriptor &left, const StreamDescriptor &
if (left.fields(i).name() != right.fields(i).name())
return false;

if (!trivially_compatible_types(left.fields(i).type(), right.fields(i).type()))
const TypeDescriptor &left_type = left.fields(i).type();
const TypeDescriptor &right_type = right.fields(i).type();

const bool valid_type_promotion = has_valid_type_promotion(left_type, right_type).has_value();
const bool trivial_type_compatibility = trivially_compatible_types(left_type, right_type);

if (!trivial_type_compatibility and !valid_type_promotion)
return false;
}
return true;
Expand Down
28 changes: 10 additions & 18 deletions python/arcticdb/version_store/_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from collections import Counter
from arcticdb.exceptions import ArcticNativeException, ArcticDbNotYetImplemented
from arcticdb.supported_types import DateRangeInput, time_types as supported_time_types
from arcticdb.util._versions import IS_PANDAS_TWO
from arcticdb.util._versions import IS_PANDAS_TWO, IS_PANDAS_ZERO
from arcticdb.version_store.read_result import ReadResult
from arcticdb_ext.version_store import SortedValue as _SortedValue
from pandas.core.internals import make_block
Expand Down Expand Up @@ -196,19 +196,6 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co
return arr

if len(arr) == 0:
if coerce_column_type is None:
if IS_PANDAS_TWO:
# Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0. empty series' dtype became object.
# See: https://github.com/pandas-dev/pandas/issues/17261
# We want to maintain consistent behaviour, so we treat empty series as containing floats.
# val_type = ValueType::FLOAT;
coerce_column_type = float
return arr.astype(coerce_column_type)
else:
raise ArcticDbNotYetImplemented(
"coercing column type is required when empty column of object type, Column type={} for column={}"
.format(arr.dtype, arr_name)
)
return arr.astype(coerce_column_type)

# Coerce column allows us to force a column to the given type, which means we can skip expensive iterations in
Expand Down Expand Up @@ -594,12 +581,12 @@ def denormalize(self, item, norm_meta):
else:
s.name = None

if s.empty and IS_PANDAS_TWO:
if s.empty:
# Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0. empty series' dtype became object.
# See: https://github.com/pandas-dev/pandas/issues/17261
# We want to maintain consistent behaviour, so we return empty series as containing objects
# when the Pandas version is >= 2.0
s = s.astype("object")
s = s.astype("object") if IS_PANDAS_TWO else s.astype("float")

return s

Expand Down Expand Up @@ -738,8 +725,13 @@ def denormalize(self, item, norm_meta):
for key in norm_meta.common.categories:
if key in data:
category_info = list(norm_meta.common.categories[key].category)
res = pd.Categorical.from_codes(codes=data[key], categories=category_info)
df[key] = res
codes = data[key]
# `pd.Categorical.from_codes` from `pandas~=0.25.x` (pandas' supported version for python 3.6)
# does not support `codes` of `dtype=object`: it has to have an integral dtype.
# See: https://github.com/pandas-dev/pandas/blob/0.25.x/pandas/core/arrays/categorical.py#L688-L704
if IS_PANDAS_ZERO:
codes = np.asarray(codes, dtype=int)
df[key] = pd.Categorical.from_codes(codes=codes, categories=category_info)
for key in norm_meta.common.int_categories:
if key in data:
category_info = list(norm_meta.common.int_categories[key].category)
Expand Down
34 changes: 21 additions & 13 deletions python/tests/unit/arcticdb/version_store/test_empty_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""
import pytest
import pandas as pd
import numpy as np

from arcticdb.version_store._common import TimeFrame
from arcticdb.util.test import assert_frame_equal, assert_series_equal
from arcticdb.util._versions import IS_PANDAS_TWO


def test_write_no_rows(lmdb_version_store, sym):
Expand All @@ -21,21 +21,21 @@ def test_write_no_rows(lmdb_version_store, sym):
assert not lmdb_version_store.is_symbol_pickled(sym)
df.index = df.index.astype("datetime64[ns]")
df["a"] = df["a"].astype("float64")
assert_frame_equal(lmdb_version_store.read(sym).data, df)
assert_frame_equal(lmdb_version_store.read(sym).data, df, check_index_type=False, check_dtype=False)

df2 = pd.DataFrame([[1.3, 6, "test"]], columns=column_names, index=[pd.Timestamp(0)])
df2 = pd.concat((df, df2))
# coercing not needed
lmdb_version_store.append(sym, df2, dynamic_strings=True)
assert_frame_equal(lmdb_version_store.read(sym).data, df2)
assert_frame_equal(lmdb_version_store.read(sym).data, df2, check_index_type=False, check_dtype=False)

df3 = pd.DataFrame(
[[3.3, 8, None], [2.3, 10, "test2"]], columns=column_names, index=[pd.Timestamp(1), pd.Timestamp(2)]
)
df2 = pd.concat((df2, df3))
# coercing not needed
lmdb_version_store.append(sym, df3, dynamic_strings=True)
assert_frame_equal(lmdb_version_store.read(sym).data, df2)
assert_frame_equal(lmdb_version_store.read(sym).data, df2, check_index_type=False, check_dtype=False)


def test_write_no_columns_dynamic_schema(lmdb_version_store_dynamic_schema, sym):
Expand Down Expand Up @@ -89,12 +89,12 @@ def test_write_no_rows_and_columns(lmdb_version_store_dynamic_schema, sym):
lmdb_version_store_dynamic_schema.write(sym, df)
assert not lmdb_version_store_dynamic_schema.is_symbol_pickled(sym)
df.index = df.index.astype("datetime64[ns]")
assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df)
assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df, check_index_type=False, check_dtype=False)

df2 = pd.DataFrame([[1.3, 6, "test"]], columns=column_names, index=[pd.Timestamp(2)])
lmdb_version_store_dynamic_schema.append(sym, df2)
ans = lmdb_version_store_dynamic_schema.read(sym).data
assert_frame_equal(ans, df2)
assert_frame_equal(ans, df2, check_index_type=False, check_dtype=False)

df4 = pd.DataFrame(
[[3.3, 8, None, 3.5], [2.3, 10, "test2"]],
Expand All @@ -103,7 +103,7 @@ def test_write_no_rows_and_columns(lmdb_version_store_dynamic_schema, sym):
)
df5 = pd.concat((df2, df4))
lmdb_version_store_dynamic_schema.append(sym, df4, dynamic_strings=True)
assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df5)
assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df5, check_index_type=False, check_dtype=False)


def test_update_no_columns_dynamic_schema(lmdb_version_store_dynamic_schema, sym):
Expand Down Expand Up @@ -135,10 +135,18 @@ def test_empty_series(lmdb_version_store_dynamic_schema, sym):
ser = pd.Series([])
lmdb_version_store_dynamic_schema.write(sym, ser)
assert not lmdb_version_store_dynamic_schema.is_symbol_pickled(sym)
if IS_PANDAS_TWO:
# In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created.
# The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with
# Pandas 1.0.
ser.index = ser.index.astype("datetime64[ns]")

assert_series_equal(lmdb_version_store_dynamic_schema.read(sym).data, ser)
assert_series_equal(lmdb_version_store_dynamic_schema.read(sym).data, ser, check_index_type=False)


@pytest.mark.parametrize("dtype", ["int64", "float64"])
def test_append_empty_series(lmdb_version_store_dynamic_schema, sym, dtype):
    # Writing an empty series must not fall back to pickling, and a
    # round-trip read should give back the same (empty) series.
    lib = lmdb_version_store_dynamic_schema
    empty = pd.Series([])
    lib.write(sym, empty)
    assert not lib.is_symbol_pickled(sym)

    assert_series_equal(lib.read(sym).data, empty, check_index_type=False)

    # Appending typed rows onto the empty symbol should leave the symbol
    # holding exactly the appended data, with the appended dtype.
    populated = pd.Series([1, 2, 3], dtype=dtype)
    lib.append(sym, populated)
    assert_series_equal(lib.read(sym).data, populated)