Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions cpp/arcticdb/entity/types.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@ std::string_view datatype_to_str(const DataType dt) {
TO_STR(UTF_FIXED64)
TO_STR(UTF_DYNAMIC64)
TO_STR(EMPTYVAL)
// TO_STR(UTF8_STRING)
// TO_STR(BYTES)
// TO_STR(PICKLE)
#undef TO_STR
default:return std::string_view("UNKNOWN");
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/arcticdb/entity/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,6 @@ enum class DataType : uint8_t {
ASCII_DYNAMIC64 = detail::combine_val_bits(ValueType::ASCII_DYNAMIC, SizeBits::S64),
UTF_FIXED64 = detail::combine_val_bits(ValueType::UTF8_FIXED, SizeBits::S64),
UTF_DYNAMIC64 = detail::combine_val_bits(ValueType::UTF_DYNAMIC, SizeBits::S64),
BYTES_DYNAMIC64 = detail::combine_val_bits(ValueType::BYTES, SizeBits::S64),
EMPTYVAL = detail::combine_val_bits(ValueType::EMPTY, SizeBits::S64),
#undef DT_COMBINE
UNKNOWN = 0,
Expand Down Expand Up @@ -338,6 +337,7 @@ constexpr char get_dtype_specifier(ValueType vt){
case ValueType::UTF8_FIXED: return 'U';
case ValueType::ASCII_FIXED: return 'S';
case ValueType::BYTES: return 'O';
case ValueType::EMPTY: return 'O';
default:
return 'x';
}
Expand Down
9 changes: 6 additions & 3 deletions cpp/arcticdb/python/python_to_tensor_frame.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ NativeTensor obj_to_tensor(PyObject *ptr) {
auto arr = pybind11::detail::array_proxy(ptr);
auto descr = pybind11::detail::array_descriptor_proxy(arr->descr);
auto ndim = arr->nd;
auto val_type = get_value_type(descr->kind);
auto val_bytes = static_cast<uint8_t>(descr->elsize);
ssize_t size = ndim == 1 ? arr->dimensions[0] : arr->dimensions[0] * arr->dimensions[1];
auto val_type = size > 0 ? get_value_type(descr->kind) : ValueType::EMPTY;
auto val_bytes = static_cast<uint8_t>(descr->elsize);
auto c_style = arr->strides[0] == val_bytes;

if (is_sequence_type(val_type)) {
Expand Down Expand Up @@ -108,7 +108,10 @@ NativeTensor obj_to_tensor(PyObject *ptr) {
}
}

auto dt = combine_data_type(val_type, get_size_bits(val_bytes));
// When processing empty collections, the size bits have to be `SizeBits::S64`,
// and we can't use `val_bytes` to get this information since some dtype have another `elsize` than 8.
SizeBits size_bits = val_type == ValueType::EMPTY ? SizeBits::S64 : get_size_bits(val_bytes);
auto dt = combine_data_type(val_type, size_bits);
ssize_t nbytes = size * descr->elsize;
return {nbytes, ndim, arr->strides, arr->dimensions, dt, descr->elsize, arr->data};
}
Expand Down
18 changes: 15 additions & 3 deletions cpp/arcticdb/stream/index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <arcticdb/entity/index_range.hpp>
#include <arcticdb/pipeline/index_fields.hpp>
#include <arcticdb/entity/stream_descriptor.hpp>
#include <arcticdb/entity/type_utils.hpp>

#include <folly/Range.h>

Expand Down Expand Up @@ -80,9 +81,20 @@ class TimeseriesIndex : public BaseIndex<TimeseriesIndex> {
}

void check(const FieldCollection &fields) const {
util::check_arg(fields.size() >= int(field_count()), "expected at least {} fields, actual {}",
field_count(), fields.size());
util::check_arg(fields[0].type() == this->field(0).type(), "expected field[0]={}, actual {}",
const size_t fields_size = fields.size();
const int current_fields_size = int(field_count());

const TypeDescriptor &first_field_type = fields[0].type();
const TypeDescriptor &current_first_field_type = this->field(0).type();

const bool valid_type_promotion = has_valid_type_promotion(first_field_type, current_first_field_type).has_value();
const bool trivial_type_compatibility = trivially_compatible_types(first_field_type, current_first_field_type);

const bool compatible_types = valid_type_promotion || trivial_type_compatibility;

util::check_arg(fields_size >= current_fields_size, "expected at least {} fields, actual {}",
current_fields_size, fields_size);
util::check_arg(compatible_types, "expected field[0]={}, actual {}",
this->field(0), fields[0]);
}

Expand Down
8 changes: 7 additions & 1 deletion cpp/arcticdb/version/schema_checks.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,13 @@ inline bool columns_match(const StreamDescriptor &left, const StreamDescriptor &
if (left.fields(i).name() != right.fields(i).name())
return false;

if (!trivially_compatible_types(left.fields(i).type(), right.fields(i).type()))
const TypeDescriptor &left_type = left.fields(i).type();
const TypeDescriptor &right_type = right.fields(i).type();

const bool valid_type_promotion = has_valid_type_promotion(left_type, right_type).has_value();
const bool trivial_type_compatibility = trivially_compatible_types(left_type, right_type);

if (!trivial_type_compatibility and !valid_type_promotion)
return false;
}
return true;
Expand Down
28 changes: 10 additions & 18 deletions python/arcticdb/version_store/_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from collections import Counter
from arcticdb.exceptions import ArcticNativeException, ArcticDbNotYetImplemented
from arcticdb.supported_types import DateRangeInput, time_types as supported_time_types
from arcticdb.util._versions import IS_PANDAS_TWO
from arcticdb.util._versions import IS_PANDAS_TWO, IS_PANDAS_ZERO
from arcticdb.version_store.read_result import ReadResult
from arcticdb_ext.version_store import SortedValue as _SortedValue
from pandas.core.internals import make_block
Expand Down Expand Up @@ -196,19 +196,6 @@ def _to_primitive(arr, arr_name, dynamic_strings, string_max_len=None, coerce_co
return arr

if len(arr) == 0:
if coerce_column_type is None:
if IS_PANDAS_TWO:
# Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0. empty series' dtype became object.
# See: https://github.com/pandas-dev/pandas/issues/17261
# We want to maintain consistent behaviour, so we treat empty series as containing floats.
# val_type = ValueType::FLOAT;
coerce_column_type = float
return arr.astype(coerce_column_type)
else:
raise ArcticDbNotYetImplemented(
"coercing column type is required when empty column of object type, Column type={} for column={}"
.format(arr.dtype, arr_name)
)
return arr.astype(coerce_column_type)

# Coerce column allows us to force a column to the given type, which means we can skip expensive iterations in
Expand Down Expand Up @@ -594,12 +581,12 @@ def denormalize(self, item, norm_meta):
else:
s.name = None

if s.empty and IS_PANDAS_TWO:
if s.empty:
# Before Pandas 2.0, empty series' dtype was float, but as of Pandas 2.0. empty series' dtype became object.
# See: https://github.com/pandas-dev/pandas/issues/17261
# We want to maintain consistent behaviour, so we return empty series as containing objects
# when the Pandas version is >= 2.0
s = s.astype("object")
s = s.astype("object") if IS_PANDAS_TWO else s.astype("float")

return s

Expand Down Expand Up @@ -738,8 +725,13 @@ def denormalize(self, item, norm_meta):
for key in norm_meta.common.categories:
if key in data:
category_info = list(norm_meta.common.categories[key].category)
res = pd.Categorical.from_codes(codes=data[key], categories=category_info)
df[key] = res
codes = data[key]
# `pd.Categorical.from_codes` from `pandas~=0.25.x` (pandas' supported version for python 3.6)
# does not support `codes` of `dtype=object`: it has to have an integral dtype.
# See: https://github.com/pandas-dev/pandas/blob/0.25.x/pandas/core/arrays/categorical.py#L688-L704
if IS_PANDAS_ZERO:
codes = np.asarray(codes, dtype=int)
df[key] = pd.Categorical.from_codes(codes=codes, categories=category_info)
for key in norm_meta.common.int_categories:
if key in data:
category_info = list(norm_meta.common.int_categories[key].category)
Expand Down
34 changes: 21 additions & 13 deletions python/tests/unit/arcticdb/version_store/test_empty_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@

As of the Change Date specified in that file, in accordance with the Business Source License, use of this software will be governed by the Apache License, version 2.0.
"""
import pytest
import pandas as pd
import numpy as np

from arcticdb.version_store._common import TimeFrame
from arcticdb.util.test import assert_frame_equal, assert_series_equal
from arcticdb.util._versions import IS_PANDAS_TWO


def test_write_no_rows(lmdb_version_store, sym):
Expand All @@ -21,21 +21,21 @@ def test_write_no_rows(lmdb_version_store, sym):
assert not lmdb_version_store.is_symbol_pickled(sym)
df.index = df.index.astype("datetime64[ns]")
df["a"] = df["a"].astype("float64")
assert_frame_equal(lmdb_version_store.read(sym).data, df)
assert_frame_equal(lmdb_version_store.read(sym).data, df, check_index_type=False, check_dtype=False)

df2 = pd.DataFrame([[1.3, 6, "test"]], columns=column_names, index=[pd.Timestamp(0)])
df2 = pd.concat((df, df2))
# coercing not needed
lmdb_version_store.append(sym, df2, dynamic_strings=True)
assert_frame_equal(lmdb_version_store.read(sym).data, df2)
assert_frame_equal(lmdb_version_store.read(sym).data, df2, check_index_type=False, check_dtype=False)

df3 = pd.DataFrame(
[[3.3, 8, None], [2.3, 10, "test2"]], columns=column_names, index=[pd.Timestamp(1), pd.Timestamp(2)]
)
df2 = pd.concat((df2, df3))
# coercing not needed
lmdb_version_store.append(sym, df3, dynamic_strings=True)
assert_frame_equal(lmdb_version_store.read(sym).data, df2)
assert_frame_equal(lmdb_version_store.read(sym).data, df2, check_index_type=False, check_dtype=False)


def test_write_no_columns_dynamic_schema(lmdb_version_store_dynamic_schema, sym):
Expand Down Expand Up @@ -89,12 +89,12 @@ def test_write_no_rows_and_columns(lmdb_version_store_dynamic_schema, sym):
lmdb_version_store_dynamic_schema.write(sym, df)
assert not lmdb_version_store_dynamic_schema.is_symbol_pickled(sym)
df.index = df.index.astype("datetime64[ns]")
assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df)
assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df, check_index_type=False, check_dtype=False)

df2 = pd.DataFrame([[1.3, 6, "test"]], columns=column_names, index=[pd.Timestamp(2)])
lmdb_version_store_dynamic_schema.append(sym, df2)
ans = lmdb_version_store_dynamic_schema.read(sym).data
assert_frame_equal(ans, df2)
assert_frame_equal(ans, df2, check_index_type=False, check_dtype=False)

df4 = pd.DataFrame(
[[3.3, 8, None, 3.5], [2.3, 10, "test2"]],
Expand All @@ -103,7 +103,7 @@ def test_write_no_rows_and_columns(lmdb_version_store_dynamic_schema, sym):
)
df5 = pd.concat((df2, df4))
lmdb_version_store_dynamic_schema.append(sym, df4, dynamic_strings=True)
assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df5)
assert_frame_equal(lmdb_version_store_dynamic_schema.read(sym).data, df5, check_index_type=False, check_dtype=False)


def test_update_no_columns_dynamic_schema(lmdb_version_store_dynamic_schema, sym):
Expand Down Expand Up @@ -135,10 +135,18 @@ def test_empty_series(lmdb_version_store_dynamic_schema, sym):
ser = pd.Series([])
lmdb_version_store_dynamic_schema.write(sym, ser)
assert not lmdb_version_store_dynamic_schema.is_symbol_pickled(sym)
if IS_PANDAS_TWO:
# In Pandas 2.0, RangeIndex is used by default when an empty dataframe or series is created.
# The index is converted to a DatetimeIndex for preserving the behavior of ArcticDB with
# Pandas 1.0.
ser.index = ser.index.astype("datetime64[ns]")

assert_series_equal(lmdb_version_store_dynamic_schema.read(sym).data, ser)
assert_series_equal(lmdb_version_store_dynamic_schema.read(sym).data, ser, check_index_type=False)


@pytest.mark.parametrize("dtype", ["int64", "float64"])
def test_append_empty_series(lmdb_version_store_dynamic_schema, sym, dtype):
    # Writing an empty series must not fall back to pickling, and a
    # round-trip read should give back the same (empty) series.
    lib = lmdb_version_store_dynamic_schema
    empty = pd.Series([])
    lib.write(sym, empty)
    assert not lib.is_symbol_pickled(sym)

    assert_series_equal(lib.read(sym).data, empty, check_index_type=False)

    # Appending typed rows onto the empty symbol should leave the symbol
    # holding exactly the appended data, with the appended dtype.
    populated = pd.Series([1, 2, 3], dtype=dtype)
    lib.append(sym, populated)
    assert_series_equal(lib.read(sym).data, populated)