diff --git a/c_glib/test/test-array.rb b/c_glib/test/test-array.rb index 43181742cc1..ee176b9949e 100644 --- a/c_glib/test/test-array.rb +++ b/c_glib/test/test-array.rb @@ -154,7 +154,7 @@ def test_diff def test_different_type array = build_string_array(["Start", "Shutdown", "Reboot"]) other_array = build_int8_array([2, 3, 6, 10]) - assert_equal("# Array types differed: string vs int8", + assert_equal("# Array types differed: string vs int8\n", array.diff_unified(other_array)) end end diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index c76c370e210..480c1d59799 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -16,6 +16,7 @@ # under the License. # don't add pandas here, because it is not a mandatory test dependency +cffi cython cloudpickle hypothesis diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 034ae61db75..be067d0f580 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -109,6 +109,7 @@ set(ARROW_SRCS tensor.cc type.cc visitor.cc + c/bridge.cc io/buffered.cc io/compressed.cc io/file.cc @@ -247,6 +248,7 @@ add_subdirectory(testing) # add_subdirectory(array) +add_subdirectory(c) add_subdirectory(io) add_subdirectory(util) add_subdirectory(vendored) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index b48e4d23b5b..fb8acbe2f94 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -642,6 +642,12 @@ Result> StructArray::Make( if (offset > length) { return Status::IndexError("Offset greater than length of child arrays"); } + if (null_bitmap == nullptr) { + if (null_count > 0) { + return Status::Invalid("null_count = ", null_count, " but no null bitmap given"); + } + null_count = 0; + } return std::make_shared(struct_(fields), length - offset, children, null_bitmap, null_count, offset); } diff --git a/cpp/src/arrow/array/builder_primitive.h b/cpp/src/arrow/array/builder_primitive.h index 78d211b954b..f1d4fb369d4 100644 --- 
a/cpp/src/arrow/array/builder_primitive.h +++ b/cpp/src/arrow/array/builder_primitive.h @@ -83,7 +83,7 @@ class NumericBuilder : public ArrayBuilder { /// uninitialized memory access Status AppendNulls(int64_t length) final { ARROW_RETURN_NOT_OK(Reserve(length)); - data_builder_.UnsafeAppend(length, static_cast(0)); + data_builder_.UnsafeAppend(length, value_type{}); // zero UnsafeSetNull(length); return Status::OK(); } @@ -91,7 +91,7 @@ class NumericBuilder : public ArrayBuilder { /// \brief Append a single null element Status AppendNull() final { ARROW_RETURN_NOT_OK(Reserve(1)); - data_builder_.UnsafeAppend(static_cast(0)); + data_builder_.UnsafeAppend(value_type{}); // zero UnsafeAppendToBitmap(false); return Status::OK(); } @@ -243,7 +243,7 @@ class NumericBuilder : public ArrayBuilder { void UnsafeAppendNull() { ArrayBuilder::UnsafeAppendToBitmap(false); - data_builder_.UnsafeAppend(0); + data_builder_.UnsafeAppend(value_type{}); // zero } std::shared_ptr type() const override { return type_; } diff --git a/cpp/src/arrow/array/builder_time.h b/cpp/src/arrow/array/builder_time.h index 83597336f35..d1d5f967961 100644 --- a/cpp/src/arrow/array/builder_time.h +++ b/cpp/src/arrow/array/builder_time.h @@ -21,52 +21,23 @@ #include -#include "arrow/array.h" #include "arrow/array/builder_base.h" -#include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" -#include "arrow/buffer_builder.h" -#include "arrow/status.h" -#include "arrow/type_traits.h" -#include "arrow/util/macros.h" namespace arrow { -class ARROW_EXPORT DayTimeIntervalBuilder : public ArrayBuilder { +// TODO this class is untested + +class ARROW_EXPORT DayTimeIntervalBuilder : public NumericBuilder { public: - using TypeClass = DayTimeIntervalType; using DayMilliseconds = DayTimeIntervalType::DayMilliseconds; explicit DayTimeIntervalBuilder(MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) : DayTimeIntervalBuilder(day_time_interval(), pool) {} - DayTimeIntervalBuilder(std::shared_ptr type, 
- MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) - : ArrayBuilder(pool), builder_(fixed_size_binary(sizeof(DayMilliseconds)), pool) {} - - void Reset() override { builder_.Reset(); } - Status Resize(int64_t capacity) override { return builder_.Resize(capacity); } - Status Append(DayMilliseconds day_millis) { - return builder_.Append(reinterpret_cast(&day_millis)); - } - void UnsafeAppend(DayMilliseconds day_millis) { - builder_.UnsafeAppend(reinterpret_cast(&day_millis)); - } - using ArrayBuilder::UnsafeAppendNull; - Status AppendNull() override { return builder_.AppendNull(); } - Status AppendNulls(int64_t length) override { return builder_.AppendNulls(length); } - Status FinishInternal(std::shared_ptr* out) override { - auto result = builder_.FinishInternal(out); - if (*out != NULLPTR) { - (*out)->type = type(); - } - return result; - } - - std::shared_ptr type() const override { return day_time_interval(); } - - private: - FixedSizeBinaryBuilder builder_; + explicit DayTimeIntervalBuilder(std::shared_ptr type, + MemoryPool* pool ARROW_MEMORY_POOL_DEFAULT) + : NumericBuilder(type, pool) {} }; } // namespace arrow diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index 0ad321158ee..b76d74fcdad 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -152,7 +152,7 @@ TEST_F(DiffTest, Errors) { ASSERT_RAISES(TypeError, Diff(*base_, *target_, default_memory_pool())); ASSERT_FALSE(base_->Equals(*target_, EqualOptions().diff_sink(&formatted))); - ASSERT_EQ(formatted.str(), R"(# Array types differed: int32 vs string)"); + ASSERT_EQ(formatted.str(), "# Array types differed: int32 vs string\n"); } template diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index c67ac16b6e9..d94fb0b0174 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -18,6 +18,7 @@ #include "arrow/array/validate.h" #include "arrow/array.h" +#include "arrow/util/bit_util.h" #include 
"arrow/util/logging.h" #include "arrow/visitor_inline.h" @@ -40,11 +41,13 @@ struct ValidateArrayVisitor { ARROW_RETURN_IF(array.data()->buffers.size() != 2, Status::Invalid("number of buffers is != 2")); - if (array.length() > 0 && array.data()->buffers[1] == nullptr) { - return Status::Invalid("values buffer is null"); - } - if (array.length() > 0 && array.values() == nullptr) { - return Status::Invalid("values is null"); + if (array.length() > 0) { + if (array.data()->buffers[1] == nullptr) { + return Status::Invalid("values buffer is null"); + } + if (array.values() == nullptr) { + return Status::Invalid("values is null"); + } } return Status::OK(); } @@ -227,21 +230,20 @@ struct ValidateArrayVisitor { Status ValidateOffsets(const ArrayType& array) { using offset_type = typename ArrayType::offset_type; - auto value_offsets = array.value_offsets(); - if (value_offsets == nullptr) { - if (array.length() != 0) { + if (array.length() > 0) { + // For length 0, an empty offsets array seems accepted as a special case (ARROW-544) + auto value_offsets = array.value_offsets(); + if (value_offsets == nullptr) { return Status::Invalid("non-empty array but value_offsets_ is null"); } - return Status::OK(); - } - if (value_offsets->size() / static_cast(sizeof(offset_type)) < array.length()) { - return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), - " isn't large enough for length: ", array.length()); - } - - auto first_offset = array.value_offset(0); - if (array.offset() == 0 && first_offset != 0) { - return Status::Invalid("The first offset isn't zero"); + if (value_offsets->size() / static_cast(sizeof(offset_type)) < + array.length() + 1) { + return Status::Invalid("offset buffer size (bytes): ", value_offsets->size(), + " isn't large enough for length: ", array.length()); + } + if (array.offset() == 0 && array.value_offset(0) != 0) { + return Status::Invalid("The first offset isn't zero"); + } } return Status::OK(); } @@ -270,6 +272,37 @@ Status 
ValidateArray(const Array& array) { "of type ", type.ToString(), ", got ", data.buffers.size()); } + // Validate length of fixed-width buffers + if (array.length() > 0) { + for (size_t i = 0; i < data.buffers.size(); ++i) { + const auto bit_width = layout.bit_widths[i]; + if (bit_width > 0) { + const auto& buffer = data.buffers[i]; + if (buffer == nullptr) { + if (i == 0) { + // Null bitmap may be absent + continue; + } else { + return Status::Invalid("Buffer #", i, + " is null in non-empty array " + "of type ", + type.ToString()); + } + } + const auto min_size = + BitUtil::BytesForBits(bit_width * (array.length() + array.offset())); + if (buffer->size() < min_size) { + return Status::Invalid("Buffer #", i, + " is too small in array " + "of type ", + type.ToString(), " with length ", array.length(), + " and offset ", array.offset(), " (got ", buffer->size(), + ", expected at least ", min_size, ")"); + } + } + } + } + if (type.id() != Type::EXTENSION) { if (data.child_data.size() != static_cast(type.num_children())) { return Status::Invalid("Expected ", type.num_children(), diff --git a/cpp/src/arrow/c/CMakeLists.txt b/cpp/src/arrow/c/CMakeLists.txt new file mode 100644 index 00000000000..e23ce6ce819 --- /dev/null +++ b/cpp/src/arrow/c/CMakeLists.txt @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + +add_arrow_test(bridge_test PREFIX "arrow-c") + +arrow_install_all_headers("arrow/c") diff --git a/cpp/src/arrow/c/abi.h b/cpp/src/arrow/c/abi.h new file mode 100644 index 00000000000..60c75187372 --- /dev/null +++ b/cpp/src/arrow/c/abi.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowArray { + // Type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + + // Data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#ifdef __cplusplus +} +#endif diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc new file mode 100644 index 00000000000..992c1ab7b2a --- /dev/null +++ b/cpp/src/arrow/c/bridge.cc @@ -0,0 +1,1259 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/c/bridge.h" + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/c/helpers.h" +#include "arrow/memory_pool.h" +#include "arrow/stl.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" +#include "arrow/util/parsing.h" +#include "arrow/util/string_view.h" +#include "arrow/visitor_inline.h" + +namespace arrow { + +using internal::checked_cast; + +namespace { + +////////////////////////////////////////////////////////////////////////// +// C data export + +Status ExportingNotImplemented(const DataType& type) { + return Status::NotImplemented("Exporting ", type.ToString(), " array not supported"); +} + +template +using PoolVector = std::vector>; + +struct ExportedArrayPrivateData { + std::string format_; + std::string name_; + std::string metadata_; + PoolVector buffers_; + struct ArrowArray dictionary_; + PoolVector child_arrays_; + PoolVector child_array_pointers_; + + std::shared_ptr data_; + + // Allocate ExportedArrayPrivateData instances using MemoryPool, + // to allow accounting memory and checking for memory leaks. 
+ + static void* operator new(size_t size) { + DCHECK_EQ(size, sizeof(ExportedArrayPrivateData)); + uint8_t* data; + ARROW_CHECK_OK(default_memory_pool()->Allocate(static_cast(size), &data)); + return data; + } + + static void operator delete(void* ptr) { + default_memory_pool()->Free(reinterpret_cast(ptr), + sizeof(ExportedArrayPrivateData)); + } +}; + +void ReleaseExportedArray(struct ArrowArray* array) { + if (array->format == nullptr) { + // Array already released + return; + } + for (int64_t i = 0; i < array->n_children; ++i) { + struct ArrowArray* child = array->children[i]; + ArrowReleaseArray(child); + DCHECK_EQ(child->format, nullptr) + << "Child release callback should have marked it released"; + } + struct ArrowArray* dict = array->dictionary; + if (dict != nullptr && dict->format != nullptr) { + ArrowReleaseArray(dict); + DCHECK_EQ(dict->format, nullptr) + << "Dictionary release callback should have marked it released"; + } + DCHECK_NE(array->private_data, nullptr); + delete reinterpret_cast(array->private_data); + + array->format = nullptr; +} + +template +Status DowncastMetadataSize(SizeType size, int32_t* out) { + *out = static_cast(size); + if (*out < 0 || static_cast(*out) != size) { + return Status::Invalid("Metadata too large (more than 2**31 items or bytes)"); + } + return Status::OK(); +} + +Status ExportMetadata(const KeyValueMetadata& metadata, std::string* out) { + int32_t npairs; + RETURN_NOT_OK(DowncastMetadataSize(metadata.size(), &npairs)); + // Pre-compute total string size + size_t total_size = 4; + for (int32_t i = 0; i < npairs; ++i) { + total_size += 8 + metadata.key(i).length() + metadata.value(i).length(); + } + out->resize(total_size); + + char* data_start = &(*out)[0]; + char* data = data_start; + auto write_int32 = [&](int32_t v) -> void { + const int32_t le_v = BitUtil::ToLittleEndian(v); + memcpy(data, &le_v, 4); + data += 4; + }; + auto write_string = [&](const std::string& s) -> Status { + int32_t len; + 
RETURN_NOT_OK(DowncastMetadataSize(s.length(), &len)); + write_int32(len); + if (len > 0) { + memcpy(data, s.data(), len); + data += len; + } + return Status::OK(); + }; + + write_int32(npairs); + for (int32_t i = 0; i < npairs; ++i) { + RETURN_NOT_OK(write_string(metadata.key(i))); + RETURN_NOT_OK(write_string(metadata.value(i))); + } + DCHECK_EQ(static_cast(data - data_start), total_size); + return Status::OK(); +} + +struct ArrayExporter { + explicit ArrayExporter(const std::shared_ptr& data) + : data_(data), flags_(0) {} + + Status Export(const Field* field = nullptr) { + if (field != nullptr) { + export_.name_ = field->name(); + flags_ = field->nullable() ? ARROW_FLAG_NULLABLE : 0; + } else { + flags_ = ARROW_FLAG_NULLABLE; + } + RETURN_NOT_OK(VisitTypeInline(*data_->type, this)); + DCHECK(!export_.format_.empty()); + + // Store buffer pointers + export_.buffers_.resize(data_->buffers.size()); + std::transform(data_->buffers.begin(), data_->buffers.end(), export_.buffers_.begin(), + [](const std::shared_ptr& buffer) -> const void* { + return buffer ? 
buffer->data() : nullptr; + }); + + // Export dictionary + if (data_->dictionary != nullptr) { + if (checked_cast(*data_->type).ordered()) { + flags_ |= ARROW_FLAG_DICTIONARY_ORDERED; + } + dict_exporter_.reset(new ArrayExporter(data_->dictionary->data())); + RETURN_NOT_OK(dict_exporter_->Export()); + } + + // Export children + export_.child_arrays_.resize(data_->child_data.size()); + for (size_t i = 0; i < data_->child_data.size(); ++i) { + child_exporters_.emplace_back(data_->child_data[i]); + RETURN_NOT_OK( + child_exporters_.back().Export(data_->type->child(static_cast(i)).get())); + } + + // Export metadata + export_.metadata_ = ""; + if (field != nullptr) { + const auto metadata = field->metadata(); + if (metadata != nullptr && metadata->size() > 0) { + RETURN_NOT_OK(ExportMetadata(*metadata, &export_.metadata_)); + } + } + + // Store owning pointer to ArrayData + export_.data_ = data_; + + return Status::OK(); + } + + // Finalize exporting by setting C struct fields and allocating + // autonomous private data for each array node. + // + // This function can't fail, as properly reclaiming memory in case of error + // would be too fragile. After this function returns, memory is reclaimed + // by calling the release() pointer in the top level ArrowArray struct. + void Finish(struct ArrowArray* c_struct_) { + // First, create permanent ExportedArrayPrivateData, to make sure that + // child ArrayData pointers don't get invalidated. + auto pdata = new ExportedArrayPrivateData(std::move(export_)); + + // Second, finish dictionary and children. + if (dict_exporter_) { + dict_exporter_->Finish(&pdata->dictionary_); + } + pdata->child_array_pointers_.resize(data_->child_data.size(), nullptr); + for (size_t i = 0; i < data_->child_data.size(); ++i) { + auto ptr = pdata->child_array_pointers_[i] = &pdata->child_arrays_[i]; + child_exporters_[i].Finish(ptr); + } + + // Third, fill C struct. 
+ DCHECK_NE(c_struct_, nullptr); + memset(c_struct_, 0, sizeof(*c_struct_)); + + c_struct_->format = pdata->format_.c_str(); + c_struct_->name = pdata->name_.c_str(); + c_struct_->metadata = pdata->metadata_.empty() ? nullptr : pdata->metadata_.c_str(); + c_struct_->flags = flags_; + + c_struct_->length = data_->length; + c_struct_->null_count = data_->null_count; + c_struct_->offset = data_->offset; + c_struct_->n_buffers = static_cast(pdata->buffers_.size()); + c_struct_->n_children = static_cast(pdata->child_array_pointers_.size()); + c_struct_->buffers = pdata->buffers_.data(); + + // We only initialize child_array_pointers_ here because the child_arrays_ + // vector may be resized, moved or copied around. + std::transform(pdata->child_arrays_.begin(), pdata->child_arrays_.end(), + pdata->child_array_pointers_.begin(), + [](struct ArrowArray& array) { return &array; }); + c_struct_->children = pdata->child_array_pointers_.data(); + c_struct_->dictionary = dict_exporter_ ? &pdata->dictionary_ : nullptr; + c_struct_->private_data = pdata; + c_struct_->release = ReleaseExportedArray; + } + + // Type-specific visitors + + Status Visit(const DataType& type) { return ExportingNotImplemented(type); } + + Status Visit(const NullType& type) { + export_.format_ = "n"; + return Status::OK(); + } + + Status Visit(const BooleanType& type) { + export_.format_ = "b"; + return Status::OK(); + } + + Status Visit(const Int8Type& type) { + export_.format_ = "c"; + return Status::OK(); + } + + Status Visit(const UInt8Type& type) { + export_.format_ = "C"; + return Status::OK(); + } + + Status Visit(const Int16Type& type) { + export_.format_ = "s"; + return Status::OK(); + } + + Status Visit(const UInt16Type& type) { + export_.format_ = "S"; + return Status::OK(); + } + + Status Visit(const Int32Type& type) { + export_.format_ = "i"; + return Status::OK(); + } + + Status Visit(const UInt32Type& type) { + export_.format_ = "I"; + return Status::OK(); + } + + Status Visit(const 
Int64Type& type) { + export_.format_ = "l"; + return Status::OK(); + } + + Status Visit(const UInt64Type& type) { + export_.format_ = "L"; + return Status::OK(); + } + + Status Visit(const HalfFloatType& type) { + export_.format_ = "e"; + return Status::OK(); + } + + Status Visit(const FloatType& type) { + export_.format_ = "f"; + return Status::OK(); + } + + Status Visit(const DoubleType& type) { + export_.format_ = "g"; + return Status::OK(); + } + + Status Visit(const FixedSizeBinaryType& type) { + export_.format_ = "w:" + std::to_string(type.byte_width()); + return Status::OK(); + } + + Status Visit(const Decimal128Type& type) { + export_.format_ = + "d:" + std::to_string(type.precision()) + "," + std::to_string(type.scale()); + return Status::OK(); + } + + Status Visit(const BinaryType& type) { + export_.format_ = "z"; + return Status::OK(); + } + + Status Visit(const LargeBinaryType& type) { + export_.format_ = "Z"; + return Status::OK(); + } + + Status Visit(const StringType& type) { + export_.format_ = "u"; + return Status::OK(); + } + + Status Visit(const LargeStringType& type) { + export_.format_ = "U"; + return Status::OK(); + } + + Status Visit(const Date32Type& type) { + export_.format_ = "tdD"; + return Status::OK(); + } + + Status Visit(const Date64Type& type) { + export_.format_ = "tdm"; + return Status::OK(); + } + + Status Visit(const Time32Type& type) { + switch (type.unit()) { + case TimeUnit::SECOND: + export_.format_ = "tts"; + break; + case TimeUnit::MILLI: + export_.format_ = "ttm"; + break; + default: + return Status::Invalid("Invalid time unit for Time32: ", type.unit()); + } + return Status::OK(); + } + + Status Visit(const Time64Type& type) { + switch (type.unit()) { + case TimeUnit::MICRO: + export_.format_ = "ttu"; + break; + case TimeUnit::NANO: + export_.format_ = "ttn"; + break; + default: + return Status::Invalid("Invalid time unit for Time64: ", type.unit()); + } + return Status::OK(); + } + + Status Visit(const TimestampType& 
type) { + switch (type.unit()) { + case TimeUnit::SECOND: + export_.format_ = "tss:"; + break; + case TimeUnit::MILLI: + export_.format_ = "tsm:"; + break; + case TimeUnit::MICRO: + export_.format_ = "tsu:"; + break; + case TimeUnit::NANO: + export_.format_ = "tsn:"; + break; + default: + return Status::Invalid("Invalid time unit for Timestamp: ", type.unit()); + } + export_.format_ += type.timezone(); + return Status::OK(); + } + + Status Visit(const DurationType& type) { + switch (type.unit()) { + case TimeUnit::SECOND: + export_.format_ = "tDs"; + break; + case TimeUnit::MILLI: + export_.format_ = "tDm"; + break; + case TimeUnit::MICRO: + export_.format_ = "tDu"; + break; + case TimeUnit::NANO: + export_.format_ = "tDn"; + break; + default: + return Status::Invalid("Invalid time unit for Duration: ", type.unit()); + } + return Status::OK(); + } + + Status Visit(const MonthIntervalType& type) { + export_.format_ = "tiM"; + return Status::OK(); + } + + Status Visit(const DayTimeIntervalType& type) { + export_.format_ = "tiD"; + return Status::OK(); + } + + Status Visit(const ListType& type) { + export_.format_ = "+l"; + return Status::OK(); + } + + Status Visit(const LargeListType& type) { + export_.format_ = "+L"; + return Status::OK(); + } + + Status Visit(const FixedSizeListType& type) { + export_.format_ = "+w:" + std::to_string(type.list_size()); + return Status::OK(); + } + + Status Visit(const StructType& type) { + export_.format_ = "+s"; + return Status::OK(); + } + + Status Visit(const MapType& type) { + export_.format_ = "+m"; + if (type.keys_sorted()) { + flags_ |= ARROW_FLAG_MAP_KEYS_SORTED; + } + return Status::OK(); + } + + Status Visit(const UnionType& type) { + std::string& s = export_.format_; + s = "+u"; + if (type.mode() == UnionMode::DENSE) { + s += "d:"; + } else { + DCHECK_EQ(type.mode(), UnionMode::SPARSE); + s += "s:"; + } + bool first = true; + for (const auto code : type.type_codes()) { + if (!first) { + s += ","; + } + s += 
std::to_string(code); + first = false; + } + return Status::OK(); + } + + Status Visit(const DictionaryType& type) { + // Dictionary array is exported as its index array + return VisitTypeInline(*type.index_type(), this); + } + + std::shared_ptr data_; + ExportedArrayPrivateData export_; + int64_t flags_; + std::unique_ptr dict_exporter_; + std::vector child_exporters_; +}; + +} // namespace + +Status ExportArray(const Array& array, struct ArrowArray* out) { + ArrayExporter exporter{array.data()}; + RETURN_NOT_OK(exporter.Export()); + exporter.Finish(out); + return Status::OK(); +} + +Status ExportArray(const Field& field, const Array& array, struct ArrowArray* out) { + ArrayExporter exporter{array.data()}; + RETURN_NOT_OK(exporter.Export(&field)); + exporter.Finish(out); + return Status::OK(); +} + +Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out) { + std::shared_ptr array; + RETURN_NOT_OK(batch.ToStructArray(&array)); + auto field = ::arrow::field("", array->type(), /*nullable=*/false); + if (batch.schema()->HasMetadata()) { + field = field->WithMetadata(batch.schema()->metadata()); + } + return ExportArray(*field, *array, out); +} + +////////////////////////////////////////////////////////////////////////// +// C data import + +namespace { + +Status InvalidFormatString(util::string_view v) { + return Status::Invalid("Invalid or unsupported format string: '", v, "'"); +} + +class FormatStringParser { + public: + FormatStringParser() {} + + explicit FormatStringParser(util::string_view v) : view_(v), index_(0) {} + + bool AtEnd() const { return index_ >= view_.length(); } + + char Next() { return view_[index_++]; } + + util::string_view Rest() { return view_.substr(index_); } + + Status CheckNext(char c) { + if (AtEnd() || Next() != c) { + return Invalid(); + } + return Status::OK(); + } + + Status CheckHasNext() { + if (AtEnd()) { + return Invalid(); + } + return Status::OK(); + } + + Status CheckAtEnd() { + if (!AtEnd()) { + return 
Invalid(); + } + return Status::OK(); + } + + template + Status ParseInt(util::string_view v, IntType* out) { + using ArrowIntType = typename CTypeTraits::ArrowType; + internal::StringConverter converter; + if (!converter(v.data(), v.size(), out)) { + return Invalid(); + } + return Status::OK(); + } + + Status ParseTimeUnit(TimeUnit::type* out) { + RETURN_NOT_OK(CheckHasNext()); + switch (Next()) { + case 's': + *out = TimeUnit::SECOND; + break; + case 'm': + *out = TimeUnit::MILLI; + break; + case 'u': + *out = TimeUnit::MICRO; + break; + case 'n': + *out = TimeUnit::NANO; + break; + default: + return Invalid(); + } + return Status::OK(); + } + + std::vector Split(util::string_view v, char delim = ',') { + std::vector parts; + size_t start = 0, end; + while (true) { + end = v.find_first_of(delim, start); + parts.push_back(v.substr(start, end - start)); + if (end == util::string_view::npos) { + break; + } + start = end + 1; + } + return parts; + } + + template + Status ParseInts(util::string_view v, std::vector* out) { + auto parts = Split(v); + std::vector result; + result.reserve(parts.size()); + for (const auto& p : parts) { + IntType i; + RETURN_NOT_OK(ParseInt(p, &i)); + result.push_back(i); + } + *out = std::move(result); + return Status::OK(); + } + + Status Invalid() { return InvalidFormatString(view_); } + + protected: + util::string_view view_; + size_t index_; +}; + +Status DecodeMetadata(const char* metadata, std::shared_ptr* out) { + auto read_int32 = [&](int32_t* out) -> Status { + int32_t v; + memcpy(&v, metadata, 4); + metadata += 4; + *out = BitUtil::FromLittleEndian(v); + if (*out < 0) { + return Status::Invalid("Invalid encoded metadata string"); + } + return Status::OK(); + }; + + auto read_string = [&](std::string* out) -> Status { + int32_t len; + RETURN_NOT_OK(read_int32(&len)); + out->resize(len); + if (len > 0) { + memcpy(&(*out)[0], metadata, len); + metadata += len; + } + return Status::OK(); + }; + + out->reset(); + if (metadata == 
nullptr) { + return Status::OK(); + } + int32_t npairs; + RETURN_NOT_OK(read_int32(&npairs)); + if (npairs == 0) { + return Status::OK(); + } + std::vector keys(npairs); + std::vector values(npairs); + for (int32_t i = 0; i < npairs; ++i) { + RETURN_NOT_OK(read_string(&keys[i])); + RETURN_NOT_OK(read_string(&values[i])); + } + *out = key_value_metadata(std::move(keys), std::move(values)); + return Status::OK(); +} + +// A wrapper struct for an imported C ArrowArray. +// The ArrowArray is released on destruction. +struct ImportedArrayData { + struct ArrowArray array_; + + ImportedArrayData() { + array_.format = nullptr; // Initially released + } + + void Release() { + if (array_.format != nullptr && array_.release != nullptr) { + array_.release(&array_); + array_.format = nullptr; + } + } + + ~ImportedArrayData() { Release(); } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ImportedArrayData); +}; + +// A buffer wrapping an imported piece of data. +class ImportedBuffer : public Buffer { + public: + ImportedBuffer(const uint8_t* data, int64_t size, + std::shared_ptr import) + : Buffer(data, size), import_(std::move(import)) {} + + ~ImportedBuffer() override {} + + protected: + std::shared_ptr import_; +}; + +static constexpr int64_t kMaxImportRecursionLevel = 64; + +struct ArrayImporter { + ArrayImporter() {} + + Status Import(struct ArrowArray* src) { + if (src->format == nullptr) { + return Status::Invalid("Cannot import released ArrowArray"); + } + recursion_level_ = 0; + import_ = std::make_shared(); + c_struct_ = &import_->array_; + ArrowMoveArray(src, c_struct_); + return DoImport(); + } + + Result> MakeField() { + std::shared_ptr metadata; + RETURN_NOT_OK(DecodeMetadata(c_struct_->metadata, &metadata)); + const char* name = c_struct_->name ? 
c_struct_->name : ""; + bool nullable = (c_struct_->flags & ARROW_FLAG_NULLABLE) != 0; + return field(name, data_->type, nullable, metadata); + } + + Result> Finish() { + if (dict_importer_ != nullptr) { + auto indices = MakeArray(data_); + ARROW_ASSIGN_OR_RAISE(auto values, dict_importer_->Finish()); + bool ordered = (c_struct_->flags & ARROW_FLAG_DICTIONARY_ORDERED) != 0; + auto type = dictionary(indices->type(), values->type(), ordered); + return std::make_shared(type, indices, values); + } else { + return MakeArray(data_); + } + } + + protected: + Status ImportChild(const ArrayImporter* parent, struct ArrowArray* src) { + if (src->format == nullptr) { + return Status::Invalid("Cannot import released ArrowArray"); + } + recursion_level_ = parent->recursion_level_ + 1; + if (recursion_level_ >= kMaxImportRecursionLevel) { + return Status::Invalid("Recursion level in ArrowArray struct exceeded"); + } + // Child buffers keep the entire parent import alive. + // Perhaps we can move the child structs to an owned area + // when the parent ImportedArrayData::Release() gets called, + // but that is another level of complication. 
+ import_ = parent->import_; + // The ArrowArray shouldn't be moved, it's owned by its parent + c_struct_ = src; + return DoImport(); + } + + Status ImportDict(const ArrayImporter* parent, struct ArrowArray* src) { + return ImportChild(parent, src); + } + + Status DoImport() { + // First import children (required for reconstituting parent type) + for (int64_t i = 0; i < c_struct_->n_children; ++i) { + child_importers_.emplace_back(new ArrayImporter); + RETURN_NOT_OK(child_importers_.back()->ImportChild(this, c_struct_->children[i])); + } + + // Import main data + RETURN_NOT_OK(ProcessFormat()); + + // Import dictionary values + if (c_struct_->dictionary != nullptr) { + // Check this index type + bool indices_ok = false; + if (is_integer(data_->type->id())) { + indices_ok = checked_cast(*data_->type).is_signed(); + } + if (!indices_ok) { + return Status::Invalid( + "ArrowArray struct has a dictionary but is not a signed integer type: ", + data_->type); + } + dict_importer_.reset(new ArrayImporter); + RETURN_NOT_OK(dict_importer_->ImportDict(this, c_struct_->dictionary)); + } + return Status::OK(); + } + + Status ProcessFormat() { + f_parser_ = FormatStringParser(c_struct_->format); + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'n': + return ProcessNull(); + case 'b': + return ProcessPrimitive(boolean()); + case 'c': + return ProcessPrimitive(int8()); + case 'C': + return ProcessPrimitive(uint8()); + case 's': + return ProcessPrimitive(int16()); + case 'S': + return ProcessPrimitive(uint16()); + case 'i': + return ProcessPrimitive(int32()); + case 'I': + return ProcessPrimitive(uint32()); + case 'l': + return ProcessPrimitive(int64()); + case 'L': + return ProcessPrimitive(uint64()); + case 'e': + return ProcessPrimitive(float16()); + case 'f': + return ProcessPrimitive(float32()); + case 'g': + return ProcessPrimitive(float64()); + case 'u': + return ProcessStringLike(utf8()); + case 'U': + return ProcessStringLike(large_utf8()); + 
case 'z': + return ProcessStringLike(binary()); + case 'Z': + return ProcessStringLike(large_binary()); + case 'w': + return ProcessFixedSizeBinary(); + case 'd': + return ProcessDecimal(); + case 't': + return ProcessTemporal(); + case '+': + return ProcessNested(); + } + return f_parser_.Invalid(); + } + + Status ProcessTemporal() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'd': + return ProcessDate(); + case 't': + return ProcessTime(); + case 'D': + return ProcessDuration(); + case 'i': + return ProcessInterval(); + case 's': + return ProcessTimestamp(); + } + return f_parser_.Invalid(); + } + + Status ProcessNested() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'l': + return ProcessListLike(); + case 'L': + return ProcessListLike(); + case 'w': + return ProcessFixedSizeList(); + case 's': + return ProcessStruct(); + case 'm': + return ProcessMap(); + case 'u': + return ProcessUnion(); + } + return f_parser_.Invalid(); + } + + Status ProcessDate() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'D': + return ProcessPrimitive(date32()); + case 'm': + return ProcessPrimitive(date64()); + } + return f_parser_.Invalid(); + } + + Status ProcessInterval() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + switch (f_parser_.Next()) { + case 'D': + return ProcessPrimitive(day_time_interval()); + case 'M': + return ProcessPrimitive(month_interval()); + } + return f_parser_.Invalid(); + } + + Status ProcessTime() { + TimeUnit::type unit; + RETURN_NOT_OK(f_parser_.ParseTimeUnit(&unit)); + if (unit == TimeUnit::SECOND || unit == TimeUnit::MILLI) { + return ProcessPrimitive(time32(unit)); + } else { + return ProcessPrimitive(time64(unit)); + } + } + + Status ProcessDuration() { + TimeUnit::type unit; + RETURN_NOT_OK(f_parser_.ParseTimeUnit(&unit)); + return ProcessPrimitive(duration(unit)); + } + + Status ProcessTimestamp() { + TimeUnit::type unit; + 
RETURN_NOT_OK(f_parser_.ParseTimeUnit(&unit)); + RETURN_NOT_OK(f_parser_.CheckNext(':')); + return ImportFixedSizePrimitive(timestamp(unit, std::string(f_parser_.Rest()))); + } + + Status ProcessFixedSizeBinary() { + RETURN_NOT_OK(f_parser_.CheckNext(':')); + int32_t byte_width = -1; + RETURN_NOT_OK(f_parser_.ParseInt(f_parser_.Rest(), &byte_width)); + if (byte_width < 0) { + return f_parser_.Invalid(); + } + return ImportFixedSizePrimitive(fixed_size_binary(byte_width)); + } + + Status ProcessDecimal() { + RETURN_NOT_OK(f_parser_.CheckNext(':')); + std::vector prec_scale; + RETURN_NOT_OK(f_parser_.ParseInts(f_parser_.Rest(), &prec_scale)); + if (prec_scale.size() != 2) { + return f_parser_.Invalid(); + } + if (prec_scale[0] <= 0 || prec_scale[1] <= 0) { + return f_parser_.Invalid(); + } + return ImportFixedSizePrimitive(decimal(prec_scale[0], prec_scale[1])); + } + + Status ProcessPrimitive(const std::shared_ptr& type) { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + return ImportFixedSizePrimitive(type); + } + + Status ProcessNull() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + auto type = null(); + RETURN_NOT_OK(CheckNoChildren(type)); + // XXX should we be lenient on the number of buffers? 
+ RETURN_NOT_OK(CheckNumBuffers(type, 1)); + RETURN_NOT_OK(AllocateArrayData(type)); + RETURN_NOT_OK(ImportBitsBuffer(0)); + return Status::OK(); + } + + template + Status ProcessStringLike(const std::shared_ptr& type) { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + return ImportStringLike(type); + } + + template + Status ProcessListLike() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + auto type = std::make_shared(field); + RETURN_NOT_OK(CheckNumBuffers(type, 2)); + RETURN_NOT_OK(AllocateArrayData(type)); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportOffsetsBuffer(1)); + return Status::OK(); + } + + Status ProcessMap() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + const auto& value_type = field->type(); + if (value_type->id() != Type::STRUCT) { + return Status::Invalid("Imported map array has unexpected child field type: ", + field->ToString()); + } + if (value_type->num_children() != 2) { + return Status::Invalid("Imported map array has unexpected child field type: ", + field->ToString()); + } + + bool keys_sorted = (c_struct_->flags & ARROW_FLAG_MAP_KEYS_SORTED); + auto type = + map(value_type->child(0)->type(), value_type->child(1)->type(), keys_sorted); + // Process buffers as for ListType + RETURN_NOT_OK(CheckNumBuffers(type, 2)); + RETURN_NOT_OK(AllocateArrayData(type)); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportOffsetsBuffer(1)); + return Status::OK(); + } + + Status ProcessFixedSizeList() { + RETURN_NOT_OK(f_parser_.CheckNext(':')); + int32_t list_size = -1; + RETURN_NOT_OK(f_parser_.ParseInt(f_parser_.Rest(), &list_size)); + if (list_size < 0) { + return f_parser_.Invalid(); + } + RETURN_NOT_OK(CheckNumChildren(1)); + ARROW_ASSIGN_OR_RAISE(auto field, MakeChildField(0)); + auto type = fixed_size_list(field, list_size); + 
RETURN_NOT_OK(CheckNumBuffers(type, 1)); + RETURN_NOT_OK(AllocateArrayData(type)); + RETURN_NOT_OK(ImportNullBitmap()); + return Status::OK(); + } + + Status ProcessStruct() { + RETURN_NOT_OK(f_parser_.CheckAtEnd()); + ARROW_ASSIGN_OR_RAISE(auto fields, MakeChildFields()); + auto type = struct_(std::move(fields)); + RETURN_NOT_OK(CheckNumBuffers(type, 1)); + RETURN_NOT_OK(AllocateArrayData(type)); + RETURN_NOT_OK(ImportNullBitmap()); + return Status::OK(); + } + + Status ProcessUnion() { + RETURN_NOT_OK(f_parser_.CheckHasNext()); + UnionMode::type mode; + switch (f_parser_.Next()) { + case 'd': + mode = UnionMode::DENSE; + break; + case 's': + mode = UnionMode::SPARSE; + break; + default: + return f_parser_.Invalid(); + } + RETURN_NOT_OK(f_parser_.CheckNext(':')); + std::vector type_codes; + RETURN_NOT_OK(f_parser_.ParseInts(f_parser_.Rest(), &type_codes)); + ARROW_ASSIGN_OR_RAISE(auto fields, MakeChildFields()); + if (fields.size() != type_codes.size()) { + return Status::Invalid( + "ArrowArray struct number of children incompatible with format string '", + c_struct_->format, "'"); + } + auto type = union_(std::move(fields), std::move(type_codes), mode); + RETURN_NOT_OK(CheckNumBuffers(type, 3)); + RETURN_NOT_OK(AllocateArrayData(type)); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportFixedSizeBuffer(1, sizeof(int8_t))); + if (mode == UnionMode::DENSE) { + RETURN_NOT_OK(ImportFixedSizeBuffer(2, sizeof(int32_t))); + } else { + RETURN_NOT_OK(ImportUnusedBuffer(2)); + } + return Status::OK(); + } + + Status ImportFixedSizePrimitive(const std::shared_ptr& type) { + const auto& fw_type = checked_cast(*type); + RETURN_NOT_OK(CheckNoChildren(type)); + RETURN_NOT_OK(CheckNumBuffers(type, 2)); + RETURN_NOT_OK(AllocateArrayData(type)); + RETURN_NOT_OK(ImportNullBitmap()); + if (BitUtil::IsMultipleOf8(fw_type.bit_width())) { + RETURN_NOT_OK(ImportFixedSizeBuffer(1, fw_type.bit_width() / 8)); + } else { + DCHECK_EQ(fw_type.bit_width(), 1); + 
RETURN_NOT_OK(ImportBitsBuffer(1)); + } + return Status::OK(); + } + + template + Status ImportStringLike(const std::shared_ptr& type) { + RETURN_NOT_OK(CheckNoChildren(type)); + RETURN_NOT_OK(CheckNumBuffers(type, 3)); + RETURN_NOT_OK(AllocateArrayData(type)); + RETURN_NOT_OK(ImportNullBitmap()); + RETURN_NOT_OK(ImportOffsetsBuffer(1)); + RETURN_NOT_OK(ImportStringValuesBuffer(1, 2)); + return Status::OK(); + } + + Result> MakeChildField(int64_t child_id) { + const auto& child = child_importers_[child_id]; + if (child->c_struct_->name == nullptr) { + return Status::Invalid("Expected non-null name in imported array child"); + } + return child->MakeField(); + } + + Result>> MakeChildFields() { + std::vector> fields(child_importers_.size()); + for (int64_t i = 0; i < static_cast(child_importers_.size()); ++i) { + ARROW_ASSIGN_OR_RAISE(fields[i], MakeChildField(i)); + } + return fields; + } + + Status CheckNoChildren(const std::shared_ptr& type) { + return CheckNumChildren(type, 0); + } + + Status CheckNumChildren(const std::shared_ptr& type, int64_t n_children) { + if (c_struct_->n_children != n_children) { + return Status::Invalid("Expected ", n_children, " children for imported type ", + *type, ", ArrowArray struct has ", c_struct_->n_children); + } + return Status::OK(); + } + + Status CheckNumChildren(int64_t n_children) { + if (c_struct_->n_children != n_children) { + return Status::Invalid("Expected ", n_children, " children for imported format '", + c_struct_->format, "', ArrowArray struct has ", + c_struct_->n_children); + } + return Status::OK(); + } + + Status CheckNumBuffers(const std::shared_ptr& type, int64_t n_buffers) { + if (n_buffers != c_struct_->n_buffers) { + return Status::Invalid("Expected ", n_buffers, " buffers for imported type ", *type, + ", ArrowArray struct has ", c_struct_->n_buffers); + } + return Status::OK(); + } + + Status AllocateArrayData(const std::shared_ptr& type) { + DCHECK_EQ(data_, nullptr); + data_ = std::make_shared(type, 
c_struct_->length, c_struct_->null_count, + c_struct_->offset); + data_->buffers.resize(static_cast(c_struct_->n_buffers)); + data_->child_data.resize(static_cast(c_struct_->n_children)); + DCHECK_EQ(child_importers_.size(), data_->child_data.size()); + std::transform(child_importers_.begin(), child_importers_.end(), + data_->child_data.begin(), + [](std::unique_ptr& child) { return child->data_; }); + return Status::OK(); + } + + Status ImportNullBitmap(int32_t buffer_id = 0) { + RETURN_NOT_OK(ImportBitsBuffer(buffer_id)); + if (data_->null_count != 0 && data_->buffers[buffer_id] == nullptr) { + return Status::Invalid( + "ArrowArray struct has null bitmap buffer but non-zero null_count ", + data_->null_count); + } + return Status::OK(); + } + + Status ImportBitsBuffer(int32_t buffer_id) { + // Compute visible size of buffer + int64_t buffer_size = + BitUtil::RoundUpToMultipleOf8(c_struct_->length + c_struct_->offset) / 8; + return ImportBuffer(buffer_id, buffer_size); + } + + Status ImportUnusedBuffer(int32_t buffer_id) { return ImportBuffer(buffer_id, 0); } + + Status ImportFixedSizeBuffer(int32_t buffer_id, int64_t byte_width) { + // Compute visible size of buffer + int64_t buffer_size = byte_width * (c_struct_->length + c_struct_->offset); + return ImportBuffer(buffer_id, buffer_size); + } + + template + Status ImportOffsetsBuffer(int32_t buffer_id) { + // Compute visible size of buffer + int64_t buffer_size = + sizeof(OffsetType) * (c_struct_->length + c_struct_->offset + 1); + return ImportBuffer(buffer_id, buffer_size); + } + + template + Status ImportStringValuesBuffer(int32_t offsets_buffer_id, int32_t buffer_id, + int64_t byte_width = 1) { + auto offsets = data_->GetValues(offsets_buffer_id); + // Compute visible size of buffer + int64_t buffer_size = byte_width * offsets[c_struct_->length]; + return ImportBuffer(buffer_id, buffer_size); + } + + Status ImportBuffer(int32_t buffer_id, int64_t buffer_size) { + std::shared_ptr* out = 
&data_->buffers[buffer_id]; + auto data = reinterpret_cast(c_struct_->buffers[buffer_id]); + if (data != nullptr) { + *out = std::make_shared(data, buffer_size, import_); + } else { + out->reset(); + } + return Status::OK(); + } + + std::shared_ptr import_; + struct ArrowArray* c_struct_; + FormatStringParser f_parser_; + int64_t recursion_level_; + std::shared_ptr data_; + std::vector> child_importers_; + std::unique_ptr dict_importer_; +}; + +} // namespace + +Result> ImportArray(struct ArrowArray* array) { + ArrayImporter importer; + RETURN_NOT_OK(importer.Import(array)); + return importer.Finish(); +} + +Status ImportArray(struct ArrowArray* array, std::shared_ptr* out_field, + std::shared_ptr* out_array) { + ArrayImporter importer; + RETURN_NOT_OK(importer.Import(array)); + RETURN_NOT_OK(importer.MakeField().Value(out_field)); + return importer.Finish().Value(out_array); +} + +Result> ImportRecordBatch(struct ArrowArray* array) { + std::shared_ptr array_result; + std::shared_ptr field_result; + RETURN_NOT_OK(ImportArray(array, &field_result, &array_result)); + if (array_result->type_id() != Type::STRUCT) { + return Status::Invalid("Imported array has type ", array_result->type(), + " but only a struct array can be converted to RecordBatch"); + } + if (array_result->null_count() != 0) { + return Status::Invalid("Imported array has nulls, cannot convert to RecordBatch"); + } + if (array_result->offset() != 0) { + return Status::Invalid( + "Imported array has non-zero offset, " + "cannot convert to RecordBatch"); + } + std::shared_ptr batch; + RETURN_NOT_OK(RecordBatch::FromStructArray(array_result, &batch)); + return batch->ReplaceSchemaMetadata(field_result->metadata()); +} + +} // namespace arrow diff --git a/cpp/src/arrow/c/bridge.h b/cpp/src/arrow/c/bridge.h new file mode 100644 index 00000000000..93ec35fd8a4 --- /dev/null +++ b/cpp/src/arrow/c/bridge.h @@ -0,0 +1,101 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor 
license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/c/abi.h" +#include "arrow/result.h" +#include "arrow/status.h" +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +/// \brief Export C++ array using the C data interface format. +/// +/// The array is considered to have empty name and metadata. +/// The resulting ArrowArray struct keeps the array data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] array Array object to export +/// \param[out] out C struct where to export the array +ARROW_EXPORT +Status ExportArray(const Array& array, struct ArrowArray* out); + +/// \brief Export C++ array using the C data interface format. +/// +/// The field argument specifies the array name and metadata. +/// The resulting ArrowArray struct keeps the array data and buffers alive +/// until its release callback is called by the consumer. 
+/// +/// \param[in] field Field object holding top-level name and metadata +/// \param[in] array Array object to export +/// \param[out] out C struct where to export the array +ARROW_EXPORT +Status ExportArray(const Field& field, const Array& array, struct ArrowArray* out); + +/// \brief Export C++ record batch using the C data interface format. +/// +/// The record batch is exported as if it were a struct array, but with +/// additional top-level metadata. +/// The resulting ArrowArray struct keeps the record batch data and buffers alive +/// until its release callback is called by the consumer. +/// +/// \param[in] batch Record batch to export +/// \param[out] out C struct where to export the record batch +ARROW_EXPORT +Status ExportRecordBatch(const RecordBatch& batch, struct ArrowArray* out); + +/// \brief Import C++ array from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \return Imported array object +ARROW_EXPORT +Result> ImportArray(struct ArrowArray* array); + +/// \brief Import C++ array and field from the C data interface. +/// +/// The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting array. +/// In addition, a Field object is created to represent the top-level array +/// name, type and metadata. +/// +/// \param[in,out] array C data interface struct holding the array data +/// \param[out] out_field Imported field object +/// \param[out] out_array Imported array object +ARROW_EXPORT +Status ImportArray(struct ArrowArray* array, std::shared_ptr* out_field, + std::shared_ptr* out_array); + +/// \brief Import C++ record batch from the C data interface. +/// +/// The array type represented by the ArrowArray struct must be a struct type +/// array. 
The ArrowArray struct has its contents moved (as per the C data interface +/// specification) to a private object held alive by the resulting record batch. +/// +/// \param[in,out] array C data interface struct holding the record batch data +/// \return Imported record batch object +ARROW_EXPORT +Result> ImportRecordBatch(struct ArrowArray* array); + +} // namespace arrow diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc new file mode 100644 index 00000000000..4c7440e6bf0 --- /dev/null +++ b/cpp/src/arrow/c/bridge_test.cc @@ -0,0 +1,1659 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include + +#include "arrow/c/bridge.h" +#include "arrow/c/helpers.h" +#include "arrow/ipc/json_simple.h" +#include "arrow/memory_pool.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/macros.h" +#include "arrow/util/string_view.h" + +namespace arrow { + +class ExportGuard { + public: + explicit ExportGuard(struct ArrowArray* c_export) : c_export_(c_export) {} + + ~ExportGuard() { Release(); } + + void Release() { + if (c_export_) { + ArrowReleaseArray(c_export_); + c_export_ = nullptr; + } + } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ExportGuard); + + struct ArrowArray* c_export_; +}; + +class ReleaseCallback { + public: + explicit ReleaseCallback(struct ArrowArray* c_struct) : called_(false) { + orig_release_ = c_struct->release; + orig_private_data_ = c_struct->private_data; + c_struct->release = ReleaseUnbound; + c_struct->private_data = this; + } + + static void ReleaseUnbound(struct ArrowArray* c_struct) { + reinterpret_cast(c_struct->private_data)->Release(c_struct); + } + + void Release(struct ArrowArray* c_struct) { + ASSERT_FALSE(called_) << "ReleaseCallback called twice"; + called_ = true; + ASSERT_NE(c_struct->format, nullptr) + << "ReleaseCallback called with released ArrowArray"; + // Call original release callback + c_struct->release = orig_release_; + c_struct->private_data = orig_private_data_; + ArrowReleaseArray(c_struct); + } + + void AssertCalled() { ASSERT_TRUE(called_) << "ReleaseCallback was not called"; } + + void AssertNotCalled() { ASSERT_FALSE(called_) << "ReleaseCallback was called"; } + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(ReleaseCallback); + + bool called_; + void (*orig_release_)(struct ArrowArray*); + void* orig_private_data_; +}; + +static const std::vector kMetadataKeys1{"key1", "key2"}; +static const std::vector kMetadataValues1{"", "bar"}; +// clang-format off +static const 
std::string kEncodedMetadata1{ // NOLINT: runtime/string + 2, 0, 0, 0, + 4, 0, 0, 0, 'k', 'e', 'y', '1', 0, 0, 0, 0, + 4, 0, 0, 0, 'k', 'e', 'y', '2', 3, 0, 0, 0, 'b', 'a', 'r'}; +// clang-format off + +static const std::vector kMetadataKeys2{"key"}; +static const std::vector kMetadataValues2{"abcde"}; +// clang-format off +static const std::string kEncodedMetadata2{ // NOLINT: runtime/string + 1, 0, 0, 0, + 3, 0, 0, 0, 'k', 'e', 'y', 5, 0, 0, 0, 'a', 'b', 'c', 'd', 'e'}; +// clang-format off + +//////////////////////////////////////////////////////////////////////////// +// Export tests + +static constexpr int64_t kDefaultFlags = ARROW_FLAG_NULLABLE; + +struct ExportChecker { + ExportChecker(std::vector flattened_formats, + std::vector flattened_names, + std::vector flattened_flags = {}, + std::vector flattened_metadata = {}) + : flattened_formats_(std::move(flattened_formats)), + flattened_names_(std::move(flattened_names)), + flattened_flags_( + flattened_flags.empty() + ? std::vector(flattened_formats_.size(), kDefaultFlags) + : std::move(flattened_flags)), + flattened_metadata_(std::move(flattened_metadata)), + flattened_index_(0) {} + + void operator()(struct ArrowArray* c_export, const ArrayData& expected_data, + bool inner = false) { + ASSERT_LT(flattened_index_, flattened_formats_.size()); + ASSERT_LT(flattened_index_, flattened_names_.size()); + ASSERT_LT(flattened_index_, flattened_flags_.size()); + ASSERT_EQ(std::string(c_export->format), flattened_formats_[flattened_index_]); + ASSERT_EQ(std::string(c_export->name), flattened_names_[flattened_index_]); + if (!flattened_metadata_.empty()) { + const auto& expected_md = flattened_metadata_[flattened_index_]; + ASSERT_EQ(std::string(c_export->metadata, expected_md.size()), expected_md); + } else { + ASSERT_EQ(c_export->metadata, nullptr); + } + ASSERT_EQ(c_export->flags, flattened_flags_[flattened_index_]); + ++flattened_index_; + + ASSERT_EQ(c_export->length, expected_data.length); + 
ASSERT_EQ(c_export->null_count, expected_data.null_count); + ASSERT_EQ(c_export->offset, expected_data.offset); + + ASSERT_EQ(c_export->n_buffers, static_cast(expected_data.buffers.size())); + ASSERT_EQ(c_export->n_children, + static_cast(expected_data.child_data.size())); + ASSERT_NE(c_export->buffers, nullptr); + for (int64_t i = 0; i < c_export->n_buffers; ++i) { + auto expected_ptr = + expected_data.buffers[i] ? expected_data.buffers[i]->data() : nullptr; + ASSERT_EQ(c_export->buffers[i], expected_ptr); + } + + if (expected_data.dictionary != nullptr) { + // Recurse into dictionary + ASSERT_NE(c_export->dictionary, nullptr); + operator()(c_export->dictionary, *expected_data.dictionary->data(), true); + } else { + ASSERT_EQ(c_export->dictionary, nullptr); + } + + if (c_export->n_children > 0) { + ASSERT_NE(c_export->children, nullptr); + // Recurse into children + for (int64_t i = 0; i < c_export->n_children; ++i) { + ASSERT_NE(c_export->children[i], nullptr); + operator()(c_export->children[i], *expected_data.child_data[i], true); + } + } else { + ASSERT_EQ(c_export->children, nullptr); + } + + if (!inner) { + // Caller gave the right number of names and format strings + ASSERT_EQ(flattened_index_, flattened_formats_.size()); + ASSERT_EQ(flattened_index_, flattened_names_.size()); + ASSERT_EQ(flattened_index_, flattened_flags_.size()); + } + } + + const std::vector flattened_formats_; + const std::vector flattened_names_; + std::vector flattened_flags_; + const std::vector flattened_metadata_; + size_t flattened_index_; +}; + +class TestExport : public ::testing::Test { + public: + void SetUp() override { pool_ = default_memory_pool(); } + + static std::function*)> JSONArrayFactory( + std::shared_ptr type, const char* json) { + return [=](std::shared_ptr* out) -> Status { + return ::arrow::ipc::internal::json::ArrayFromJSON(type, json, out); + }; + } + + template + void TestWithArrayFactory(ArrayFactory&& factory, ExportCheckFunc&& func) { + auto orig_bytes = 
pool_->bytes_allocated(); + + std::shared_ptr arr; + ASSERT_OK(factory(&arr)); + const ArrayData& data = *arr->data(); // non-owning reference + struct ArrowArray c_export; + ASSERT_OK(ExportArray(*arr, &c_export)); + + ExportGuard guard(&c_export); + auto new_bytes = pool_->bytes_allocated(); + ASSERT_GT(new_bytes, orig_bytes); + + // Release the shared_ptr, underlying data should be held alive + arr.reset(); + ASSERT_EQ(pool_->bytes_allocated(), new_bytes); + func(&c_export, data); + + // Release the ArrowArray, underlying data should be destroyed + guard.Release(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestNested(ArrayFactory&& factory, std::vector flattened_formats, + std::vector flattened_names, + std::vector flattened_flags = {}, + std::vector flattened_metadata = {}) { + ExportChecker checker(std::move(flattened_formats), std::move(flattened_names), + std::move(flattened_flags), std::move(flattened_metadata)); + + TestWithArrayFactory(std::move(factory), checker); + } + + void TestNested(const std::shared_ptr& type, const char* json, + std::vector flattened_formats, + std::vector flattened_names, + std::vector flattened_flags = {}, + std::vector flattened_metadata = {}) { + TestNested(JSONArrayFactory(type, json), std::move(flattened_formats), + std::move(flattened_names), std::move(flattened_flags), + std::move(flattened_metadata)); + } + + template + void TestPrimitive(ArrayFactory&& factory, const char* format) { + TestNested(std::forward(factory), {format}, {""}); + } + + void TestPrimitive(const std::shared_ptr& type, const char* json, + const char* format) { + TestNested(type, json, {format}, {""}); + } + + template + void TestMoveWithArrayFactory(ArrayFactory&& factory, ExportCheckFunc&& func) { + auto orig_bytes = pool_->bytes_allocated(); + + std::shared_ptr arr; + ASSERT_OK(factory(&arr)); + const ArrayData& data = *arr->data(); // non-owning reference + struct ArrowArray c_export_temp, c_export_final; + 
ASSERT_OK(ExportArray(*arr, &c_export_temp)); + + // Move the ArrowArray to its final location + ArrowMoveArray(&c_export_temp, &c_export_final); + ASSERT_EQ(c_export_temp.format, nullptr); // released + + ExportGuard guard(&c_export_final); + auto new_bytes = pool_->bytes_allocated(); + ASSERT_GT(new_bytes, orig_bytes); + + // Release the shared_ptr, underlying data should be held alive + arr.reset(); + ASSERT_EQ(pool_->bytes_allocated(), new_bytes); + func(&c_export_final, data); + + // Release the ArrowArray, underlying data should be destroyed + guard.Release(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestMoveNested(ArrayFactory&& factory, std::vector flattened_formats, + std::vector flattened_names) { + ExportChecker checker(std::move(flattened_formats), std::move(flattened_names)); + + TestMoveWithArrayFactory(std::move(factory), checker); + } + + void TestMoveNested(const std::shared_ptr& type, const char* json, + std::vector flattened_formats, + std::vector flattened_names) { + TestMoveNested(JSONArrayFactory(type, json), std::move(flattened_formats), + std::move(flattened_names)); + } + + void TestMovePrimitive(const std::shared_ptr& type, const char* json, + const char* format) { + TestMoveNested(type, json, {format}, {""}); + } + + template + void TestMoveChildWithArrayFactory(ArrayFactory&& factory, int64_t child_id, + ExportCheckFunc&& func) { + auto orig_bytes = pool_->bytes_allocated(); + + std::shared_ptr arr; + ASSERT_OK(factory(&arr)); + struct ArrowArray c_export_parent, c_export_child; + ASSERT_OK(ExportArray(*arr, &c_export_parent)); + + auto bytes_with_parent = pool_->bytes_allocated(); + ASSERT_GT(bytes_with_parent, orig_bytes); + + // Move the child ArrowArray to its final location + { + ExportGuard parent_guard(&c_export_parent); + ASSERT_LT(child_id, c_export_parent.n_children); + ArrowMoveArray(c_export_parent.children[child_id], &c_export_child); + } + ExportGuard child_guard(&c_export_child); + + // 
Now parent is released + ASSERT_EQ(c_export_parent.format, nullptr); + auto bytes_with_child = pool_->bytes_allocated(); + ASSERT_LT(bytes_with_child, bytes_with_parent); + ASSERT_GT(bytes_with_child, orig_bytes); + + // Release the shared_ptr, some underlying data should be held alive + const ArrayData& data = *arr->data()->child_data[child_id]; // non-owning reference + arr.reset(); + ASSERT_LT(pool_->bytes_allocated(), bytes_with_child); + ASSERT_GT(pool_->bytes_allocated(), orig_bytes); + func(&c_export_child, data); + + // Release the ArrowArray, underlying data should be destroyed + child_guard.Release(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestMoveChild(ArrayFactory&& factory, int64_t child_id, + std::vector flattened_formats, + std::vector flattened_names) { + ExportChecker checker(std::move(flattened_formats), std::move(flattened_names)); + + TestMoveChildWithArrayFactory(std::move(factory), child_id, checker); + } + + void TestMoveChild(const std::shared_ptr& type, const char* json, + int64_t child_id, std::vector flattened_formats, + std::vector flattened_names) { + TestMoveChild(JSONArrayFactory(type, json), child_id, std::move(flattened_formats), + std::move(flattened_names)); + } + + protected: + MemoryPool* pool_; +}; + +TEST_F(TestExport, Primitive) { + TestPrimitive(int8(), "[1, 2, null, -3]", "c"); + TestPrimitive(int16(), "[1, 2, -3]", "s"); + TestPrimitive(int32(), "[1, 2, null, -3]", "i"); + TestPrimitive(int64(), "[1, 2, -3]", "l"); + TestPrimitive(uint8(), "[1, 2, 3]", "C"); + TestPrimitive(uint16(), "[1, 2, null, 3]", "S"); + TestPrimitive(uint32(), "[1, 2, 3]", "I"); + TestPrimitive(uint64(), "[1, 2, null, 3]", "L"); + + TestPrimitive(boolean(), "[true, false, null]", "b"); + TestPrimitive(null(), "[null, null]", "n"); + + TestPrimitive(float32(), "[1.5, null]", "f"); + TestPrimitive(float64(), "[1.5, null]", "g"); + + TestPrimitive(fixed_size_binary(3), R"(["foo", "bar", null])", "w:3"); + 
TestPrimitive(binary(), R"(["foo", "bar", null])", "z"); + TestPrimitive(large_binary(), R"(["foo", "bar", null])", "Z"); + TestPrimitive(utf8(), R"(["foo", "bar", null])", "u"); + TestPrimitive(large_utf8(), R"(["foo", "bar", null])", "U"); + + TestPrimitive(decimal(16, 4), R"(["1234.5670", null])", "d:16,4"); +} + +TEST_F(TestExport, PrimitiveSliced) { + auto factory = [](std::shared_ptr* out) -> Status { + *out = ArrayFromJSON(int16(), "[1, 2, null, -3]")->Slice(1, 2); + return Status::OK(); + }; + + TestPrimitive(factory, "s"); +} + +TEST_F(TestExport, Null) { + TestPrimitive(null(), "[null, null, null]", "n"); + TestPrimitive(null(), "[]", "n"); +} + +TEST_F(TestExport, Temporal) { + const char* json = "[1, 2, null, 42]"; + TestPrimitive(date32(), json, "tdD"); + TestPrimitive(date64(), json, "tdm"); + TestPrimitive(time32(TimeUnit::SECOND), json, "tts"); + TestPrimitive(time32(TimeUnit::MILLI), json, "ttm"); + TestPrimitive(time64(TimeUnit::MICRO), json, "ttu"); + TestPrimitive(time64(TimeUnit::NANO), json, "ttn"); + TestPrimitive(duration(TimeUnit::SECOND), json, "tDs"); + TestPrimitive(duration(TimeUnit::MILLI), json, "tDm"); + TestPrimitive(duration(TimeUnit::MICRO), json, "tDu"); + TestPrimitive(duration(TimeUnit::NANO), json, "tDn"); + TestPrimitive(month_interval(), json, "tiM"); + + TestPrimitive(day_time_interval(), "[[7, 600], null]", "tiD"); + + json = R"(["1970-01-01","2000-02-29","1900-02-28"])"; + TestPrimitive(timestamp(TimeUnit::SECOND), json, "tss:"); + TestPrimitive(timestamp(TimeUnit::SECOND, "Europe/Paris"), json, "tss:Europe/Paris"); + TestPrimitive(timestamp(TimeUnit::MILLI), json, "tsm:"); + TestPrimitive(timestamp(TimeUnit::MILLI, "Europe/Paris"), json, "tsm:Europe/Paris"); + TestPrimitive(timestamp(TimeUnit::MICRO), json, "tsu:"); + TestPrimitive(timestamp(TimeUnit::MICRO, "Europe/Paris"), json, "tsu:Europe/Paris"); + TestPrimitive(timestamp(TimeUnit::NANO), json, "tsn:"); + TestPrimitive(timestamp(TimeUnit::NANO, "Europe/Paris"), 
json, "tsn:Europe/Paris"); +} + +TEST_F(TestExport, List) { + TestNested(list(int8()), "[[1, 2], [3, null], null]", {"+l", "c"}, {"", "item"}); + TestNested(large_list(uint16()), "[[1, 2], [3, null], null]", {"+L", "S"}, + {"", "item"}); + TestNested(fixed_size_list(int64(), 2), "[[1, 2], [3, null], null]", {"+w:2", "l"}, + {"", "item"}); + + TestNested(list(large_list(int32())), "[[[1, 2], [3], null], null]", {"+l", "+L", "i"}, + {"", "item", "item"}); +} + +TEST_F(TestExport, ListSliced) { + { + auto factory = [](std::shared_ptr* out) -> Status { + *out = ArrayFromJSON(list(int8()), "[[1, 2], [3, null], [4, 5, 6], null]") + ->Slice(1, 2); + return Status::OK(); + }; + TestNested(factory, {"+l", "c"}, {"", "item"}); + } + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(int16(), "[1, 2, 3, 4, null, 5, 6, 7, 8]")->Slice(1, 6); + auto offsets = ArrayFromJSON(int32(), "[0, 2, 3, 5, 6]")->Slice(2, 4); + return ListArray::FromArrays(*offsets, *values, default_memory_pool(), out); + }; + TestNested(factory, {"+l", "s"}, {"", "item"}); + } +} + +TEST_F(TestExport, Struct) { + const char* data = R"([[1, "foo"], [2, null]])"; + auto type = struct_({field("a", int8()), field("b", utf8())}); + TestNested(type, data, {"+s", "c", "u"}, {"", "a", "b"}, + {ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE}); + + // With nullable = false + type = struct_({field("a", int8(), /*nullable=*/false), field("b", utf8())}); + TestNested(type, data, {"+s", "c", "u"}, {"", "a", "b"}, + {ARROW_FLAG_NULLABLE, 0, ARROW_FLAG_NULLABLE}); + + // With metadata + auto f0 = type->child(0); + auto f1 = type->child(1)->WithMetadata( + key_value_metadata(kMetadataKeys1, kMetadataValues1)); + type = struct_({f0, f1}); + TestNested(type, data, {"+s", "c", "u"}, {"", "a", "b"}, + {ARROW_FLAG_NULLABLE, 0, ARROW_FLAG_NULLABLE}, + {"", "", kEncodedMetadata1}); +} + +TEST_F(TestExport, Map) { + const char* json = R"([[[1, "foo"], [2, null]], [[3, "bar"]]])"; + 
TestNested(map(int8(), utf8()), json, + {"+m", "+s", "c", "u"}, {"", "entries", "key", "value"}, + {ARROW_FLAG_NULLABLE, 0, 0, ARROW_FLAG_NULLABLE}); + TestNested(map(int8(), utf8(), /*keys_sorted=*/ true), json, + {"+m", "+s", "c", "u"}, {"", "entries", "key", "value"}, + {ARROW_FLAG_NULLABLE | ARROW_FLAG_MAP_KEYS_SORTED, 0, 0, + ARROW_FLAG_NULLABLE}); +} + +TEST_F(TestExport, Union) { + const char* data = "[null, [42, 1], [43, true], [42, null], [42, 2]]"; + // Dense + auto field_a = field("a", int8()); + auto field_b = field("b", boolean(), /*nullable=*/false); + auto type = union_({field_a, field_b}, {42, 43}, UnionMode::DENSE); + TestNested(type, data, {"+ud:42,43", "c", "b"}, {"", "a", "b"}, + {ARROW_FLAG_NULLABLE, ARROW_FLAG_NULLABLE, 0}); + // Sparse + field_a = field("a", int8(), /*nullable=*/false); + field_b = field("b", boolean()); + type = union_({field_a, field_b}, {42, 43}, UnionMode::SPARSE); + TestNested(type, data, {"+us:42,43", "c", "b"}, {"", "a", "b"}, + {ARROW_FLAG_NULLABLE, 0, ARROW_FLAG_NULLABLE}); +} + +TEST_F(TestExport, Dictionary) { + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + return DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), + indices, values, out); + }; + TestNested(factory, {"i", "u"}, {"", ""}); + } + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + return DictionaryArray::FromArrays( + dictionary(indices->type(), values->type(), /*ordered=*/true), indices, values, + out); + }; + TestNested(factory, {"i", "+l", "u"}, {"", "", "item"}, + {ARROW_FLAG_NULLABLE | ARROW_FLAG_DICTIONARY_ORDERED, ARROW_FLAG_NULLABLE, + ARROW_FLAG_NULLABLE}); + } + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = 
ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + std::shared_ptr dict_array; + RETURN_NOT_OK(DictionaryArray::FromArrays( + dictionary(indices->type(), values->type()), indices, values, &dict_array)); + auto offsets = ArrayFromJSON(int64(), "[0, 2, 5]"); + RETURN_NOT_OK( + LargeListArray::FromArrays(*offsets, *dict_array, default_memory_pool(), out)); + return (*out)->ValidateFull(); + }; + TestNested(factory, {"+L", "i", "+l", "u"}, {"", "item", "", "item"}); + } +} + +TEST_F(TestExport, MovePrimitive) { + TestMovePrimitive(int8(), "[1, 2, null, -3]", "c"); + TestMovePrimitive(fixed_size_binary(3), R"(["foo", "bar", null])", "w:3"); + TestMovePrimitive(binary(), R"(["foo", "bar", null])", "z"); +} + +TEST_F(TestExport, MoveNested) { + TestMoveNested(list(int8()), "[[1, 2], [3, null], null]", {"+l", "c"}, {"", "item"}); + TestMoveNested(list(large_list(int32())), "[[[1, 2], [3], null], null]", + {"+l", "+L", "i"}, {"", "item", "item"}); + TestMoveNested(struct_({field("a", int8()), field("b", utf8())}), + R"([[1, "foo"], [2, null]])", {"+s", "c", "u"}, {"", "a", "b"}); +} + +TEST_F(TestExport, MoveDictionary) { + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + return DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), + indices, values, out); + }; + TestMoveNested(factory, {"i", "u"}, {"", ""}); + } + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + std::shared_ptr dict_array; + RETURN_NOT_OK(DictionaryArray::FromArrays( + dictionary(indices->type(), values->type()), indices, values, &dict_array)); + auto offsets = ArrayFromJSON(int64(), "[0, 2, 5]"); + RETURN_NOT_OK( + 
LargeListArray::FromArrays(*offsets, *dict_array, default_memory_pool(), out)); + return (*out)->ValidateFull(); + }; + TestMoveNested(factory, {"+L", "i", "+l", "u"}, {"", "item", "", "item"}); + } +} + +TEST_F(TestExport, MoveChild) { + TestMoveChild(list(int8()), "[[1, 2], [3, null], null]", /*child_id=*/0, {"c"}, + {"item"}); + TestMoveChild(list(large_list(int32())), "[[[1, 2], [3], null], null]", + /*child_id=*/0, {"+L", "i"}, {"item", "item"}); + TestMoveChild(struct_({field("ints", int8()), field("strs", utf8())}), + R"([[1, "foo"], [2, null]])", + /*child_id=*/0, {"c"}, {"ints"}); + TestMoveChild(struct_({field("ints", int8()), field("strs", utf8())}), + R"([[1, "foo"], [2, null]])", + /*child_id=*/1, {"u"}, {"strs"}); + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + std::shared_ptr dict_array; + RETURN_NOT_OK(DictionaryArray::FromArrays( + dictionary(indices->type(), values->type()), indices, values, &dict_array)); + auto offsets = ArrayFromJSON(int64(), "[0, 2, 5]"); + RETURN_NOT_OK( + LargeListArray::FromArrays(*offsets, *dict_array, default_memory_pool(), out)); + return (*out)->ValidateFull(); + }; + TestMoveChild(factory, /*child_id=*/0, {"i", "+l", "u"}, {"item", "", "item"}); + } +} + +TEST_F(TestExport, WithField) { + struct ArrowArray c_export; + { + auto arr = ArrayFromJSON(null(), "[null, null, null]"); + auto f = field("thing", null()); + ASSERT_OK(ExportArray(*f, *arr, &c_export)); + + ExportGuard guard(&c_export); + const auto& data = *arr->data(); + arr.reset(); + f.reset(); + ExportChecker({"n"}, {"thing"}, {ARROW_FLAG_NULLABLE})(&c_export, data); + } + { + // With nullable = false + auto arr = ArrayFromJSON(null(), "[null, null, null]"); + auto f = field("thing", null(), /*nullable=*/false); + ASSERT_OK(ExportArray(*f, *arr, &c_export)); + + ExportGuard guard(&c_export); + 
ExportChecker({"n"}, {"thing"}, {0})(&c_export, *arr->data()); + } + { + // With metadata + auto arr = ArrayFromJSON(null(), "[null, null, null]"); + auto f = field("thing", null(), /*nullable=*/false); + f = f->WithMetadata(key_value_metadata(kMetadataKeys1, kMetadataValues1)); + ASSERT_OK(ExportArray(*f, *arr, &c_export)); + + ExportGuard guard(&c_export); + ExportChecker({"n"}, {"thing"}, {0}, {kEncodedMetadata1})(&c_export, *arr->data()); + } +} + +TEST_F(TestExport, AsRecordBatch) { + struct ArrowArray c_export; + + auto schema = ::arrow::schema( + {field("ints", int16()), field("bools", boolean(), /*nullable=*/false)}); + auto arr0 = ArrayFromJSON(int16(), "[1, 2, null]"); + auto arr1 = ArrayFromJSON(boolean(), "[false, true, false]"); + + { + auto batch = RecordBatch::Make(schema, 3, {arr0, arr1}); + ASSERT_OK(ExportRecordBatch(*batch, &c_export)); + ExportGuard guard(&c_export); + + ASSERT_EQ(c_export.null_count, 0); + StructArray expected(struct_(schema->fields()), 3, batch->columns()); + ASSERT_EQ(expected.null_count(), 0); // compute null count for comparison below + ExportChecker({"+s", "s", "b"}, {"", "ints", "bools"}, + {0, ARROW_FLAG_NULLABLE, 0})(&c_export, *expected.data()); + } + { + // With schema and field metadata + auto f0 = schema->field(0); + auto f1 = schema->field(1); + f1 = f1->WithMetadata(key_value_metadata(kMetadataKeys1, kMetadataValues1)); + schema = ::arrow::schema({f0, f1}, + key_value_metadata(kMetadataKeys2, kMetadataValues2)); + auto batch = RecordBatch::Make(schema, 3, {arr0, arr1}); + ASSERT_OK(ExportRecordBatch(*batch, &c_export)); + ExportGuard guard(&c_export); + + ASSERT_EQ(c_export.null_count, 0); + StructArray expected(struct_(schema->fields()), 3, batch->columns()); + ASSERT_EQ(expected.null_count(), 0); // compute null count for comparison below + ExportChecker( + {"+s", "s", "b"}, {"", "ints", "bools"}, {0, ARROW_FLAG_NULLABLE, 0}, + {kEncodedMetadata2, "", kEncodedMetadata1})(&c_export, *expected.data()); + } +} + 
+//////////////////////////////////////////////////////////////////////////// +// Import tests + +// [true, false, true, true, false, true, true, true] * 2 +static const uint8_t bits_buffer1[] = {0xed, 0xed}; + +static const void* buffers_no_nulls_no_data[1] = {nullptr}; +static const void* buffers_nulls_no_data1[1] = {bits_buffer1}; + +static const uint8_t data_buffer1[] = {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}; +static const uint8_t data_buffer2[] = "abcdefghijklmnopqrstuvwxyz"; +static const uint64_t data_buffer3[] = {123456789, 0, 987654321, 0}; +static const uint8_t data_buffer4[] = {1, 2, 0, 1, 3, 0}; +static const float data_buffer5[] = {0.0f, 1.5f, -2.0f, 3.0f, 4.0f, 5.0f}; +static const double data_buffer6[] = {0.0, 1.5, -2.0, 3.0, 4.0, 5.0}; +static const int32_t data_buffer7[] = {1234, 5678, 9012, 3456}; +static const int64_t data_buffer8[] = {123456789, 987654321, -123456789, -987654321}; +static const void* primitive_buffers_no_nulls1[2] = {nullptr, data_buffer1}; +static const void* primitive_buffers_nulls1[2] = {bits_buffer1, data_buffer1}; +static const void* primitive_buffers_no_nulls2[2] = {nullptr, data_buffer2}; +static const void* primitive_buffers_no_nulls3[2] = {nullptr, data_buffer3}; +static const void* primitive_buffers_no_nulls4[2] = {nullptr, data_buffer4}; +static const void* primitive_buffers_no_nulls5[2] = {nullptr, data_buffer5}; +static const void* primitive_buffers_no_nulls6[2] = {nullptr, data_buffer6}; +static const void* primitive_buffers_no_nulls7[2] = {nullptr, data_buffer7}; +static const void* primitive_buffers_nulls7[2] = {bits_buffer1, data_buffer7}; +static const void* primitive_buffers_no_nulls8[2] = {nullptr, data_buffer8}; +static const void* primitive_buffers_nulls8[2] = {bits_buffer1, data_buffer8}; + +static const int64_t timestamp_data_buffer1[] = {0, 951782400, -2203977600LL}; +static const int64_t timestamp_data_buffer2[] = {0, 951782400000LL, -2203977600000LL}; +static const int64_t 
timestamp_data_buffer3[] = {0, 951782400000000LL, + -2203977600000000LL}; +static const int64_t timestamp_data_buffer4[] = {0, 951782400000000000LL, + -2203977600000000000LL}; +static const void* timestamp_buffers_no_nulls1[2] = {nullptr, timestamp_data_buffer1}; +static const void* timestamp_buffers_nulls1[2] = {bits_buffer1, timestamp_data_buffer1}; +static const void* timestamp_buffers_no_nulls2[2] = {nullptr, timestamp_data_buffer2}; +static const void* timestamp_buffers_no_nulls3[2] = {nullptr, timestamp_data_buffer3}; +static const void* timestamp_buffers_no_nulls4[2] = {nullptr, timestamp_data_buffer4}; + +static const uint8_t string_data_buffer1[] = "foobarquux"; + +static const int32_t string_offsets_buffer1[] = {0, 3, 3, 6, 10}; +static const void* string_buffers_no_nulls1[3] = {nullptr, string_offsets_buffer1, + string_data_buffer1}; + +static const int64_t large_string_offsets_buffer1[] = {0, 3, 3, 6, 10}; +static const void* large_string_buffers_no_nulls1[3] = { + nullptr, large_string_offsets_buffer1, string_data_buffer1}; + +static const int32_t list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; +static const void* list_buffers_no_nulls1[2] = {nullptr, list_offsets_buffer1}; + +static const int64_t large_list_offsets_buffer1[] = {0, 2, 2, 5, 6, 8}; +static const void* large_list_buffers_no_nulls1[2] = {nullptr, + large_list_offsets_buffer1}; + +static const int8_t type_codes_buffer1[] = {42, 42, 43, 43, 42}; +static const int32_t union_offsets_buffer1[] = {0, 1, 0, 1, 2}; +static const void* sparse_union_buffers_no_nulls1[3] = {nullptr, type_codes_buffer1, + nullptr}; +static const void* dense_union_buffers_no_nulls1[3] = {nullptr, type_codes_buffer1, + union_offsets_buffer1}; + +class TestImport : public ::testing::Test { + public: + void SetUp() override { + memset(&c_struct_, 0, sizeof(c_struct_)); + c_struct_.name = ""; + } + + // Create a new ArrowArray struct with a stable C pointer + struct ArrowArray* AddChild() { + nested_structs_.emplace_back(); 
+ struct ArrowArray* result = &nested_structs_.back(); + memset(result, 0, sizeof(*result)); + return result; + } + + // Create a stable C pointer to the N last structs in nested_structs_ + struct ArrowArray** NLastChildren(int64_t n_children, struct ArrowArray* parent) { + children_arrays_.emplace_back(n_children); + struct ArrowArray** children = children_arrays_.back().data(); + int64_t nested_offset; + // If parent is itself at the end of nested_structs_, skip it + if (parent != nullptr && &nested_structs_.back() == parent) { + nested_offset = static_cast(nested_structs_.size()) - n_children - 1; + } else { + nested_offset = static_cast(nested_structs_.size()) - n_children; + } + for (int64_t i = 0; i < n_children; ++i) { + children[i] = &nested_structs_[nested_offset + i]; + } + return children; + } + + struct ArrowArray* LastChild(struct ArrowArray* parent) { + return *NLastChildren(1, parent); + } + + void FillPrimitive(struct ArrowArray* c, const char* format, int64_t length, + int64_t null_count, int64_t offset, const void** buffers, + int64_t flags = kDefaultFlags) { + c->flags = flags; + c->format = format; + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 2; + c->buffers = buffers; + } + + void FillDictionary(struct ArrowArray* c) { c->dictionary = LastChild(c); } + + void FillStringLike(struct ArrowArray* c, const char* format, int64_t length, + int64_t null_count, int64_t offset, const void** buffers, + int64_t flags = kDefaultFlags) { + c->flags = flags; + c->format = format; + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 3; + c->buffers = buffers; + } + + void FillListLike(struct ArrowArray* c, const char* format, int64_t length, + int64_t null_count, int64_t offset, const void** buffers, + int64_t flags = kDefaultFlags) { + c->flags = flags; + c->format = format; + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 2; + 
c->buffers = buffers; + c->n_children = 1; + c->children = NLastChildren(1, c); + c->children[0]->name = "item"; + } + + void FillFixedSizeListLike(struct ArrowArray* c, const char* format, int64_t length, + int64_t null_count, int64_t offset, const void** buffers, + int64_t flags = kDefaultFlags) { + c->flags = flags; + c->format = format; + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 1; + c->buffers = buffers; + c->n_children = 1; + c->children = NLastChildren(1, c); + c->children[0]->name = "item"; + } + + void FillStructLike(struct ArrowArray* c, const char* format, int64_t length, + int64_t null_count, int64_t offset, + std::vector child_names, const void** buffers, + int64_t flags = kDefaultFlags) { + c->flags = flags; + c->format = format; + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 1; + c->buffers = buffers; + c->n_children = static_cast(child_names.size()); + c->children = NLastChildren(c->n_children, c); + for (int64_t i = 0; i < c->n_children; ++i) { + children_names_.push_back(std::move(child_names[i])); + c->children[i]->name = children_names_.back().c_str(); + } + } + + void FillUnionLike(struct ArrowArray* c, const char* format, int64_t length, + int64_t null_count, int64_t offset, + std::vector child_names, const void** buffers, + int64_t flags = kDefaultFlags) { + c->flags = flags; + c->format = format; + c->length = length; + c->null_count = null_count; + c->offset = offset; + c->n_buffers = 3; + c->buffers = buffers; + c->n_children = static_cast(child_names.size()); + c->children = NLastChildren(c->n_children, c); + for (int64_t i = 0; i < c->n_children; ++i) { + children_names_.push_back(std::move(child_names[i])); + c->children[i]->name = children_names_.back().c_str(); + } + } + + void FillPrimitive(const char* format, int64_t length, int64_t null_count, + int64_t offset, const void** buffers, + int64_t flags = kDefaultFlags) { + 
FillPrimitive(&c_struct_, format, length, null_count, offset, buffers, flags); + } + + void FillDictionary() { FillDictionary(&c_struct_); } + + void FillStringLike(const char* format, int64_t length, int64_t null_count, + int64_t offset, const void** buffers, + int64_t flags = kDefaultFlags) { + FillStringLike(&c_struct_, format, length, null_count, offset, buffers, flags); + } + + void FillListLike(const char* format, int64_t length, int64_t null_count, + int64_t offset, const void** buffers, int64_t flags = kDefaultFlags) { + FillListLike(&c_struct_, format, length, null_count, offset, buffers, flags); + } + + void FillFixedSizeListLike(const char* format, int64_t length, int64_t null_count, + int64_t offset, const void** buffers, + int64_t flags = kDefaultFlags) { + FillFixedSizeListLike(&c_struct_, format, length, null_count, offset, buffers, flags); + } + + void FillStructLike(const char* format, int64_t length, int64_t null_count, + int64_t offset, std::vector child_names, + const void** buffers, int64_t flags = kDefaultFlags) { + FillStructLike(&c_struct_, format, length, null_count, offset, std::move(child_names), + buffers, flags); + } + + void FillUnionLike(const char* format, int64_t length, int64_t null_count, + int64_t offset, std::vector child_names, + const void** buffers, int64_t flags = kDefaultFlags) { + FillUnionLike(&c_struct_, format, length, null_count, offset, std::move(child_names), + buffers, flags); + } + + void CheckImport(const std::shared_ptr& expected) { + ReleaseCallback cb(&c_struct_); + + ASSERT_OK_AND_ASSIGN(auto array, ImportArray(&c_struct_)); + ASSERT_TRUE(ArrowIsReleased(&c_struct_)); // was moved + ASSERT_OK(array->ValidateFull()); + // Special case: Null array doesn't have any data, so it needn't + // keep the ArrowArray struct alive. 
+ if (expected->type_id() != Type::NA) { + cb.AssertNotCalled(); + } + AssertArraysEqual(*expected, *array, true); + array.reset(); + cb.AssertCalled(); + } + + void CheckImportError() { + ReleaseCallback cb(&c_struct_); + + ASSERT_RAISES(Invalid, ImportArray(&c_struct_)); + ASSERT_TRUE(ArrowIsReleased(&c_struct_)); // was moved + // The ArrowArray should have been released. + cb.AssertCalled(); + } + + void CheckImportAsRecordBatchError() { + ReleaseCallback cb(&c_struct_); + + ASSERT_RAISES(Invalid, ImportRecordBatch(&c_struct_)); + ASSERT_TRUE(ArrowIsReleased(&c_struct_)); // was moved + // The ArrowArray should have been released. + cb.AssertCalled(); + } + + protected: + struct ArrowArray c_struct_; + // Deque elements don't move when the deque is appended to, which allows taking + // stable C pointers to them. + std::deque nested_structs_; + std::deque> children_arrays_; + std::deque children_names_; +}; + +TEST_F(TestImport, Primitive) { + FillPrimitive("c", 3, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(int8(), "[1, 2, 3]")); + FillPrimitive("C", 5, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(uint8(), "[1, 2, 3, 4, 5]")); + FillPrimitive("s", 3, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(int16(), "[513, 1027, 1541]")); + FillPrimitive("S", 3, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(uint16(), "[513, 1027, 1541]")); + FillPrimitive("i", 2, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(int32(), "[67305985, 134678021]")); + FillPrimitive("I", 2, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(uint32(), "[67305985, 134678021]")); + FillPrimitive("l", 2, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(int64(), "[578437695752307201, 1157159078456920585]")); + FillPrimitive("L", 2, 0, 0, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(uint64(), "[578437695752307201, 1157159078456920585]")); + + FillPrimitive("b", 3, 0, 0, 
primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(boolean(), "[true, false, false]")); + FillPrimitive("f", 6, 0, 0, primitive_buffers_no_nulls5); + CheckImport(ArrayFromJSON(float32(), "[0.0, 1.5, -2.0, 3.0, 4.0, 5.0]")); + FillPrimitive("g", 6, 0, 0, primitive_buffers_no_nulls6); + CheckImport(ArrayFromJSON(float64(), "[0.0, 1.5, -2.0, 3.0, 4.0, 5.0]")); + + // With nulls + FillPrimitive("c", 9, -1, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(int8(), "[1, null, 3, 4, null, 6, 7, 8, 9]")); + FillPrimitive("c", 9, 2, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(int8(), "[1, null, 3, 4, null, 6, 7, 8, 9]")); + FillPrimitive("s", 3, -1, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(int16(), "[513, null, 1541]")); + FillPrimitive("s", 3, 1, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(int16(), "[513, null, 1541]")); + FillPrimitive("b", 3, -1, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(boolean(), "[true, null, false]")); + FillPrimitive("b", 3, 1, 0, primitive_buffers_nulls1); + CheckImport(ArrayFromJSON(boolean(), "[true, null, false]")); +} + +TEST_F(TestImport, Temporal) { + FillPrimitive("tdD", 3, 0, 0, primitive_buffers_no_nulls7); + CheckImport(ArrayFromJSON(date32(), "[1234, 5678, 9012]")); + FillPrimitive("tdm", 3, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(date64(), "[123456789, 987654321, -123456789]")); + + FillPrimitive("tts", 2, 0, 0, primitive_buffers_no_nulls7); + CheckImport(ArrayFromJSON(time32(TimeUnit::SECOND), "[1234, 5678]")); + FillPrimitive("ttm", 2, 0, 0, primitive_buffers_no_nulls7); + CheckImport(ArrayFromJSON(time32(TimeUnit::MILLI), "[1234, 5678]")); + FillPrimitive("ttu", 2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(time64(TimeUnit::MICRO), "[123456789, 987654321]")); + FillPrimitive("ttn", 2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(time64(TimeUnit::NANO), "[123456789, 987654321]")); + + 
FillPrimitive("tDs", 2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(duration(TimeUnit::SECOND), "[123456789, 987654321]")); + FillPrimitive("tDm", 2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(duration(TimeUnit::MILLI), "[123456789, 987654321]")); + FillPrimitive("tDu", 2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(duration(TimeUnit::MICRO), "[123456789, 987654321]")); + FillPrimitive("tDn", 2, 0, 0, primitive_buffers_no_nulls8); + CheckImport(ArrayFromJSON(duration(TimeUnit::NANO), "[123456789, 987654321]")); + + FillPrimitive("tiM", 3, 0, 0, primitive_buffers_no_nulls7); + CheckImport(ArrayFromJSON(month_interval(), "[1234, 5678, 9012]")); + FillPrimitive("tiD", 2, 0, 0, primitive_buffers_no_nulls7); + CheckImport(ArrayFromJSON(day_time_interval(), "[[1234, 5678], [9012, 3456]]")); + + const char* json = R"(["1970-01-01","2000-02-29","1900-02-28"])"; + FillPrimitive("tss:", 3, 0, 0, timestamp_buffers_no_nulls1); + CheckImport(ArrayFromJSON(timestamp(TimeUnit::SECOND), json)); + FillPrimitive("tsm:", 3, 0, 0, timestamp_buffers_no_nulls2); + CheckImport(ArrayFromJSON(timestamp(TimeUnit::MILLI), json)); + FillPrimitive("tsu:", 3, 0, 0, timestamp_buffers_no_nulls3); + CheckImport(ArrayFromJSON(timestamp(TimeUnit::MICRO), json)); + FillPrimitive("tsn:", 3, 0, 0, timestamp_buffers_no_nulls4); + CheckImport(ArrayFromJSON(timestamp(TimeUnit::NANO), json)); + + // With nulls + FillPrimitive("tdD", 3, -1, 0, primitive_buffers_nulls7); + CheckImport(ArrayFromJSON(date32(), "[1234, null, 9012]")); + FillPrimitive("tdm", 3, -1, 0, primitive_buffers_nulls8); + CheckImport(ArrayFromJSON(date64(), "[123456789, null, -123456789]")); + FillPrimitive("ttn", 2, -1, 0, primitive_buffers_nulls8); + CheckImport(ArrayFromJSON(time64(TimeUnit::NANO), "[123456789, null]")); + FillPrimitive("tDn", 2, -1, 0, primitive_buffers_nulls8); + CheckImport(ArrayFromJSON(duration(TimeUnit::NANO), "[123456789, null]")); + 
FillPrimitive("tiM", 3, -1, 0, primitive_buffers_nulls7); + CheckImport(ArrayFromJSON(month_interval(), "[1234, null, 9012]")); + FillPrimitive("tiD", 2, -1, 0, primitive_buffers_nulls7); + CheckImport(ArrayFromJSON(day_time_interval(), "[[1234, 5678], null]")); + FillPrimitive("tss:UTC+2", 3, -1, 0, timestamp_buffers_nulls1); + CheckImport(ArrayFromJSON(timestamp(TimeUnit::SECOND, "UTC+2"), + R"(["1970-01-01",null,"1900-02-28"])")); +} + +TEST_F(TestImport, Null) { + const void* buffers[] = {nullptr}; + c_struct_.format = "n"; + c_struct_.length = 3; + c_struct_.null_count = 3; + c_struct_.offset = 0; + c_struct_.n_buffers = 1; + c_struct_.buffers = buffers; + CheckImport(ArrayFromJSON(null(), "[null, null, null]")); +} + +TEST_F(TestImport, PrimitiveWithOffset) { + FillPrimitive("c", 3, 0, 2, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(int8(), "[3, 4, 5]")); + FillPrimitive("S", 3, 0, 1, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(uint16(), "[1027, 1541, 2055]")); + + FillPrimitive("b", 4, 0, 7, primitive_buffers_no_nulls1); + CheckImport(ArrayFromJSON(boolean(), "[false, false, true, false]")); +} + +TEST_F(TestImport, NullWithOffset) { + const void* buffers[] = {nullptr}; + c_struct_.format = "n"; + c_struct_.length = 3; + c_struct_.null_count = 3; + c_struct_.offset = 5; + c_struct_.n_buffers = 1; + c_struct_.buffers = buffers; + CheckImport(ArrayFromJSON(null(), "[null, null, null]")); +} + +TEST_F(TestImport, String) { + FillStringLike("u", 4, 0, 0, string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(utf8(), R"(["foo", "", "bar", "quux"])")); + FillStringLike("z", 4, 0, 0, string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(binary(), R"(["foo", "", "bar", "quux"])")); + FillStringLike("U", 4, 0, 0, large_string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_utf8(), R"(["foo", "", "bar", "quux"])")); + FillStringLike("Z", 4, 0, 0, large_string_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_binary(), R"(["foo", 
"", "bar", "quux"])")); + + FillPrimitive("w:3", 2, 0, 0, primitive_buffers_no_nulls2); + CheckImport(ArrayFromJSON(fixed_size_binary(3), R"(["abc", "def"])")); + FillPrimitive("d:15,4", 2, 0, 0, primitive_buffers_no_nulls3); + CheckImport(ArrayFromJSON(decimal(15, 4), R"(["12345.6789", "98765.4321"])")); +} + +TEST_F(TestImport, List) { + FillPrimitive(AddChild(), "c", 8, 0, 0, primitive_buffers_no_nulls1); + FillListLike("+l", 5, 0, 0, list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list(int8()), "[[1, 2], [], [3, 4, 5], [6], [7, 8]]")); + FillPrimitive(AddChild(), "s", 5, 0, 0, primitive_buffers_no_nulls1); + FillListLike("+l", 3, 0, 0, list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list(int16()), "[[513, 1027], [], [1541, 2055, 2569]]")); + + // Large list + FillPrimitive(AddChild(), "s", 5, 0, 0, primitive_buffers_no_nulls1); + FillListLike("+L", 3, 0, 0, large_list_buffers_no_nulls1); + CheckImport( + ArrayFromJSON(large_list(int16()), "[[513, 1027], [], [1541, 2055, 2569]]")); + + // Fixed-size list + FillPrimitive(AddChild(), "c", 9, 0, 0, primitive_buffers_no_nulls1); + FillFixedSizeListLike("+w:3", 3, 0, 0, buffers_no_nulls_no_data); + CheckImport( + ArrayFromJSON(fixed_size_list(int8(), 3), "[[1, 2, 3], [4, 5, 6], [7, 8, 9]]")); +} + +TEST_F(TestImport, NestedList) { + FillPrimitive(AddChild(), "c", 8, 0, 0, primitive_buffers_no_nulls1); + FillListLike(AddChild(), "+l", 5, 0, 0, list_buffers_no_nulls1); + FillListLike("+L", 3, 0, 0, large_list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(large_list(list(int8())), + "[[[1, 2], []], [], [[3, 4, 5], [6], [7, 8]]]")); + + FillPrimitive(AddChild(), "c", 6, 0, 0, primitive_buffers_no_nulls1); + FillFixedSizeListLike(AddChild(), "+w:3", 2, 0, 0, buffers_no_nulls_no_data); + FillListLike("+l", 2, 0, 0, list_buffers_no_nulls1); + CheckImport( + ArrayFromJSON(list(fixed_size_list(int8(), 3)), "[[[1, 2, 3], [4, 5, 6]], []]")); +} + +TEST_F(TestImport, ListWithOffset) { + // Offset in child + 
FillPrimitive(AddChild(), "c", 8, 0, 1, primitive_buffers_no_nulls1); + FillListLike("+l", 5, 0, 0, list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list(int8()), "[[2, 3], [], [4, 5, 6], [7], [8, 9]]")); + + FillPrimitive(AddChild(), "c", 9, 0, 1, primitive_buffers_no_nulls1); + FillFixedSizeListLike("+w:3", 3, 0, 0, buffers_no_nulls_no_data); + CheckImport( + ArrayFromJSON(fixed_size_list(int8(), 3), "[[2, 3, 4], [5, 6, 7], [8, 9, 10]]")); + + // Offset in parent + FillPrimitive(AddChild(), "c", 8, 0, 0, primitive_buffers_no_nulls1); + FillListLike("+l", 4, 0, 1, list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list(int8()), "[[], [3, 4, 5], [6], [7, 8]]")); + + FillPrimitive(AddChild(), "c", 9, 0, 0, primitive_buffers_no_nulls1); + FillFixedSizeListLike("+w:3", 3, 0, 1, buffers_no_nulls_no_data); + CheckImport( + ArrayFromJSON(fixed_size_list(int8(), 3), "[[4, 5, 6], [7, 8, 9], [10, 11, 12]]")); + + // Both + FillPrimitive(AddChild(), "c", 8, 0, 2, primitive_buffers_no_nulls1); + FillListLike("+l", 4, 0, 1, list_buffers_no_nulls1); + CheckImport(ArrayFromJSON(list(int8()), "[[], [5, 6, 7], [8], [9, 10]]")); + + FillPrimitive(AddChild(), "c", 9, 0, 2, primitive_buffers_no_nulls1); + FillFixedSizeListLike("+w:3", 3, 0, 1, buffers_no_nulls_no_data); + CheckImport(ArrayFromJSON(fixed_size_list(int8(), 3), + "[[6, 7, 8], [9, 10, 11], [12, 13, 14]]")); +} + +TEST_F(TestImport, Struct) { + FillStringLike(AddChild(), "u", 3, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), "S", 3, -1, 0, primitive_buffers_nulls1); + FillStructLike("+s", 3, 0, 0, {"strs", "ints"}, buffers_no_nulls_no_data); + auto expected = ArrayFromJSON(struct_({field("strs", utf8()), field("ints", uint16())}), + R"([["foo", 513], ["", null], ["bar", 1541]])"); + CheckImport(expected); + + FillStringLike(AddChild(), "u", 3, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), "S", 3, 0, 0, primitive_buffers_no_nulls1); + FillStructLike("+s", 3, -1, 0, {"strs", "ints"}, 
buffers_nulls_no_data1); + expected = ArrayFromJSON(struct_({field("strs", utf8()), field("ints", uint16())}), + R"([["foo", 513], null, ["bar", 1541]])"); + CheckImport(expected); + + FillStringLike(AddChild(), "u", 3, 0, 0, string_buffers_no_nulls1, /*flags=*/0); + FillPrimitive(AddChild(), "S", 3, 0, 0, primitive_buffers_no_nulls1); + FillStructLike("+s", 3, -1, 0, {"strs", "ints"}, buffers_nulls_no_data1); + expected = ArrayFromJSON( + struct_({field("strs", utf8(), /*nullable=*/false), field("ints", uint16())}), + R"([["foo", 513], null, ["bar", 1541]])"); + CheckImport(expected); +} + +TEST_F(TestImport, Union) { + // Sparse + FillStringLike(AddChild(), "u", 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), "c", 4, -1, 0, primitive_buffers_nulls1); + FillUnionLike("+us:43,42", 4, 0, 0, {"strs", "ints"}, sparse_union_buffers_no_nulls1); + auto type = + union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::SPARSE); + auto expected = + ArrayFromJSON(type, R"([[42, 1], [42, null], [43, "bar"], [43, "quux"]])"); + CheckImport(expected); + + // Dense + FillStringLike(AddChild(), "u", 2, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), "c", 3, -1, 0, primitive_buffers_nulls1); + FillUnionLike("+ud:43,42", 5, 0, 0, {"strs", "ints"}, dense_union_buffers_no_nulls1); + type = + union_({field("strs", utf8()), field("ints", int8())}, {43, 42}, UnionMode::DENSE); + expected = + ArrayFromJSON(type, R"([[42, 1], [42, null], [43, "foo"], [43, ""], [42, 3]])"); + CheckImport(expected); +} + +TEST_F(TestImport, StructWithOffset) { + // Child + FillStringLike(AddChild(), "u", 3, 0, 1, string_buffers_no_nulls1); + FillPrimitive(AddChild(), "c", 3, 0, 2, primitive_buffers_no_nulls1); + FillStructLike("+s", 3, 0, 0, {"strs", "ints"}, buffers_no_nulls_no_data); + auto expected = ArrayFromJSON(struct_({field("strs", utf8()), field("ints", int8())}), + R"([["", 3], ["bar", 4], ["quux", 5]])"); + CheckImport(expected); + + // Parent and 
child + FillStringLike(AddChild(), "u", 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), "c", 4, 0, 2, primitive_buffers_no_nulls1); + FillStructLike("+s", 3, 0, 1, {"strs", "ints"}, buffers_no_nulls_no_data); + expected = ArrayFromJSON(struct_({field("strs", utf8()), field("ints", int8())}), + R"([["", 4], ["bar", 5], ["quux", 6]])"); + CheckImport(expected); +} + +TEST_F(TestImport, Dictionary) { + FillStringLike(AddChild(), "u", 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive("c", 6, 0, 0, primitive_buffers_no_nulls4); + FillDictionary(); + + auto dict_values = ArrayFromJSON(utf8(), R"(["foo", "", "bar", "quux"])"); + auto indices = ArrayFromJSON(int8(), "[1, 2, 0, 1, 3, 0]"); + std::shared_ptr expected; + ASSERT_OK(DictionaryArray::FromArrays(dictionary(int8(), utf8()), indices, dict_values, + &expected)); + CheckImport(expected); + + FillStringLike(AddChild(), "u", 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive("c", 6, 0, 0, primitive_buffers_no_nulls4, + ARROW_FLAG_NULLABLE | ARROW_FLAG_DICTIONARY_ORDERED); + FillDictionary(); + + ASSERT_OK(DictionaryArray::FromArrays(dictionary(int8(), utf8(), /*ordered=*/true), + indices, dict_values, &expected)); + CheckImport(expected); +} + +TEST_F(TestImport, DictionaryWithOffset) { + FillStringLike(AddChild(), "u", 3, 0, 1, string_buffers_no_nulls1); + FillPrimitive("c", 3, 0, 0, primitive_buffers_no_nulls4); + FillDictionary(); + + auto dict_values = ArrayFromJSON(utf8(), R"(["", "bar", "quux"])"); + auto indices = ArrayFromJSON(int8(), "[1, 2, 0]"); + std::shared_ptr expected; + ASSERT_OK(DictionaryArray::FromArrays(dictionary(int8(), utf8()), indices, dict_values, + &expected)); + CheckImport(expected); + + FillStringLike(AddChild(), "u", 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive("c", 4, 0, 2, primitive_buffers_no_nulls4); + FillDictionary(); + + dict_values = ArrayFromJSON(utf8(), R"(["foo", "", "bar", "quux"])"); + indices = ArrayFromJSON(int8(), "[0, 1, 3, 0]"); + 
ASSERT_OK(DictionaryArray::FromArrays(dictionary(int8(), utf8()), indices, dict_values, + &expected)); + CheckImport(expected); +} + +TEST_F(TestImport, ErrorFormatString) { + FillPrimitive("cc", 3, 0, 0, primitive_buffers_no_nulls1); + CheckImportError(); + FillPrimitive("w3", 2, 0, 0, primitive_buffers_no_nulls2); + CheckImportError(); + FillPrimitive("w:three", 2, 0, 0, primitive_buffers_no_nulls2); + CheckImportError(); + FillPrimitive("w:3,5", 2, 0, 0, primitive_buffers_no_nulls2); + CheckImportError(); + FillPrimitive("d:15", 2, 0, 0, primitive_buffers_no_nulls3); + CheckImportError(); + FillPrimitive("d:15.4", 2, 0, 0, primitive_buffers_no_nulls3); + CheckImportError(); + FillPrimitive("t", 3, 0, 0, primitive_buffers_no_nulls7); + CheckImportError(); + FillPrimitive("td", 3, 0, 0, primitive_buffers_no_nulls7); + CheckImportError(); + FillPrimitive("tz", 3, 0, 0, primitive_buffers_no_nulls7); + CheckImportError(); + FillPrimitive("tdd", 3, 0, 0, primitive_buffers_no_nulls7); + CheckImportError(); + FillPrimitive("tdDd", 3, 0, 0, primitive_buffers_no_nulls7); + CheckImportError(); + FillPrimitive("tss", 3, 0, 0, timestamp_buffers_no_nulls1); + CheckImportError(); + FillPrimitive("tss;UTC", 3, 0, 0, timestamp_buffers_no_nulls1); + CheckImportError(); +} + +TEST_F(TestImport, ErrorPrimitive) { + // Bad number of buffers + FillPrimitive("c", 3, 0, 0, primitive_buffers_no_nulls1); + c_struct_.n_buffers = 1; + CheckImportError(); + // Zero null bitmap but non-zero null_count + FillPrimitive("c", 3, 1, 0, primitive_buffers_no_nulls1); + CheckImportError(); +} + +TEST_F(TestImport, ErrorDictionary) { + // Bad index type + FillPrimitive(AddChild(), "c", 3, 0, 0, primitive_buffers_no_nulls4); + FillStringLike("u", 3, 0, 1, string_buffers_no_nulls1); + FillDictionary(); + CheckImportError(); +} + +TEST_F(TestImport, WithField) { + std::shared_ptr field; + std::shared_ptr array; + auto expected_array = ArrayFromJSON(int8(), "[1, 2, 3]"); + + { + FillPrimitive("c", 3, 0, 
0, primitive_buffers_no_nulls1); + auto expected_field = ::arrow::field("", int8()); + + ReleaseCallback cb(&c_struct_); + ASSERT_OK(ImportArray(&c_struct_, &field, &array)); + ASSERT_TRUE(ArrowIsReleased(&c_struct_)); // was moved + ASSERT_OK(array->ValidateFull()); + AssertArraysEqual(*expected_array, *array, true); + AssertFieldEqual(*expected_field, *field); + array.reset(); + cb.AssertCalled(); + } + { + // With nullable = false and metadata + FillPrimitive("c", 3, 0, 0, primitive_buffers_no_nulls1, 0); + c_struct_.name = "ints"; + c_struct_.metadata = kEncodedMetadata1.c_str(); + auto expected_field = ::arrow::field( + "ints", int8(), /*nullable=*/false, + key_value_metadata(kMetadataKeys1, kMetadataValues1)); + + ReleaseCallback cb(&c_struct_); + ASSERT_OK(ImportArray(&c_struct_, &field, &array)); + ASSERT_TRUE(ArrowIsReleased(&c_struct_)); // was moved + ASSERT_OK(array->ValidateFull()); + AssertArraysEqual(*expected_array, *array, true); + AssertFieldEqual(*expected_field, *field); + array.reset(); + cb.AssertCalled(); + } +} + +TEST_F(TestImport, AsRecordBatch) { + std::shared_ptr batch; + auto schema = ::arrow::schema( + {field("strs", utf8(), /*nullable=*/false), field("ints", uint16())}); + auto expected_strs = ArrayFromJSON(utf8(), R"(["", "bar", "quux"])"); + auto expected_ints = ArrayFromJSON(uint16(), "[513, null, 1541]"); + + { + FillStringLike(AddChild(), "u", 3, 0, 1, string_buffers_no_nulls1, 0); + FillPrimitive(AddChild(), "S", 3, -1, 0, primitive_buffers_nulls1); + FillStructLike("+s", 3, 0, 0, {"strs", "ints"}, buffers_no_nulls_no_data); + + ReleaseCallback cb(&c_struct_); + ASSERT_OK_AND_ASSIGN(batch, ImportRecordBatch(&c_struct_)); + ASSERT_TRUE(ArrowIsReleased(&c_struct_)); // was moved + ASSERT_OK(batch->ValidateFull()); + ASSERT_EQ(batch->num_columns(), 2); + AssertSchemaEqual(*schema, *batch->schema()); + AssertArraysEqual(*expected_strs, *batch->column(0), true); + AssertArraysEqual(*expected_ints, *batch->column(1), true); + 
batch.reset(); + cb.AssertCalled(); + } + { + // With schema and field metadata + FillStringLike(AddChild(), "u", 3, 0, 1, string_buffers_no_nulls1, 0); + FillPrimitive(AddChild(), "S", 3, -1, 0, primitive_buffers_nulls1); + FillStructLike("+s", 3, 0, 0, {"strs", "ints"}, buffers_no_nulls_no_data); + c_struct_.metadata = kEncodedMetadata1.c_str(); + c_struct_.children[0]->metadata = kEncodedMetadata2.c_str(); + auto f0 = schema->field(0)->WithMetadata( + key_value_metadata(kMetadataKeys2, kMetadataValues2)); + auto f1 = schema->field(1); + schema = ::arrow::schema( + {f0, f1}, key_value_metadata(kMetadataKeys1, kMetadataValues1)); + + ReleaseCallback cb(&c_struct_); + ASSERT_OK_AND_ASSIGN(batch, ImportRecordBatch(&c_struct_)); + ASSERT_TRUE(ArrowIsReleased(&c_struct_)); // was moved + ASSERT_OK(batch->ValidateFull()); + ASSERT_EQ(batch->num_columns(), 2); + AssertSchemaEqual(*schema, *batch->schema()); + AssertArraysEqual(*expected_strs, *batch->column(0), true); + AssertArraysEqual(*expected_ints, *batch->column(1), true); + batch.reset(); + cb.AssertCalled(); + } +} + +TEST_F(TestImport, AsRecordBatchError) { + // Not a struct + FillStringLike("u", 3, 0, 1, string_buffers_no_nulls1); + CheckImportAsRecordBatchError(); + + // Struct with non-zero parent offset + FillStringLike(AddChild(), "u", 4, 0, 0, string_buffers_no_nulls1); + FillPrimitive(AddChild(), "c", 4, 0, 0, primitive_buffers_no_nulls1); + FillStructLike("+s", 3, 0, 1, {"strs", "ints"}, buffers_no_nulls_no_data); + CheckImportAsRecordBatchError(); + + // Struct with nulls in parent + FillStringLike(AddChild(), "u", 3, 0, 0, string_buffers_no_nulls1, /*flags=*/0); + FillPrimitive(AddChild(), "S", 3, 0, 0, primitive_buffers_no_nulls1); + FillStructLike("+s", 3, -1, 0, {"strs", "ints"}, buffers_nulls_no_data1); + CheckImportAsRecordBatchError(); +} + +//////////////////////////////////////////////////////////////////////////// +// C++ -> C -> C++ roundtripping tests + +class TestRoundtrip : public 
::testing::Test { + public: + using ArrayFactory = std::function*)>; + + void SetUp() override { pool_ = default_memory_pool(); } + + static ArrayFactory JSONArrayFactory(std::shared_ptr type, const char* json) { + return [=](std::shared_ptr* out) -> Status { + return ::arrow::ipc::internal::json::ArrayFromJSON(type, json, out); + }; + } + + static ArrayFactory SlicedArrayFactory(ArrayFactory factory) { + return [=](std::shared_ptr* out) -> Status { + std::shared_ptr arr; + RETURN_NOT_OK(factory(&arr)); + DCHECK_GE(arr->length(), 2); + *out = arr->Slice(1, arr->length() - 2); + return Status::OK(); + }; + } + + template + void TestWithArrayFactory(ArrayFactory&& factory) { + struct ArrowArray c_export; + ExportGuard guard(&c_export); + + auto orig_bytes = pool_->bytes_allocated(); + std::shared_ptr arr; + ASSERT_OK(factory(&arr)); + ASSERT_OK(ExportArray(*arr, &c_export)); + + auto new_bytes = pool_->bytes_allocated(); + if (arr->type_id() != Type::NA) { + ASSERT_GT(new_bytes, orig_bytes); + } + + arr.reset(); + ASSERT_EQ(pool_->bytes_allocated(), new_bytes); + ASSERT_OK_AND_ASSIGN(arr, ImportArray(&c_export)); + ASSERT_OK(arr->ValidateFull()); + ASSERT_TRUE(ArrowIsReleased(&c_export)); + + // Re-export and re-import + ASSERT_OK(ExportArray(*arr, &c_export)); + arr.reset(); + ASSERT_OK_AND_ASSIGN(arr, ImportArray(&c_export)); + ASSERT_OK(arr->ValidateFull()); + ASSERT_TRUE(ArrowIsReleased(&c_export)); + + // Check value of imported array + { + std::shared_ptr expected; + ASSERT_OK(factory(&expected)); + AssertTypeEqual(*expected->type(), *arr->type()); + AssertArraysEqual(*expected, *arr, true); + } + if (arr->type_id() != Type::NA) { + ASSERT_GE(pool_->bytes_allocated(), new_bytes); + } + arr.reset(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + template + void TestWithBatchFactory(BatchFactory&& factory) { + struct ArrowArray c_export; + ExportGuard guard(&c_export); + + auto orig_bytes = pool_->bytes_allocated(); + std::shared_ptr batch; + 
ASSERT_OK(factory(&batch)); + ASSERT_OK(ExportRecordBatch(*batch, &c_export)); + + auto new_bytes = pool_->bytes_allocated(); + batch.reset(); + ASSERT_EQ(pool_->bytes_allocated(), new_bytes); + ASSERT_OK_AND_ASSIGN(batch, ImportRecordBatch(&c_export)); + ASSERT_OK(batch->ValidateFull()); + ASSERT_TRUE(ArrowIsReleased(&c_export)); + + // Re-export and re-import + ASSERT_OK(ExportRecordBatch(*batch, &c_export)); + batch.reset(); + ASSERT_OK_AND_ASSIGN(batch, ImportRecordBatch(&c_export)); + ASSERT_OK(batch->ValidateFull()); + ASSERT_TRUE(ArrowIsReleased(&c_export)); + + // Check value of imported record batch + { + std::shared_ptr expected; + ASSERT_OK(factory(&expected)); + AssertSchemaEqual(*expected->schema(), *batch->schema()); + AssertBatchesEqual(*expected, *batch); + } + ASSERT_GE(pool_->bytes_allocated(), new_bytes); + batch.reset(); + ASSERT_EQ(pool_->bytes_allocated(), orig_bytes); + } + + void TestWithJSON(std::shared_ptr type, const char* json) { + TestWithArrayFactory(JSONArrayFactory(type, json)); + } + + void TestWithJSONSliced(std::shared_ptr type, const char* json) { + TestWithArrayFactory(SlicedArrayFactory(JSONArrayFactory(type, json))); + } + + protected: + MemoryPool* pool_; +}; + +TEST_F(TestRoundtrip, Null) { + TestWithJSON(null(), "[]"); + TestWithJSON(null(), "[null, null]"); + + TestWithJSONSliced(null(), "[null, null]"); + TestWithJSONSliced(null(), "[null, null, null]"); +} + +TEST_F(TestRoundtrip, Primitive) { + TestWithJSON(int32(), "[]"); + TestWithJSON(int32(), "[4, 5, null]"); + + TestWithJSONSliced(int32(), "[4, 5]"); + TestWithJSONSliced(int32(), "[4, 5, 6, null]"); +} + +TEST_F(TestRoundtrip, Nested) { + TestWithJSON(list(int32()), "[]"); + TestWithJSON(list(int32()), "[[4, 5], [6, null], null]"); + + TestWithJSONSliced(list(int32()), "[[4, 5], [6, null], null]"); + + auto type = struct_({field("ints", int16()), field("bools", boolean())}); + TestWithJSON(type, "[]"); + TestWithJSON(type, "[[4, true], [5, false]]"); + 
TestWithJSON(type, "[[4, null], null, [5, false]]"); + + TestWithJSONSliced(type, "[[4, null], null, [5, false]]"); + + // With nullable = false and metadata + auto f0 = field("ints", int16(), /*nullable=*/false); + auto f1 = field("bools", boolean(), /*nullable=*/true, + key_value_metadata(kMetadataKeys1, kMetadataValues1)); + type = struct_({f0, f1}); + TestWithJSON(type, "[]"); + TestWithJSON(type, "[[4, true], [5, null]]"); + + TestWithJSONSliced(type, "[[4, true], [5, null], [6, false]]"); + + // Map type + type = map(utf8(), int32()); + const char* json = R"([[["foo", 123], ["bar", -456]], null, + [["foo", null]], []])"; + TestWithJSON(type, json); + TestWithJSONSliced(type, json); + + type = map(utf8(), int32(), /*keys_sorted=*/ true); + TestWithJSON(type, json); + TestWithJSONSliced(type, json); +} + +TEST_F(TestRoundtrip, Dictionary) { + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + return DictionaryArray::FromArrays(dictionary(indices->type(), values->type()), + indices, values, out); + }; + TestWithArrayFactory(factory); + TestWithArrayFactory(SlicedArrayFactory(factory)); + } + { + auto factory = [](std::shared_ptr* out) -> Status { + auto values = ArrayFromJSON(list(utf8()), R"([["abc", "def"], ["efg"], []])"); + auto indices = ArrayFromJSON(int32(), "[0, 2, 1, null, 1]"); + return DictionaryArray::FromArrays( + dictionary(indices->type(), values->type(), /*ordered=*/true), indices, values, + out); + }; + TestWithArrayFactory(factory); + TestWithArrayFactory(SlicedArrayFactory(factory)); + } +} + +TEST_F(TestRoundtrip, RecordBatch) { + auto schema = ::arrow::schema( + {field("ints", int16()), field("bools", boolean(), /*nullable=*/false)}); + auto arr0 = ArrayFromJSON(int16(), "[1, 2, null]"); + auto arr1 = ArrayFromJSON(boolean(), "[false, true, false]"); + + { + auto factory = [&](std::shared_ptr* out) -> 
Status { + *out = RecordBatch::Make(schema, 3, {arr0, arr1}); + return Status::OK(); + }; + TestWithBatchFactory(factory); + } + { + // With schema and field metadata + auto factory = [&](std::shared_ptr* out) -> Status { + auto f0 = schema->field(0); + auto f1 = schema->field(1); + f1 = f1->WithMetadata(key_value_metadata(kMetadataKeys1, kMetadataValues1)); + auto schema_with_md = ::arrow::schema({f0, f1}, + key_value_metadata(kMetadataKeys2, kMetadataValues2)); + *out = RecordBatch::Make(schema_with_md, 3, {arr0, arr1}); + return Status::OK(); + }; + TestWithBatchFactory(factory); + } +} + +// TODO C -> C++ -> C roundtripping tests? + +} // namespace arrow diff --git a/cpp/src/arrow/c/helpers.h b/cpp/src/arrow/c/helpers.h new file mode 100644 index 00000000000..58b9761e3db --- /dev/null +++ b/cpp/src/arrow/c/helpers.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "arrow/c/abi.h" + +#ifdef __cplusplus +extern "C" { +#endif + +inline int ArrowIsReleased(const struct ArrowArray* array) { + return array->format == NULL; +} + +inline void ArrowMoveArray(struct ArrowArray* src, struct ArrowArray* dest) { + assert(dest != src); + assert(!ArrowIsReleased(src)); + memcpy(dest, src, sizeof(struct ArrowArray)); + src->format = NULL; + src->release = NULL; +} + +inline void ArrowReleaseArray(struct ArrowArray* array) { + if (array->format != NULL) { + if (array->release != NULL) { + array->release(array); + assert(ArrowIsReleased(array)); + } else { + array->format = NULL; + } + } +} + +#ifdef __cplusplus +} +#endif diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index f0c41ac05e0..9656372c618 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -926,7 +926,8 @@ Status PrintDiff(const Array& left, const Array& right, std::ostream* os) { } if (!left.type()->Equals(right.type())) { - *os << "# Array types differed: " << *left.type() << " vs " << *right.type(); + *os << "# Array types differed: " << *left.type() << " vs " << *right.type() + << std::endl; return Status::OK(); } diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index 16f34575523..06320b844d2 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -39,6 +39,7 @@ namespace internal { namespace json { using ::arrow::internal::checked_cast; +using ::arrow::internal::checked_pointer_cast; static constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag; @@ -155,6 +156,7 @@ class BooleanConverter final : public ConcreteConverter { // Convert single signed integer value (also {Date,Time}{32,64} and Timestamp) template enable_if_physical_signed_integer ConvertNumber(const rj::Value& json_obj, + const DataType& type, typename T::c_type* out) { if (json_obj.IsInt64()) { int64_t v64 = json_obj.GetInt64(); @@ 
-162,8 +164,7 @@ enable_if_physical_signed_integer ConvertNumber(const rj::Value& json if (*out == v64) { return Status::OK(); } else { - return Status::Invalid("Value ", v64, " out of bounds for ", - TypeTraits::type_singleton()); + return Status::Invalid("Value ", v64, " out of bounds for ", type); } } else { *out = static_cast(0); @@ -174,6 +175,7 @@ enable_if_physical_signed_integer ConvertNumber(const rj::Value& json // Convert single unsigned integer value template enable_if_physical_unsigned_integer ConvertNumber(const rj::Value& json_obj, + const DataType& type, typename T::c_type* out) { if (json_obj.IsUint64()) { uint64_t v64 = json_obj.GetUint64(); @@ -181,8 +183,7 @@ enable_if_physical_unsigned_integer ConvertNumber(const rj::Value& js if (*out == v64) { return Status::OK(); } else { - return Status::Invalid("Value ", v64, " out of bounds for ", - TypeTraits::type_singleton()); + return Status::Invalid("Value ", v64, " out of bounds for ", type); } } else { *out = static_cast(0); @@ -193,6 +194,7 @@ enable_if_physical_unsigned_integer ConvertNumber(const rj::Value& js // Convert single floating point value template enable_if_physical_floating_point ConvertNumber(const rj::Value& json_obj, + const DataType& type, typename T::c_type* out) { if (json_obj.IsNumber()) { *out = static_cast(json_obj.GetDouble()); @@ -212,9 +214,13 @@ class IntegerConverter final : public ConcreteConverter> static constexpr auto is_signed = std::is_signed::value; public: - explicit IntegerConverter(const std::shared_ptr& type) { - this->type_ = type; - builder_ = std::make_shared>(); + explicit IntegerConverter(const std::shared_ptr& type) { this->type_ = type; } + + Status Init() override { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), this->type_, &builder)); + builder_ = checked_pointer_cast>(std::move(builder)); + return Status::OK(); } Status AppendNull() override { return builder_->AppendNull(); } @@ -224,7 +230,7 @@ class IntegerConverter 
final : public ConcreteConverter> return AppendNull(); } c_type value; - RETURN_NOT_OK(ConvertNumber(json_obj, &value)); + RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); return builder_->Append(value); } @@ -254,7 +260,7 @@ class FloatConverter final : public ConcreteConverter> { return AppendNull(); } c_type value; - RETURN_NOT_OK(ConvertNumber(json_obj, &value)); + RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); return builder_->Append(value); } @@ -321,7 +327,7 @@ class TimestampConverter final : public ConcreteConverter { } int64_t value; if (json_obj.IsNumber()) { - RETURN_NOT_OK(ConvertNumber(json_obj, &value)); + RETURN_NOT_OK(ConvertNumber(json_obj, *this->type_, &value)); } else if (json_obj.IsString()) { auto view = util::string_view(json_obj.GetString(), json_obj.GetStringLength()); if (!from_string_(view.data(), view.size(), &value)) { @@ -340,6 +346,43 @@ class TimestampConverter final : public ConcreteConverter { std::shared_ptr builder_; }; +// ------------------------------------------------------------------------ +// Converter for day-time interval arrays + +class DayTimeIntervalConverter final + : public ConcreteConverter { + public: + explicit DayTimeIntervalConverter(const std::shared_ptr& type) { + this->type_ = type; + builder_ = std::make_shared(default_memory_pool()); + } + + Status AppendNull() override { return builder_->AppendNull(); } + + Status AppendValue(const rj::Value& json_obj) override { + if (json_obj.IsNull()) { + return AppendNull(); + } + DayTimeIntervalType::DayMilliseconds value; + if (!json_obj.IsArray()) { + return JSONTypeError("array", json_obj.GetType()); + } + if (json_obj.Size() != 2) { + return Status::Invalid( + "day time interval pair must have exactly two elements, had ", json_obj.Size()); + } + RETURN_NOT_OK(ConvertNumber(json_obj[0], *this->type_, &value.days)); + RETURN_NOT_OK( + ConvertNumber(json_obj[1], *this->type_, &value.milliseconds)); + return builder_->Append(value); + } + 
+ std::shared_ptr builder() override { return builder_; } + + private: + std::shared_ptr builder_; +}; + // ------------------------------------------------------------------------ // Converter for binary and string arrays @@ -713,6 +756,11 @@ Status GetConverter(const std::shared_ptr& type, std::shared_ptr* out) { std::shared_ptr res; + auto not_implemented = [&]() -> Status { + return Status::NotImplemented("JSON conversion to ", type->ToString(), + " not implemented"); + }; + #define SIMPLE_CONVERTER_CASE(ID, CLASS) \ case ID: \ res = std::make_shared(type); \ @@ -722,16 +770,17 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::INT8, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT16, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT32, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::INT64, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter) - SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter) - SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT8, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT16, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT32, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::UINT64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIMESTAMP, TimestampConverter) + SIMPLE_CONVERTER_CASE(Type::DATE32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::DATE64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIME32, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::TIME64, IntegerConverter) + SIMPLE_CONVERTER_CASE(Type::DURATION, IntegerConverter) SIMPLE_CONVERTER_CASE(Type::NA, NullConverter) SIMPLE_CONVERTER_CASE(Type::BOOL, BooleanConverter) SIMPLE_CONVERTER_CASE(Type::HALF_FLOAT, IntegerConverter) @@ -749,10 +798,21 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter) 
SIMPLE_CONVERTER_CASE(Type::DECIMAL, DecimalConverter) SIMPLE_CONVERTER_CASE(Type::UNION, UnionConverter) - default: { - return Status::NotImplemented("JSON conversion to ", type->ToString(), - " not implemented"); + case Type::INTERVAL: { + switch (checked_cast(*type).interval_type()) { + case IntervalType::MONTHS: + res = std::make_shared>(type); + break; + case IntervalType::DAY_TIME: + res = std::make_shared(type); + break; + default: + return not_implemented(); + } + break; } + default: + return not_implemented(); } #undef SIMPLE_CONVERTER_CASE diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index b83c87d8c56..af75e1ce644 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -352,6 +352,59 @@ TEST(TestTimestamp, Basics) { {0, 951782400000000000LL, -2203977600000000000LL}); } +TEST(TestDate, Basics) { + auto type = date32(); + AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); + type = date64(); + AssertJSONArray(type, R"([1, null, 9999999999999])", {true, false, true}, + {1, 0, 9999999999999LL}); +} + +TEST(TestTime, Basics) { + auto type = time32(TimeUnit::SECOND); + AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); + type = time32(TimeUnit::MILLI); + AssertJSONArray(type, R"([5, null, 42])", {true, false, true}, {5, 0, 42}); + + type = time64(TimeUnit::MICRO); + AssertJSONArray(type, R"([1, null, 9999999999999])", {true, false, true}, + {1, 0, 9999999999999LL}); + type = time64(TimeUnit::NANO); + AssertJSONArray(type, R"([1, null, 9999999999999])", {true, false, true}, + {1, 0, 9999999999999LL}); +} + +TEST(TestDuration, Basics) { + auto type = duration(TimeUnit::SECOND); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); + type = duration(TimeUnit::MILLI); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, 
true}, + {0, -7777777777777LL, 9999999999999LL}); + type = duration(TimeUnit::MICRO); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); + type = duration(TimeUnit::NANO); + AssertJSONArray(type, R"([null, -7777777777777, 9999999999999])", + {false, true, true}, + {0, -7777777777777LL, 9999999999999LL}); +} + +TEST(TestMonthInterval, Basics) { + auto type = month_interval(); + AssertJSONArray(type, R"([123, -456, null])", {true, true, false}, + {123, -456, 0}); +} + +TEST(TestDayTimeInterval, Basics) { + auto type = day_time_interval(); + AssertJSONArray(type, R"([[1, -600], null])", {true, false}, + {{1, -600}, {}}); +} + TEST(TestString, Errors) { std::shared_ptr type = utf8(); std::shared_ptr array; diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index 80e775e2234..ea71cb20f20 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -202,6 +202,7 @@ std::shared_ptr RecordBatch::Make( Status RecordBatch::FromStructArray(const std::shared_ptr& array, std::shared_ptr* out) { + // TODO fail if null_count != 0? 
if (array->type_id() != Type::STRUCT) { return Status::Invalid("Cannot construct record batch from array of type ", *array->type()); @@ -211,6 +212,19 @@ Status RecordBatch::FromStructArray(const std::shared_ptr& array, return Status::OK(); } +Status RecordBatch::ToStructArray(std::shared_ptr* out) const { + ARROW_ASSIGN_OR_RAISE(*out, StructArray::Make(columns(), schema()->fields())); + return Status::OK(); +} + +std::vector> RecordBatch::columns() const { + std::vector> children(num_columns()); + for (int i = 0; i < num_columns(); ++i) { + children[i] = column(i); + } + return children; +} + const std::string& RecordBatch::column_name(int i) const { return schema_->field(i)->name(); } diff --git a/cpp/src/arrow/record_batch.h b/cpp/src/arrow/record_batch.h index 92943640e17..8ae5cc18d94 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -71,6 +71,18 @@ class ARROW_EXPORT RecordBatch { const std::shared_ptr& schema, int64_t num_rows, const std::vector>& columns); + /// \brief Convert record batch to struct array + /// + /// Create a struct array whose child arrays are the record batch's columns. + /// Note that the record batch's top-level field metadata cannot be reflected + /// in the resulting struct array. + Status ToStructArray(std::shared_ptr* out) const; + + /// \brief Construct record batch from struct array + /// + /// This constructs a record batch using the child arrays of the given + /// array, which must be a struct array. Note that the struct array's own + /// null bitmap is not reflected in the resulting record batch. 
static Status FromStructArray(const std::shared_ptr& array, std::shared_ptr* out); @@ -85,6 +97,9 @@ class ARROW_EXPORT RecordBatch { /// \return true if batches are equal std::shared_ptr schema() const { return schema_; } + /// \brief Retrieve all columns at once + std::vector> columns() const; + /// \brief Retrieve an array from the record batch /// \param[in] i field index, does not boundscheck /// \return an Array object diff --git a/cpp/src/arrow/stl.h b/cpp/src/arrow/stl.h index eeb31b421f4..c46463f5e6d 100644 --- a/cpp/src/arrow/stl.h +++ b/cpp/src/arrow/stl.h @@ -88,7 +88,8 @@ using CBuilderType = /// contigiuous ranges while appending. This default implementation will call /// ConversionTraits::AppendRow() for each value in the range. template -Status AppendListValues(CBuilderType& value_builder, Range&& cell_range) { +inline Status AppendListValues(CBuilderType& value_builder, + Range&& cell_range) { for (auto const& value : cell_range) { ARROW_RETURN_NOT_OK(ConversionTraits::AppendRow(value_builder, value)); } @@ -109,7 +110,7 @@ Status AppendListValues(CBuilderType& value_builder, Range&& cell_ra }; \ \ template <> \ - Status AppendListValues&>( \ + inline Status AppendListValues&>( \ typename TypeTraits::BuilderType & value_builder, \ const std::vector& cell_range) { \ return value_builder.AppendValues(cell_range); \ @@ -483,7 +484,7 @@ class allocator { explicit allocator(MemoryPool* pool) noexcept : pool_(pool) {} template - allocator(const allocator& rhs) noexcept : pool_(rhs.pool_) {} + allocator(const allocator& rhs) noexcept : pool_(rhs.pool()) {} ~allocator() { pool_ = NULLPTR; } diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index b0a7dbca42e..3617595d7bd 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -139,6 +139,24 @@ void AssertSchemaEqual(const Schema& lhs, const Schema& rhs) { } } +void AssertFieldEqual(const Field& lhs, const Field& rhs) { + if 
(!lhs.Equals(rhs)) { + std::stringstream ss; + ss << "left field: " << lhs.ToString() << std::endl + << "right field: " << rhs.ToString() << std::endl; + FAIL() << ss.str(); + } +} + +void AssertTypeEqual(const DataType& lhs, const DataType& rhs) { + if (!lhs.Equals(rhs)) { + std::stringstream ss; + ss << "left type: " << lhs.ToString() << std::endl + << "right right: " << rhs.ToString() << std::endl; + FAIL() << ss.str(); + } +} + void AssertDatumsEqual(const Datum& expected, const Datum& actual) { // TODO: Implements better print. ASSERT_TRUE(actual.Equals(expected)); diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 091b1dd0d27..09600586ab5 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -182,11 +182,14 @@ ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, const std::vector& expected); ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, const std::string& expected); ARROW_EXPORT void AssertBufferEqual(const Buffer& buffer, const Buffer& expected); -ARROW_EXPORT void AssertSchemaEqual(const Schema& lhs, const Schema& rhs); - ARROW_EXPORT void AssertTablesEqual(const Table& expected, const Table& actual, bool same_chunk_layout = true, bool flatten = false); +// These three all compare metadata +ARROW_EXPORT void AssertSchemaEqual(const Schema& lhs, const Schema& rhs); +ARROW_EXPORT void AssertFieldEqual(const Field& lhs, const Field& rhs); +ARROW_EXPORT void AssertTypeEqual(const DataType& lhs, const DataType& rhs); + ARROW_EXPORT void AssertDatumsEqual(const Datum& expected, const Datum& actual); template diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index ada2d7b5b96..9a38dcf0dbd 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1138,7 +1138,7 @@ class ARROW_EXPORT TimestampType : public TemporalType, public ParametricType { std::string timezone_; }; -// Base class for the different kinds of intervals. 
+// Base class for the different kinds of calendar intervals. class ARROW_EXPORT IntervalType : public TemporalType, public ParametricType { public: enum type { MONTHS, DAY_TIME }; @@ -1150,10 +1150,10 @@ class ARROW_EXPORT IntervalType : public TemporalType, public ParametricType { std::string ComputeFingerprint() const override; }; -/// \brief Represents a some number of months. +/// \brief Represents a number of months. /// -/// Type representing a number of months. Corresponeds to YearMonth type -/// in Schema.fbs (Years are defined as 12 months). +/// Type representing a number of months. Corresponds to YearMonth type +/// in Schema.fbs (years are defined as 12 months). class ARROW_EXPORT MonthIntervalType : public IntervalType { public: static constexpr Type::type type_id = Type::INTERVAL; @@ -1202,8 +1202,7 @@ class ARROW_EXPORT DayTimeIntervalType : public IntervalType { std::string name() const override { return "day_time_interval"; } }; -// \brief Represents an amount of elapsed time without any relation to a calendar -// artifact. +/// \brief Represents an elapsed time without any relation to a calendar artifact. 
class ARROW_EXPORT DurationType : public TemporalType, public ParametricType { public: using Unit = TimeUnit; diff --git a/cpp/src/arrow/util/key_value_metadata.cc b/cpp/src/arrow/util/key_value_metadata.cc index ca597f1c21c..e1bce496667 100644 --- a/cpp/src/arrow/util/key_value_metadata.cc +++ b/cpp/src/arrow/util/key_value_metadata.cc @@ -61,9 +61,9 @@ KeyValueMetadata::KeyValueMetadata( ARROW_CHECK_EQ(keys_.size(), values_.size()); } -KeyValueMetadata::KeyValueMetadata(const std::vector& keys, - const std::vector& values) - : keys_(keys), values_(values) { +KeyValueMetadata::KeyValueMetadata(std::vector keys, + std::vector values) + : keys_(std::move(keys)), values_(std::move(values)) { ARROW_CHECK_EQ(keys.size(), values.size()); } @@ -164,9 +164,9 @@ std::shared_ptr key_value_metadata( return std::make_shared(pairs); } -std::shared_ptr key_value_metadata( - const std::vector& keys, const std::vector& values) { - return std::make_shared(keys, values); +std::shared_ptr key_value_metadata(std::vector keys, + std::vector values) { + return std::make_shared(std::move(keys), std::move(values)); } } // namespace arrow diff --git a/cpp/src/arrow/util/key_value_metadata.h b/cpp/src/arrow/util/key_value_metadata.h index d84e060822d..7152624ce5c 100644 --- a/cpp/src/arrow/util/key_value_metadata.h +++ b/cpp/src/arrow/util/key_value_metadata.h @@ -34,8 +34,7 @@ namespace arrow { class ARROW_EXPORT KeyValueMetadata { public: KeyValueMetadata(); - KeyValueMetadata(const std::vector& keys, - const std::vector& values); + KeyValueMetadata(std::vector keys, std::vector values); explicit KeyValueMetadata(const std::unordered_map& map); virtual ~KeyValueMetadata() = default; @@ -75,8 +74,8 @@ key_value_metadata(const std::unordered_map& pairs); /// /// \param keys sequence of metadata keys /// \param values sequence of corresponding metadata values -std::shared_ptr ARROW_EXPORT key_value_metadata( - const std::vector& keys, const std::vector& values); +std::shared_ptr 
ARROW_EXPORT +key_value_metadata(std::vector keys, std::vector values); } // namespace arrow diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 85e0c3a8ad3..8243fa5e155 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -1192,8 +1192,8 @@ TEST_F(TestNullParquetIO, NullDictionaryColumn) { ASSERT_OK(::arrow::AllocateEmptyBitmap(::arrow::default_memory_pool(), SMALL_SIZE, &null_bitmap)); - std::shared_ptr indices = - std::make_shared<::arrow::Int8Array>(SMALL_SIZE, nullptr, null_bitmap, SMALL_SIZE); + std::shared_ptr indices; + ASSERT_OK(MakeArrayOfNull(::arrow::int8(), SMALL_SIZE, &indices)); std::shared_ptr<::arrow::DictionaryType> dict_type = std::make_shared<::arrow::DictionaryType>(::arrow::int8(), ::arrow::null()); diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 7821b8f3382..6d37d053e4d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1116,6 +1116,27 @@ cdef class Array(_PandasConvertible): _append_array_buffers(self.sp_array.get().data().get(), res) return res + def _export_to_c(self, uintptr_t out_ptr): + """ + Export to a C ArrowArray struct, given its pointer. + + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + check_status(ExportArray(deref(self.sp_array), + out_ptr)) + + @staticmethod + def _import_from_c(uintptr_t in_ptr): + """ + Import from a C ArrowArray struct, given its pointer. + + This is a low-level function intended for expert users. 
+ """ + result = GetResultValue(ImportArray( in_ptr)) + return pyarrow_wrap_array(result) + cdef wrap_array_output(PyObject* output): cdef object obj = PyObject_to_object(output) diff --git a/python/pyarrow/cffi.py b/python/pyarrow/cffi.py new file mode 100644 index 00000000000..4d6198c9f9d --- /dev/null +++ b/python/pyarrow/cffi.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +from __future__ import absolute_import + +import cffi + +c_source = """ + struct ArrowArray { + // Type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + + // Data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; + }; + """ + +# TODO use out-of-line mode for faster import and avoid C parsing +ffi = cffi.FFI() +ffi.cdef(c_source) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 4f2248782c9..99172030f24 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1726,6 +1726,17 @@ cdef extern from 'arrow/array/concatenate.h' namespace 'arrow' nogil: CStatus Concatenate(const vector[shared_ptr[CArray]]& arrays, CMemoryPool* pool, shared_ptr[CArray]* result) +cdef extern from 'arrow/c/abi.h': + cdef struct ArrowArray: + pass + +cdef extern from 'arrow/c/bridge.h' namespace 'arrow' nogil: + CStatus ExportArray(CArray& array, ArrowArray* out) + CResult[shared_ptr[CArray]] ImportArray(ArrowArray* array) + + CStatus ExportRecordBatch(CRecordBatch& batch, ArrowArray* out) + CResult[shared_ptr[CRecordBatch]] ImportRecordBatch(ArrowArray* array) + cdef extern from "" namespace "std": # Work around https://github.com/cython/cython/issues/2169 unique_ptr[CCodec] move(unique_ptr[CCodec]) nogil diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index e58729624c9..587f805bc92 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -863,6 +863,27 @@ cdef class RecordBatch(_PandasConvertible): &c_record_batch)) return pyarrow_wrap_batch(c_record_batch) + def _export_to_c(self, uintptr_t out_ptr): + """ + Export to a C ArrowArray struct, given its pointer. 
+ + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + check_status(ExportRecordBatch(deref(self.batch), + out_ptr)) + + @staticmethod + def _import_from_c(uintptr_t in_ptr): + """ + Import from a C ArrowArray struct, given its pointer. + + This is a low-level function intended for expert users. + """ + result = GetResultValue(ImportRecordBatch( in_ptr)) + return pyarrow_wrap_batch(result) + def _reconstruct_record_batch(columns, schema): """ diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 7476f4f1a52..059d3b287c9 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -319,10 +319,10 @@ def test_array_diff(): +"bar" +null ''' - assert arr1.diff(arr3) == '# Array types differed: string vs int64' - assert arr1.diff(arr3) == '# Array types differed: string vs int64' - assert arr1.diff(arr4) == ('# Array types differed: string vs ' - 'list') + assert arr1.diff(arr3).strip() == '# Array types differed: string vs int64' + assert arr1.diff(arr3).strip() == '# Array types differed: string vs int64' + assert arr1.diff(arr4).strip() == ('# Array types differed: string vs ' + 'list') def test_array_iter(): diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py new file mode 100644 index 00000000000..6fe44ad6543 --- /dev/null +++ b/python/pyarrow/tests/test_cffi.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow as pa +try: + from pyarrow.cffi import ffi +except ImportError: + ffi = None + +import pytest + +needs_cffi = pytest.mark.skipif(ffi is None, + reason="test needs cffi package installed") + + +@needs_cffi +def test_export_import_array(): + c = ffi.new("struct ArrowArray*") + ptr = int(ffi.cast("uintptr_t", c)) + + old_allocated = pa.total_allocated_bytes() + + arr = pa.array([1, 2, 42]) + py_value = arr.to_pylist() + arr._export_to_c(ptr) + new_allocated = pa.total_allocated_bytes() + assert new_allocated > old_allocated + # Delete C++ object and recreate new C++ object from exported pointer + del arr + arr_new = pa.Array._import_from_c(ptr) + assert arr_new.to_pylist() == py_value + assert pa.total_allocated_bytes() == new_allocated + del arr_new + assert pa.total_allocated_bytes() == old_allocated + + +@needs_cffi +def test_export_import_batch(): + c = ffi.new("struct ArrowArray*") + ptr = int(ffi.cast("uintptr_t", c)) + + old_allocated = pa.total_allocated_bytes() + + batch = pa.record_batch( + [pa.array([1, 2, None]), pa.array([True, False, False])], + names=['ints', 'bools']) + py_value = batch.to_pydict() + batch._export_to_c(ptr) + new_allocated = pa.total_allocated_bytes() + assert new_allocated > old_allocated + # Delete C++ object and recreate new C++ object from exported pointer + del batch + batch_new = pa.RecordBatch._import_from_c(ptr) + assert batch_new.to_pydict() == py_value + assert pa.total_allocated_bytes() == new_allocated + del batch_new + assert pa.total_allocated_bytes() == old_allocated diff --git 
a/python/requirements-test.txt b/python/requirements-test.txt index 10d445cbc44..fce3bb53a65 100644 --- a/python/requirements-test.txt +++ b/python/requirements-test.txt @@ -1,3 +1,4 @@ +cffi cython hypothesis pandas diff --git a/r/DESCRIPTION b/r/DESCRIPTION index d027017e2ad..398c39fd39a 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -44,6 +44,7 @@ Suggests: hms, knitr, lubridate, + reticulate, rmarkdown, testthat, tibble @@ -75,6 +76,7 @@ Collate: 'memory-pool.R' 'message.R' 'parquet.R' + 'py-to-r.R' 'read-record-batch.R' 'read-table.R' 'record-batch-reader.R' diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 26f123368c3..15bff81d933 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -39,6 +39,10 @@ s3_register(m, cl) } } + s3_register("reticulate::py_to_r", "pyarrow.lib.Array") + s3_register("reticulate::py_to_r", "pyarrow.lib.RecordBatch") + s3_register("reticulate::r_to_py", "Array") + s3_register("reticulate::r_to_py", "RecordBatch") invisible() } diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index b587c659e82..2cb298ad8a8 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1172,6 +1172,30 @@ parquet___arrow___FileReader__GetSchema <- function(reader){ .Call(`_arrow_parquet___arrow___FileReader__GetSchema` , reader) } +ImportArray <- function(array){ + .Call(`_arrow_ImportArray` , array) +} + +ImportRecordBatch <- function(array){ + .Call(`_arrow_ImportRecordBatch` , array) +} + +allocate_arrow_array <- function(){ + .Call(`_arrow_allocate_arrow_array` ) +} + +delete_arrow_array <- function(ptr){ + invisible(.Call(`_arrow_delete_arrow_array` , ptr)) +} + +ExportArray <- function(array, ptr){ + invisible(.Call(`_arrow_ExportArray` , array, ptr)) +} + +ExportRecordBatch <- function(array, ptr){ + invisible(.Call(`_arrow_ExportRecordBatch` , array, ptr)) +} + RecordBatch__num_columns <- function(x){ .Call(`_arrow_RecordBatch__num_columns` , x) } diff --git a/r/R/py-to-r.R b/r/R/py-to-r.R new file mode 100644 index 
00000000000..bab2c1c1b95 --- /dev/null +++ b/r/R/py-to-r.R @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +py_to_r.pyarrow.lib.Array <- function(x, ...) { + ptr <- allocate_arrow_array() + on.exit(delete_arrow_array(ptr)) + x$`_export_to_c`(ptr) + Array$create(ImportArray(ptr)) +} + +r_to_py.Array <- function(x, convert = FALSE) { + ptr <- allocate_arrow_array() + on.exit(delete_arrow_array(ptr)) + ExportArray(x, ptr) + pa <- reticulate::import("pyarrow", convert = convert) + pa$Array$`_import_from_c`(ptr) +} + +py_to_r.pyarrow.lib.RecordBatch <- function(x, ...) 
{ + ptr <- allocate_arrow_array() + on.exit(delete_arrow_array(ptr)) + x$`_export_to_c`(ptr) + shared_ptr(RecordBatch, ImportRecordBatch(ptr)) +} + +r_to_py.RecordBatch <- function(x, convert = FALSE) { + ptr <- allocate_arrow_array() + on.exit(delete_arrow_array(ptr)) + ExportRecordBatch(x, ptr) + pa <- reticulate::import("pyarrow", convert = convert) + pa$RecordBatch$`_import_from_c`(ptr) +} diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index b18e772fb2b..3b301590be4 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4581,6 +4581,100 @@ RcppExport SEXP _arrow_parquet___arrow___FileReader__GetSchema(SEXP reader_sexp) } #endif +// py-to-r.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ImportArray(uintptr_t array); +RcppExport SEXP _arrow_ImportArray(SEXP array_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter::type array(array_sexp); + return Rcpp::wrap(ImportArray(array)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ImportArray(SEXP array_sexp){ + Rf_error("Cannot call ImportArray(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// py-to-r.cpp +#if defined(ARROW_R_WITH_ARROW) +std::shared_ptr ImportRecordBatch(uintptr_t array); +RcppExport SEXP _arrow_ImportRecordBatch(SEXP array_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter::type array(array_sexp); + return Rcpp::wrap(ImportRecordBatch(array)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ImportRecordBatch(SEXP array_sexp){ + Rf_error("Cannot call ImportRecordBatch(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// py-to-r.cpp +#if defined(ARROW_R_WITH_ARROW) +uintptr_t allocate_arrow_array(); +RcppExport SEXP _arrow_allocate_arrow_array(){ +BEGIN_RCPP + return Rcpp::wrap(allocate_arrow_array()); +END_RCPP +} +#else +RcppExport SEXP _arrow_allocate_arrow_array(){ + Rf_error("Cannot call allocate_arrow_array(). 
Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// py-to-r.cpp +#if defined(ARROW_R_WITH_ARROW) +void delete_arrow_array(uintptr_t ptr); +RcppExport SEXP _arrow_delete_arrow_array(SEXP ptr_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter::type ptr(ptr_sexp); + delete_arrow_array(ptr); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_delete_arrow_array(SEXP ptr_sexp){ + Rf_error("Cannot call delete_arrow_array(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// py-to-r.cpp +#if defined(ARROW_R_WITH_ARROW) +void ExportArray(const std::shared_ptr& array, uintptr_t ptr); +RcppExport SEXP _arrow_ExportArray(SEXP array_sexp, SEXP ptr_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + Rcpp::traits::input_parameter::type ptr(ptr_sexp); + ExportArray(array, ptr); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_ExportArray(SEXP array_sexp, SEXP ptr_sexp){ + Rf_error("Cannot call ExportArray(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + +// py-to-r.cpp +#if defined(ARROW_R_WITH_ARROW) +void ExportRecordBatch(const std::shared_ptr& array, uintptr_t ptr); +RcppExport SEXP _arrow_ExportRecordBatch(SEXP array_sexp, SEXP ptr_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type array(array_sexp); + Rcpp::traits::input_parameter::type ptr(ptr_sexp); + ExportRecordBatch(array, ptr); + return R_NilValue; +END_RCPP +} +#else +RcppExport SEXP _arrow_ExportRecordBatch(SEXP array_sexp, SEXP ptr_sexp){ + Rf_error("Cannot call ExportRecordBatch(). Please use arrow::install_arrow() to install required runtime libraries. 
"); +} +#endif + // recordbatch.cpp #if defined(ARROW_R_WITH_ARROW) int RecordBatch__num_columns(const std::shared_ptr& x); @@ -5760,6 +5854,12 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_parquet___arrow___FileWriter__Close", (DL_FUNC) &_arrow_parquet___arrow___FileWriter__Close, 1}, { "_arrow_parquet___arrow___WriteTable", (DL_FUNC) &_arrow_parquet___arrow___WriteTable, 4}, { "_arrow_parquet___arrow___FileReader__GetSchema", (DL_FUNC) &_arrow_parquet___arrow___FileReader__GetSchema, 1}, + { "_arrow_ImportArray", (DL_FUNC) &_arrow_ImportArray, 1}, + { "_arrow_ImportRecordBatch", (DL_FUNC) &_arrow_ImportRecordBatch, 1}, + { "_arrow_allocate_arrow_array", (DL_FUNC) &_arrow_allocate_arrow_array, 0}, + { "_arrow_delete_arrow_array", (DL_FUNC) &_arrow_delete_arrow_array, 1}, + { "_arrow_ExportArray", (DL_FUNC) &_arrow_ExportArray, 2}, + { "_arrow_ExportRecordBatch", (DL_FUNC) &_arrow_ExportRecordBatch, 2}, { "_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, { "_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, { "_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index e40cb6eaf07..4359c528c79 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -196,6 +196,7 @@ inline std::shared_ptr extract(SEXP x) { #if defined(ARROW_R_WITH_ARROW) #include +#include #include #include #include diff --git a/r/src/py-to-r.cpp b/r/src/py-to-r.cpp new file mode 100644 index 00000000000..13d842b81bb --- /dev/null +++ b/r/src/py-to-r.cpp @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "./arrow_types.h" + +#if defined(ARROW_R_WITH_ARROW) + +// [[arrow::export]] +std::shared_ptr ImportArray(uintptr_t array) { + return VALUE_OR_STOP(arrow::ImportArray(reinterpret_cast(array))); +} + +// [[arrow::export]] +std::shared_ptr ImportRecordBatch(uintptr_t array) { + return VALUE_OR_STOP( + arrow::ImportRecordBatch(reinterpret_cast(array))); +} + +// [[arrow::export]] +uintptr_t allocate_arrow_array() { return reinterpret_cast(new ArrowArray); } + +// [[arrow::export]] +void delete_arrow_array(uintptr_t ptr) { + delete reinterpret_cast(ptr); +} + +// [[arrow::export]] +void ExportArray(const std::shared_ptr& array, uintptr_t ptr) { + STOP_IF_NOT_OK(arrow::ExportArray(*array, reinterpret_cast(ptr))); +} + +// [[arrow::export]] +void ExportRecordBatch(const std::shared_ptr& array, uintptr_t ptr) { + STOP_IF_NOT_OK( + arrow::ExportRecordBatch(*array, reinterpret_cast(ptr))); +} + +#endif diff --git a/r/tests/testthat/helper-skip.R b/r/tests/testthat/helper-skip.R index b87c5d6ef45..d6489fb0093 100644 --- a/r/tests/testthat/helper-skip.R +++ b/r/tests/testthat/helper-skip.R @@ -21,3 +21,10 @@ skip_if_not_available <- function(feature) { skip(paste("Arrow C++ not built with support for", feature)) } } + +skip_if_no_pyarrow <- function() { + skip_if_not_installed("reticulate") + if (!reticulate::py_module_available("pyarrow")) { + skip("pyarrow not available for testing") + } +} diff --git a/r/tests/testthat/test-python.R b/r/tests/testthat/test-python.R new file mode 100644 index 00000000000..d78ce2afc5b --- /dev/null +++ 
b/r/tests/testthat/test-python.R @@ -0,0 +1,43 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +context("To/from Python") + +test_that("Array from Python", { + skip_if_no_pyarrow() + pa <- reticulate::import("pyarrow") + py <- pa$array(c(1, 2, 3)) + expect_equal(py, Array$create(c(1, 2, 3))) +}) + +test_that("Array to Python", { + skip_if_no_pyarrow() + pa <- reticulate::import("pyarrow", convert=FALSE) + r <- Array$create(c(1, 2, 3)) + py <- pa$concat_arrays(list(r)) + expect_is(py, "pyarrow.lib.Array") + expect_equal(reticulate::py_to_r(py), r) +}) + +test_that("RecordBatch to/from Python", { + skip_if_no_pyarrow() + pa <- reticulate::import("pyarrow", convert=FALSE) + batch <- record_batch(col1=c(1, 2, 3), col2=letters[1:3]) + py <- reticulate::r_to_py(batch) + expect_is(py, "pyarrow.lib.RecordBatch") + expect_equal(reticulate::py_to_r(py), batch) +})