From be2b28b553e5a9b70be99909a5383cb16d65a9dc Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 25 Aug 2023 21:17:01 +0200 Subject: [PATCH 01/22] Add UUID canonical extension type --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/acero/hash_join_node_test.cc | 2 + cpp/src/arrow/acero/util_test.cc | 3 ++ cpp/src/arrow/c/bridge_test.cc | 23 ++++---- cpp/src/arrow/extension/CMakeLists.txt | 2 +- .../extension/fixed_shape_tensor_test.cc | 15 ------ cpp/src/arrow/extension/uuid_array.cc | 51 ++++++++++++++++++ cpp/src/arrow/extension/uuid_array.h | 51 ++++++++++++++++++ cpp/src/arrow/extension/uuid_array_test.cc | 53 +++++++++++++++++++ cpp/src/arrow/extension_type.cc | 3 +- cpp/src/arrow/extension_type_test.cc | 28 ++-------- .../test_integration_client.cc | 1 - .../integration/json_integration_test.cc | 35 ++++++------ cpp/src/arrow/ipc/read_write_test.cc | 4 +- cpp/src/arrow/ipc/test_common.cc | 3 ++ cpp/src/arrow/scalar_test.cc | 4 +- cpp/src/arrow/testing/extension_type.h | 25 --------- cpp/src/arrow/testing/gtest_util.cc | 45 +++++++--------- cpp/src/arrow/testing/gtest_util.h | 3 ++ 19 files changed, 231 insertions(+), 121 deletions(-) create mode 100644 cpp/src/arrow/extension/uuid_array.cc create mode 100644 cpp/src/arrow/extension/uuid_array.h create mode 100644 cpp/src/arrow/extension/uuid_array_test.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 89f28ee416e..55ee8d9906a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -909,6 +909,7 @@ if(ARROW_JSON) arrow_add_object_library(ARROW_JSON extension/fixed_shape_tensor.cc extension/opaque.cc + extension/uuid_array.cc json/options.cc json/chunked_builder.cc json/chunker.cc diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 9065e286a22..35d065ac353 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -29,6 +29,7 @@ #include "arrow/compute/kernels/test_util.h" #include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/row_encoder_internal.h" +#include "arrow/extension/uuid_array.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" @@ -54,6 +55,7 @@ using compute::SortIndices; using compute::SortKey; using compute::Take; using compute::internal::RowEncoder; +using extension::uuid; namespace acero { diff --git a/cpp/src/arrow/acero/util_test.cc b/cpp/src/arrow/acero/util_test.cc index a291075a0a9..8340eab80be 100644 --- a/cpp/src/arrow/acero/util_test.cc +++ b/cpp/src/arrow/acero/util_test.cc @@ -21,9 +21,12 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" +#include "arrow/extension/uuid_array.h" + using testing::Eq; namespace arrow { +using extension::uuid; namespace acero { const char* kLeftSuffix = ".left"; diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 09bb524adbd..d0e5933e600 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -31,6 +31,7 @@ #include "arrow/c/bridge.h" #include "arrow/c/helpers.h" #include "arrow/c/util_internal.h" +#include "arrow/extension/uuid_array.h" #include "arrow/ipc/json_simple.h" #include "arrow/memory_pool.h" #include "arrow/testing/builder.h" @@ -53,6 +54,7 @@ namespace arrow { +using extension::uuid; using internal::ArrayDeviceExportTraits; using internal::ArrayDeviceStreamExportTraits; using internal::ArrayExportGuard; @@ -2192,15 +2194,16 @@ TEST_F(TestSchemaImport, Dictionary) { } TEST_F(TestSchemaImport, UnregisteredExtension) { - FillPrimitive("w:16"); - c_struct_.metadata = kEncodedUuidMetadata.c_str(); - auto expected = fixed_size_binary(16); + FillPrimitive(AddChild(), "u"); + FillPrimitive("c"); + FillDictionary(); + c_struct_.metadata = kEncodedDictExtensionMetadata.c_str(); + auto expected = dictionary(int8(), utf8()); CheckImport(expected); } TEST_F(TestSchemaImport, RegisteredExtension) { { - ExtensionTypeGuard guard(uuid()); FillPrimitive("w:16"); c_struct_.metadata = kEncodedUuidMetadata.c_str(); auto expected = uuid(); @@ -2326,8 +2329,6 @@ TEST_F(TestSchemaImport, DictionaryError) { } TEST_F(TestSchemaImport, ExtensionError) { - ExtensionTypeGuard guard(uuid()); - // Storage type doesn't match FillPrimitive("w:15"); c_struct_.metadata = kEncodedUuidMetadata.c_str(); @@ -3711,7 +3712,10 @@ std::shared_ptr GetStorageWithMetadata(const std::string& field_name, } TEST_F(TestSchemaRoundtrip, UnregisteredExtension) { - TestWithTypeFactory(uuid, []() { return fixed_size_binary(16); }); + TestWithTypeFactory(complex128, []() { + return struct_({::arrow::field("real", float64(), /*nullable=*/false), + ::arrow::field("imag", float64(), /*nullable=*/false)}); + }); TestWithTypeFactory(dict_extension_type, []() { return dictionary(int8(), utf8()); }); // Inside nested type. @@ -3723,7 +3727,7 @@ TEST_F(TestSchemaRoundtrip, UnregisteredExtension) { } TEST_F(TestSchemaRoundtrip, RegisteredExtension) { - ExtensionTypeGuard guard({uuid(), dict_extension_type(), complex128()}); + ExtensionTypeGuard guard({dict_extension_type(), complex128()}); TestWithTypeFactory(uuid); TestWithTypeFactory(dict_extension_type); TestWithTypeFactory(complex128); @@ -4082,7 +4086,7 @@ TEST_F(TestArrayRoundtrip, Dictionary) { } TEST_F(TestArrayRoundtrip, RegisteredExtension) { - ExtensionTypeGuard guard({smallint(), complex128(), dict_extension_type(), uuid()}); + ExtensionTypeGuard guard({smallint(), complex128(), dict_extension_type()}); TestWithArrayFactory(ExampleSmallint); TestWithArrayFactory(ExampleUuid); @@ -4128,7 +4132,6 @@ TEST_F(TestArrayRoundtrip, UnregisteredExtension) { }; TestWithArrayFactory(ExampleSmallint, StorageExtractor(ExampleSmallint)); - TestWithArrayFactory(ExampleUuid, StorageExtractor(ExampleUuid)); TestWithArrayFactory(ExampleComplex128, StorageExtractor(ExampleComplex128)); TestWithArrayFactory(ExampleDictExtension, StorageExtractor(ExampleDictExtension)); } diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 5cb4bc77af2..dfbeb62a2a5 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -set(CANONICAL_EXTENSION_TESTS bool8_test.cc) +set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_array_test.cc) if(ARROW_JSON) list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 3fd39a11ff5..361c7ed39f3 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -23,7 +23,6 @@ #include "arrow/array/array_primitive.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" -#include "arrow/ipc/writer.h" #include "arrow/record_batch.h" #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" @@ -71,20 +70,6 @@ class TestExtensionType : public ::testing::Test { std::string serialized_; }; -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestExtensionType, CheckDummyRegistration) { // We need a registered dummy type at runtime to allow for IPC deserialization auto registered_type = GetExtensionType("arrow.fixed_shape_tensor"); diff --git a/cpp/src/arrow/extension/uuid_array.cc b/cpp/src/arrow/extension/uuid_array.cc new file mode 100644 index 00000000000..87c6106c6b6 --- /dev/null +++ b/cpp/src/arrow/extension/uuid_array.cc @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension_type.h" +#include "arrow/util/logging.h" + +#include "arrow/extension/uuid_array.h" + +namespace arrow { +namespace extension { + +bool UuidType::ExtensionEquals(const ExtensionType& other) const { + return (other.extension_name() == this->extension_name()); +} + +std::shared_ptr UuidType::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("uuid", static_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> UuidType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized) const { + if (serialized != "uuid-serialized") { + return Status::Invalid("Type identifier did not match: '", serialized, "'"); + } + if (!storage_type->Equals(*fixed_size_binary(16))) { + return Status::Invalid("Invalid storage type for UuidType: ", + storage_type->ToString()); + } + return std::make_shared(); +} + +std::shared_ptr uuid() { return std::make_shared(); } + +} // namespace extension +} // namespace arrow diff --git a/cpp/src/arrow/extension/uuid_array.h b/cpp/src/arrow/extension/uuid_array.h new file mode 100644 index 00000000000..c9e27fedbc0 --- /dev/null +++ b/cpp/src/arrow/extension/uuid_array.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/extension_type.h" + +namespace arrow { +namespace extension { + +class ARROW_EXPORT UuidArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +class ARROW_EXPORT UuidType : public ExtensionType { + public: + UuidType() : ExtensionType(fixed_size_binary(16)) {} + + std::string extension_name() const override { return "uuid"; } + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const override; + + std::string Serialize() const override { return "uuid-serialized"; } +}; + +ARROW_EXPORT +std::shared_ptr uuid(); + +} // namespace extension +} // namespace arrow diff --git a/cpp/src/arrow/extension/uuid_array_test.cc b/cpp/src/arrow/extension/uuid_array_test.cc new file mode 100644 index 00000000000..bfc40f811e6 --- /dev/null +++ b/cpp/src/arrow/extension/uuid_array_test.cc @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension/uuid_array.h" + +#include "arrow/testing/matchers.h" + +#include "arrow/array/array_nested.h" +#include "arrow/array/array_primitive.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/record_batch.h" +#include "arrow/tensor.h" +#include "arrow/testing/gtest_util.h" +#include "arrow/util/key_value_metadata.h" + +#include "arrow/testing/extension_type.h" + +namespace arrow { + +using extension::uuid; + +class TestUuuidExtensionType : public ::testing::Test {}; + +TEST_F(TestUuuidExtensionType, ExtensionTypeTest) { + auto type = uuid(); + ASSERT_EQ(type->id(), Type::EXTENSION); + + const auto& ext_type = static_cast(*type); + std::string serialized = ext_type.Serialize(); + + ASSERT_OK_AND_ASSIGN(auto deserialized, + ext_type.Deserialize(fixed_size_binary(16), serialized)); + ASSERT_TRUE(deserialized->Equals(*type)); + ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16))); +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 83c7ebed4f3..043b8d3733a 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -32,6 +32,7 @@ #include "arrow/extension/fixed_shape_tensor.h" #include "arrow/extension/opaque.h" #endif +#include "arrow/extension/uuid_array.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" @@ -147,7 +148,7 @@ static void CreateGlobalRegistry() { // Register canonical extension types g_registry = std::make_shared(); - std::vector> ext_types{extension::bool8()}; + std::vector> ext_types{extension::bool8(), extension::uuid()}; #ifdef ARROW_JSON ext_types.push_back(extension::fixed_shape_tensor(int64(), {})); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index f104c984a64..b24e6c395e9 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -26,6 +26,7 @@ #include "arrow/array/array_nested.h" #include "arrow/array/util.h" +#include "arrow/extension/uuid_array.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/options.h" @@ -41,6 +42,8 @@ namespace arrow { +using extension::uuid; + class Parametric1Array : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -176,16 +179,7 @@ class ExtStructType : public ExtensionType { std::string Serialize() const override { return "ext-struct-type-unique-code"; } }; -class TestExtensionType : public ::testing::Test { - public: - void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } - - void TearDown() { - if (GetExtensionType("uuid")) { - ASSERT_OK(UnregisterExtensionType("uuid")); - } - } -}; +class TestExtensionType : public ::testing::Test {}; TEST_F(TestExtensionType, ExtensionTypeTest) { auto type_not_exist = GetExtensionType("uuid-unknown"); @@ -211,20 +205,6 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { ASSERT_EQ(deserialized->byte_width(), 16); } -auto RoundtripBatch = [](const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -}; - TEST_F(TestExtensionType, IpcRoundtrip) { auto ext_arr = ExampleUuid(); auto batch = RecordBatch::Make(schema({field("f0", uuid())}), 4, {ext_arr}); diff --git a/cpp/src/arrow/flight/integration_tests/test_integration_client.cc b/cpp/src/arrow/flight/integration_tests/test_integration_client.cc index c5c3f10576d..75f0780486a 100644 --- a/cpp/src/arrow/flight/integration_tests/test_integration_client.cc +++ b/cpp/src/arrow/flight/integration_tests/test_integration_client.cc @@ -145,7 +145,6 @@ class IntegrationTestScenario : public Scenario { Status RunClient(std::unique_ptr client) override { // Make sure the required extension types are registered. - ExtensionTypeGuard uuid_ext_guard(uuid()); ExtensionTypeGuard dict_ext_guard(dict_extension_type()); FlightDescriptor descr{FlightDescriptor::PATH, "", {FLAGS_path}}; diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index 9b56928c688..28c363e2d2f 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -31,6 +31,7 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" +#include "arrow/extension/uuid_array.h" #include "arrow/integration/json_integration.h" #include "arrow/integration/json_internal.h" #include "arrow/io/file.h" @@ -67,11 +68,14 @@ DEFINE_bool(validate_times, true, namespace arrow::internal::integration { -using ::arrow::internal::TemporaryDir; -using ::arrow::ipc::DictionaryFieldMapper; -using ::arrow::ipc::DictionaryMemo; -using ::arrow::ipc::IpcWriteOptions; -using ::arrow::ipc::MetadataVersion; +using extension::uuid; +using internal::TemporaryDir; +using ipc::DictionaryFieldMapper; +using ipc::DictionaryMemo; +using ipc::IpcWriteOptions; +using ipc::MetadataVersion; + +namespace testing { using namespace ::arrow::ipc::test; // NOLINT @@ -224,7 +228,7 @@ Status RunCommand(const std::string& json_path, const std::string& arrow_path, const std::string& command) { // Make sure the required extension types are registered, as they will be // referenced in test data. - ExtensionTypeGuard ext_guard({uuid(), dict_extension_type()}); + ExtensionTypeGuard ext_guard({dict_extension_type()}); if (json_path == "") { return Status::Invalid("Must specify json file name"); @@ -1027,12 +1031,10 @@ TEST(TestJsonFileReadWrite, JsonExample1) { TEST(TestJsonFileReadWrite, JsonExample2) { // Example 2: two extension types (one registered, one unregistered) - auto uuid_type = uuid(); + auto uuid_type = arrow::extension::uuid(); auto buffer = Buffer::Wrap(json_example2, strlen(json_example2)); { - ExtensionTypeGuard ext_guard(uuid_type); - ASSERT_OK_AND_ASSIGN(auto reader, IntegrationJsonReader::Open(buffer)); // The second field is an unregistered extension and will be read as // its underlying storage. @@ -1046,13 +1048,11 @@ TEST(TestJsonFileReadWrite, JsonExample2) { auto storage_array = ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])"); - AssertArraysEqual(*batch->column(0), UuidArray(uuid_type, storage_array)); + AssertArraysEqual(*batch->column(0), + arrow::extension::UuidArray(uuid_type, storage_array)); AssertArraysEqual(*batch->column(1), NullArray(2)); } - - // Should fail now that the Uuid extension is unregistered - ASSERT_RAISES(KeyError, IntegrationJsonReader::Open(buffer)); } TEST(TestJsonFileReadWrite, JsonExample3) { @@ -1119,7 +1119,7 @@ class TestJsonRoundTrip : public ::testing::TestWithParam { }; void CheckRoundtrip(const RecordBatch& batch) { - ExtensionTypeGuard guard({uuid(), dict_extension_type(), complex128()}); + ExtensionTypeGuard guard({dict_extension_type(), complex128()}); TestSchemaRoundTrip(batch.schema()); @@ -1176,7 +1176,7 @@ const std::vector kBatchCases = { INSTANTIATE_TEST_SUITE_P(TestJsonRoundTrip, TestJsonRoundTrip, ::testing::ValuesIn(kBatchCases)); -} // namespace arrow::internal::integration +} // namespace testing int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -1184,8 +1184,8 @@ int main(int argc, char** argv) { int ret = 0; if (FLAGS_integration) { - arrow::Status result = - arrow::internal::integration::RunCommand(FLAGS_json, FLAGS_arrow, FLAGS_mode); + arrow::Status result = arrow::internal::integration::testing::RunCommand( + FLAGS_json, FLAGS_arrow, FLAGS_mode); if (!result.ok()) { std::cout << "Error message: " << result.ToString() << std::endl; ret = 1; @@ -1197,3 +1197,4 @@ int main(int argc, char** argv) { gflags::ShutDownCommandLineFlags(); return ret; } +} // namespace arrow::internal::integration diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index ff7838cc39d..99526331eea 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -31,6 +31,7 @@ #include "arrow/array.h" #include "arrow/array/builder_primitive.h" #include "arrow/buffer_builder.h" +#include "arrow/extension/uuid_array.h" #include "arrow/io/file.h" #include "arrow/io/memory.h" #include "arrow/io/test_common.h" @@ -59,6 +60,7 @@ namespace arrow { +using extension::uuid; using internal::checked_cast; using internal::checked_pointer_cast; using internal::TemporaryDir; @@ -407,7 +409,7 @@ static int g_file_number = 0; class ExtensionTypesMixin { public: // Register the extension types required to ensure roundtripping - ExtensionTypesMixin() : ext_guard_({uuid(), dict_extension_type(), complex128()}) {} + ExtensionTypesMixin() : ext_guard_({dict_extension_type(), complex128()}) {} protected: ExtensionTypeGuard ext_guard_; diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 87c02e2d87a..8de3ba0adb1 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -28,6 +28,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" #include "arrow/array/builder_time.h" +#include "arrow/extension/uuid_array.h" #include "arrow/ipc/test_common.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" @@ -50,6 +51,8 @@ namespace arrow { +using extension::uuid; +using extension::UuidArray; using internal::checked_cast; namespace ipc { diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 104a5697b57..4a9fe153d1b 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -31,6 +31,7 @@ #include "arrow/array/util.h" #include "arrow/buffer.h" #include "arrow/compute/cast.h" +#include "arrow/extension/uuid_array.h" #include "arrow/memory_pool.h" #include "arrow/scalar.h" #include "arrow/status.h" @@ -43,7 +44,8 @@ namespace arrow { using compute::Cast; using compute::CastOptions; - +using extension::uuid; +using extension::UuidType; using internal::checked_cast; using internal::checked_pointer_cast; diff --git a/cpp/src/arrow/testing/extension_type.h b/cpp/src/arrow/testing/extension_type.h index 6515631f202..7c368c91634 100644 --- a/cpp/src/arrow/testing/extension_type.h +++ b/cpp/src/arrow/testing/extension_type.h @@ -27,28 +27,6 @@ namespace arrow { -class ARROW_TESTING_EXPORT UuidArray : public ExtensionArray { - public: - using ExtensionArray::ExtensionArray; -}; - -class ARROW_TESTING_EXPORT UuidType : public ExtensionType { - public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} - - std::string extension_name() const override { return "uuid"; } - - bool ExtensionEquals(const ExtensionType& other) const override; - - std::shared_ptr MakeArray(std::shared_ptr data) const override; - - Result> Deserialize( - std::shared_ptr storage_type, - const std::string& serialized) const override; - - std::string Serialize() const override { return "uuid-serialized"; } -}; - class ARROW_TESTING_EXPORT SmallintArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -175,9 +153,6 @@ class ARROW_TESTING_EXPORT Complex128Type : public ExtensionType { std::string Serialize() const override { return "complex128-serialized"; } }; -ARROW_TESTING_EXPORT -std::shared_ptr uuid(); - ARROW_TESTING_EXPORT std::shared_ptr smallint(); diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 95de16c715f..2a118a42c12 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -17,6 +17,7 @@ #include "arrow/testing/gtest_util.h" +#include "arrow/extension/uuid_array.h" #include "arrow/testing/extension_type.h" #ifdef _WIN32 @@ -49,9 +50,13 @@ #include "arrow/buffer.h" #include "arrow/compute/api_vector.h" #include "arrow/datum.h" +#include "arrow/io/memory.h" #include "arrow/ipc/json_simple.h" #include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" #include "arrow/pretty_print.h" +#include "arrow/record_batch.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/tensor.h" @@ -586,6 +591,20 @@ void ApproxCompareBatch(const RecordBatch& left, const RecordBatch& right, }); } +void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(out)); +} + std::shared_ptr TweakValidityBit(const std::shared_ptr& array, int64_t index, bool validity) { auto data = array->data()->Copy(); @@ -847,28 +866,6 @@ Future<> SleepABitAsync() { /////////////////////////////////////////////////////////////////////////// // Extension types -bool UuidType::ExtensionEquals(const ExtensionType& other) const { - return (other.extension_name() == this->extension_name()); -} - -std::shared_ptr UuidType::MakeArray(std::shared_ptr data) const { - DCHECK_EQ(data->type->id(), Type::EXTENSION); - DCHECK_EQ("uuid", static_cast(*data->type).extension_name()); - return std::make_shared(data); -} - -Result> UuidType::Deserialize( - std::shared_ptr storage_type, const std::string& serialized) const { - if (serialized != "uuid-serialized") { - return Status::Invalid("Type identifier did not match: '", serialized, "'"); - } - if (!storage_type->Equals(*fixed_size_binary(16))) { - return Status::Invalid("Invalid storage type for UuidType: ", - storage_type->ToString()); - } - return std::make_shared(); -} - bool SmallintType::ExtensionEquals(const ExtensionType& other) const { return (other.extension_name() == this->extension_name()); } @@ -982,8 +979,6 @@ Result> Complex128Type::Deserialize( return std::make_shared(); } -std::shared_ptr uuid() { return std::make_shared(); } - std::shared_ptr smallint() { return std::make_shared(); } std::shared_ptr tinyint() { return std::make_shared(); } @@ -1011,7 +1006,7 @@ std::shared_ptr ExampleUuid() { auto arr = ArrayFromJSON( fixed_size_binary(16), "[null, \"abcdefghijklmno0\", \"abcdefghijklmno1\", \"abcdefghijklmno2\"]"); - return ExtensionType::WrapArray(uuid(), arr); + return ExtensionType::WrapArray(extension::uuid(), arr); } std::shared_ptr ExampleSmallint() { diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index 85b4c1f1f01..e77694d12a0 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -309,6 +309,9 @@ ARROW_TESTING_EXPORT void ApproxCompareBatch( const RecordBatch& left, const RecordBatch& right, bool compare_metadata = true, const EqualOptions& options = TestingEqualOptions()); +ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out); + // Check if the padding of the buffers of the array is zero. // Also cause valgrind warnings if the padding bytes are uninitialized. ARROW_TESTING_EXPORT void AssertZeroPadded(const Array& array); From e1d874e54c217810dc785be33dcaa4d503e2d923 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 27 Oct 2023 20:06:16 +0200 Subject: [PATCH 02/22] Move import --- cpp/src/arrow/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 55ee8d9906a..51e5cff276f 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -375,6 +375,8 @@ set(ARROW_SRCS device.cc extension_type.cc extension/bool8.cc + extension/uuid_array.cc + memory_pool.cc pretty_print.cc record_batch.cc result.cc From 4cee09e4802e546f90b530f11747c4333590b530 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 27 Oct 2023 20:39:49 +0200 Subject: [PATCH 03/22] Rename uuid examples --- c_glib/example/extension-type.c | 12 ++++++------ c_glib/test/test-extension-data-type.rb | 4 ++-- python/pyarrow/tests/extensions.pyx | 2 +- python/pyarrow/tests/test_extension_type.py | 4 ++-- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/c_glib/example/extension-type.c b/c_glib/example/extension-type.c index 1861943d7e5..9aae6b2ee4f 100644 --- a/c_glib/example/extension-type.c +++ b/c_glib/example/extension-type.c @@ -59,7 +59,7 @@ G_DEFINE_TYPE(ExampleUUIDDataType, static gchar * example_uuid_data_type_get_extension_name(GArrowExtensionDataType *data_type) { - return g_strdup("uuid"); + return g_strdup("example-uuid"); } static gboolean @@ -70,9 +70,8 @@ example_uuid_data_type_equal(GArrowExtensionDataType *data_type, return TRUE; } -static const gchar *example_uuid_data_type_serialize_id = "uuid-serialized"; -static ExampleUUIDDataType * -example_uuid_data_type_new(void); +static const gchar *example_uuid_data_type_serialize_id = "example-uuid-serialized"; +static ExampleUUIDDataType *example_uuid_data_type_new(void); static GArrowDataType * example_uuid_data_type_deserialize(GArrowExtensionDataType *data_type, @@ -221,8 +220,9 @@ main(int argc, char **argv) /* Create a record batch to serialize the created UUID extension array. */ GList *fields = NULL; - fields = - g_list_append(fields, garrow_field_new("uuid", GARROW_DATA_TYPE(uuid_data_type))); + fields = g_list_append(fields, + garrow_field_new("example-uuid", + GARROW_DATA_TYPE(uuid_data_type))); GArrowSchema *schema = garrow_schema_new(fields); g_list_free_full(fields, g_object_unref); GList *columns = NULL; diff --git a/c_glib/test/test-extension-data-type.rb b/c_glib/test/test-extension-data-type.rb index 6c114b81e2c..b3a3ce107ff 100644 --- a/c_glib/test/test-extension-data-type.rb +++ b/c_glib/test/test-extension-data-type.rb @@ -53,7 +53,7 @@ def test_name def test_to_s omit("gobject-introspection gem doesn't support implementing methods for GLib object yet") data_type = UUIDDataType.new - assert_equal("extension", data_type.to_s) + assert_equal("extension", data_type.to_s) end def test_storage_data_type @@ -65,7 +65,7 @@ def test_storage_data_type def test_extension_name omit("gobject-introspection gem doesn't support implementing methods for GLib object yet") data_type = UUIDDataType.new - assert_equal("uuid", data_type.extension_name) + assert_equal("example-uuid", data_type.extension_name) end def test_wrap_array diff --git a/python/pyarrow/tests/extensions.pyx b/python/pyarrow/tests/extensions.pyx index c1bf9aae1ec..309b574dc02 100644 --- a/python/pyarrow/tests/extensions.pyx +++ b/python/pyarrow/tests/extensions.pyx @@ -37,7 +37,7 @@ cdef extern from * namespace "arrow::py" nogil: class UuidType : public ExtensionType { public: UuidType() : ExtensionType(fixed_size_binary(16)) {} - std::string extension_name() const override { return "uuid"; } + std::string extension_name() const override { return "example-uuid"; } bool ExtensionEquals(const ExtensionType& other) const override { return other.extension_name() == this->extension_name(); diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 0d50c467e96..bc7b07ac303 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1347,7 +1347,7 @@ def test_cpp_extension_in_python(tmpdir): mod = __import__('extensions') uuid_type = mod._make_uuid_type() - assert uuid_type.extension_name == "uuid" + assert uuid_type.extension_name == "example-uuid" assert uuid_type.storage_type == pa.binary(16) array = mod._make_uuid_array() @@ -1356,7 +1356,7 @@ def test_cpp_extension_in_python(tmpdir): assert array[0].as_py() == b'abcdefghijklmno0' assert array[1].as_py() == b'0onmlkjihgfedcba' - buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["uuid"])) + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["example-uuid"])) batch = ipc_read_batch(buf) reconstructed_array = batch.column(0) From 5a15f884eb8e0477d1d463b1cef568f9172abf7c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 27 Oct 2023 22:58:53 +0200 Subject: [PATCH 04/22] Renaming uuid extension in datagen --- dev/archery/archery/integration/datagen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 47310c905a9..c96ef2bce88 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1845,7 +1845,7 @@ def generate_nested_dictionary_case(): def generate_extension_case(): dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - uuid_type = ExtensionType('uuid', 'uuid-serialized', + uuid_type = ExtensionType('example-uuid', 'uuid-serialized', FixedSizeBinaryField('', 16)) dict_ext_type = ExtensionType( 'dict-extension', 'dict-extension-serialized', From 9a59ec6a2f8730e45d47fb527cd2551a9806dc31 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 28 Oct 2023 05:56:47 +0200 Subject: [PATCH 05/22] Add scalar --- cpp/src/arrow/CMakeLists.txt | 4 +- cpp/src/arrow/acero/hash_join_node_test.cc | 2 +- cpp/src/arrow/acero/util_test.cc | 2 +- cpp/src/arrow/c/bridge_test.cc | 2 +- cpp/src/arrow/extension/CMakeLists.txt | 2 +- .../extension/{uuid_array.cc => uuid.cc} | 5 +- .../arrow/extension/{uuid_array.h => uuid.h} | 7 +- .../{uuid_array_test.cc => uuid_test.cc} | 4 +- cpp/src/arrow/extension_type.cc | 2 +- cpp/src/arrow/extension_type_test.cc | 2 +- .../integration/json_integration_test.cc | 2 +- cpp/src/arrow/ipc/read_write_test.cc | 2 +- cpp/src/arrow/ipc/test_common.cc | 2 +- cpp/src/arrow/scalar_test.cc | 2 +- cpp/src/arrow/testing/gtest_util.cc | 2 +- python/pyarrow/__init__.py | 18 ++-- python/pyarrow/array.pxi | 8 +- python/pyarrow/includes/libarrow.pxd | 15 +++- python/pyarrow/lib.pxd | 3 + python/pyarrow/public-api.pxi | 6 +- python/pyarrow/scalar.pxi | 10 +++ python/pyarrow/tests/test_extension_type.py | 87 +++++++++++++------ python/pyarrow/types.pxi | 47 ++++++++++ 23 files changed, 176 insertions(+), 60 deletions(-) rename cpp/src/arrow/extension/{uuid_array.cc => uuid.cc} (92%) rename cpp/src/arrow/extension/{uuid_array.h => uuid.h} (84%) rename cpp/src/arrow/extension/{uuid_array_test.cc => uuid_test.cc} (94%) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 51e5cff276f..a37226477d4 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -375,7 +375,7 @@ set(ARROW_SRCS device.cc extension_type.cc extension/bool8.cc - extension/uuid_array.cc + extension/uuid.cc memory_pool.cc pretty_print.cc record_batch.cc @@ -909,9 +909,9 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON + extension/bool8.cc extension/fixed_shape_tensor.cc extension/opaque.cc - extension/uuid_array.cc json/options.cc json/chunked_builder.cc json/chunker.cc diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 35d065ac353..6fed1cdf803 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -29,7 +29,7 @@ #include "arrow/compute/kernels/test_util.h" #include "arrow/compute/light_array_internal.h" #include "arrow/compute/row/row_encoder_internal.h" -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" #include "arrow/testing/extension_type.h" #include "arrow/testing/generator.h" #include "arrow/testing/gtest_util.h" diff --git a/cpp/src/arrow/acero/util_test.cc b/cpp/src/arrow/acero/util_test.cc index 8340eab80be..ec0a79d177c 100644 --- a/cpp/src/arrow/acero/util_test.cc +++ b/cpp/src/arrow/acero/util_test.cc @@ -21,7 +21,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" using testing::Eq; diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index d0e5933e600..dbb98344024 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -31,7 +31,7 @@ #include "arrow/c/bridge.h" #include "arrow/c/helpers.h" #include "arrow/c/util_internal.h" -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" #include "arrow/ipc/json_simple.h" #include "arrow/memory_pool.h" #include "arrow/testing/builder.h" diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index dfbeb62a2a5..065ea3f1ddb 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_array_test.cc) +set(CANONICAL_EXTENSION_TESTS bool8_test.cc uuid_test.cc) if(ARROW_JSON) list(APPEND CANONICAL_EXTENSION_TESTS fixed_shape_tensor_test.cc opaque_test.cc) diff --git a/cpp/src/arrow/extension/uuid_array.cc b/cpp/src/arrow/extension/uuid.cc similarity index 92% rename from cpp/src/arrow/extension/uuid_array.cc rename to cpp/src/arrow/extension/uuid.cc index 87c6106c6b6..f3c1d9e1dd5 100644 --- a/cpp/src/arrow/extension/uuid_array.cc +++ b/cpp/src/arrow/extension/uuid.cc @@ -18,7 +18,7 @@ #include "arrow/extension_type.h" #include "arrow/util/logging.h" -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" namespace arrow { namespace extension { @@ -29,7 +29,8 @@ bool UuidType::ExtensionEquals(const ExtensionType& other) const { std::shared_ptr UuidType::MakeArray(std::shared_ptr data) const { DCHECK_EQ(data->type->id(), Type::EXTENSION); - DCHECK_EQ("uuid", static_cast(*data->type).extension_name()); + DCHECK_EQ("arrow.uuid", + static_cast(*data->type).extension_name()); return std::make_shared(data); } diff --git a/cpp/src/arrow/extension/uuid_array.h b/cpp/src/arrow/extension/uuid.h similarity index 84% rename from cpp/src/arrow/extension/uuid_array.h rename to cpp/src/arrow/extension/uuid.h index c9e27fedbc0..6fd8d526b78 100644 --- a/cpp/src/arrow/extension/uuid_array.h +++ b/cpp/src/arrow/extension/uuid.h @@ -31,7 +31,9 @@ class ARROW_EXPORT UuidType : public ExtensionType { public: UuidType() : ExtensionType(fixed_size_binary(16)) {} - std::string extension_name() const override { return "uuid"; } + std::string extension_name() const override { return "arrow.uuid"; } + + const std::shared_ptr value_type() const { return fixed_size_binary(16); } bool ExtensionEquals(const ExtensionType& other) const override; @@ -42,6 +44,9 @@ class ARROW_EXPORT UuidType : public ExtensionType { const std::string& serialized) const override; std::string Serialize() const override { return "uuid-serialized"; } + + /// \brief Create a UuidType instance + static Result> Make() { return std::make_shared(); } }; ARROW_EXPORT diff --git a/cpp/src/arrow/extension/uuid_array_test.cc b/cpp/src/arrow/extension/uuid_test.cc similarity index 94% rename from cpp/src/arrow/extension/uuid_array_test.cc rename to cpp/src/arrow/extension/uuid_test.cc index bfc40f811e6..bbeaf453e1e 100644 --- a/cpp/src/arrow/extension/uuid_array_test.cc +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" #include "arrow/testing/matchers.h" @@ -23,9 +23,7 @@ #include "arrow/array/array_primitive.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" -#include "arrow/ipc/writer.h" #include "arrow/record_batch.h" -#include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/key_value_metadata.h" diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 043b8d3733a..5a297bda98d 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -32,7 +32,7 @@ #include "arrow/extension/fixed_shape_tensor.h" #include "arrow/extension/opaque.h" #endif -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index b24e6c395e9..c54656e7ed6 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -26,7 +26,7 @@ #include "arrow/array/array_nested.h" #include "arrow/array/util.h" -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/options.h" diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index 28c363e2d2f..53d00e44f8f 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -31,7 +31,7 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" #include "arrow/integration/json_integration.h" #include "arrow/integration/json_internal.h" #include "arrow/io/file.h" diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 99526331eea..649289fb149 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -31,7 +31,7 @@ #include "arrow/array.h" #include "arrow/array/builder_primitive.h" #include "arrow/buffer_builder.h" -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" #include "arrow/io/file.h" #include "arrow/io/memory.h" #include "arrow/io/test_common.h" diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 8de3ba0adb1..56c16f1b47d 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -28,7 +28,7 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" #include "arrow/array/builder_time.h" -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" #include "arrow/ipc/test_common.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 4a9fe153d1b..b638b672e54 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -31,7 +31,7 @@ #include "arrow/array/util.h" #include "arrow/buffer.h" #include "arrow/compute/cast.h" -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" #include "arrow/memory_pool.h" #include "arrow/scalar.h" #include "arrow/status.h" diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index 2a118a42c12..bd019133945 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -17,7 +17,7 @@ #include "arrow/testing/gtest_util.h" -#include "arrow/extension/uuid_array.h" +#include "arrow/extension/uuid.h" #include "arrow/testing/extension_type.h" #ifdef _WIN32 diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 807bcdc3150..d31c93119b7 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -172,9 +172,7 @@ def print_entry(label, value): union, sparse_union, dense_union, dictionary, run_end_encoded, - fixed_shape_tensor, - opaque, - bool8, + bool8, fixed_shape_tensor, opaque, uuid, field, type_for_alias, DataType, DictionaryType, StructType, @@ -184,8 +182,9 @@ def print_entry(label, value): TimestampType, Time32Type, Time64Type, DurationType, FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, - RunEndEncodedType, FixedShapeTensorType, OpaqueType, - Bool8Type, PyExtensionType, UnknownExtensionType, + RunEndEncodedType, Bool8Type, FixedShapeTensorType, + OpaqueType, UuidType, + PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -218,8 +217,9 @@ def print_entry(label, value): Time32Array, Time64Array, DurationArray, MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, - RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray, - Bool8Array, scalar, NA, _NULL as NULL, Scalar, + RunEndEncodedArray, Bool8Array, FixedShapeTensorArray, + OpaqueArray, UuidArray, + scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, @@ -235,8 +235,8 @@ def print_entry(label, value): StringScalar, LargeStringScalar, StringViewScalar, FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, - RunEndEncodedScalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar) + RunEndEncodedScalar, Bool8Scalar, ExtensionScalar, + FixedShapeTensorScalar, OpaqueScalar, UuidScalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 77d6c9c06d2..e80c6b52e89 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4338,7 +4338,13 @@ cdef class ExtensionArray(Array): return result -cdef class FixedShapeTensorArray(ExtensionArray): +class UuidArray(ExtensionArray): + """ + Concrete class for Arrow arrays of UUID data type. + """ + + +class FixedShapeTensorArray(ExtensionArray): """ Concrete class for fixed shape tensor extension arrays. diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 6f510cfc0c0..2c7399039b3 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2865,7 +2865,20 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() -cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: +cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension": + cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make() + + CResult[shared_ptr[CDataType]] Deserialize(const c_string& serialized_data) const + + c_string Serialize() const + + const shared_ptr[CDataType] value_type() + + +cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension": cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index a7c3b496a00..5c3d981c3ad 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -222,6 +222,9 @@ cdef class OpaqueType(BaseExtensionType): cdef: const COpaqueType* opaque_ext_type +cdef class UuidType(BaseExtensionType): + cdef: + const CUuidType* uuid_ext_type cdef class PyExtensionType(ExtensionType): pass diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 19a26bd6c68..83d9e9e79b6 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -122,12 +122,14 @@ cdef api object pyarrow_wrap_data_type( cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type) if cpy_ext_type != nullptr: return cpy_ext_type.GetInstance() + elif ext_type.extension_name() == b"arrow.bool8": + out = Bool8Type.__new__(Bool8Type) elif ext_type.extension_name() == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) elif ext_type.extension_name() == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) - elif ext_type.extension_name() == b"arrow.bool8": - out = Bool8Type.__new__(Bool8Type) + elif ext_type.extension_name() == b"arrow.uuid": + out = UuidType.__new__(UuidType) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 72ae2aee5f8..68f77832c43 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -17,6 +17,7 @@ import collections from cython cimport binding +from uuid import UUID cdef class Scalar(_Weakrefable): @@ -1043,6 +1044,15 @@ cdef class ExtensionScalar(Scalar): return pyarrow_wrap_scalar( sp_scalar) +class UuidScalar(ExtensionScalar): + """ + Concrete class for Uuid extension scalar. + """ + + def as_py(self): + return None if self.value is None else UUID(bytes=self.value.as_py()) + + cdef class FixedShapeTensorScalar(ExtensionScalar): """ Concrete class for fixed shape tensor extension scalar. diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index bc7b07ac303..361783a9e1c 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -95,18 +95,21 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): return cls() -class UuidScalarType(pa.ExtensionScalar): +class ExampleUuidScalarType(pa.ExtensionScalar): def as_py(self): return None if self.value is None else UUID(bytes=self.value.as_py()) -class UuidType(pa.ExtensionType): +class ExampleUuidType(pa.PyExtensionType): def __init__(self): super().__init__(pa.binary(16), 'pyarrow.tests.UuidType') + def __reduce__(self): + return ExampleUuidType, () + def __arrow_ext_scalar_class__(self): - return UuidScalarType + return ExampleUuidScalarType def __arrow_ext_serialize__(self): return b'' @@ -116,7 +119,7 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): return cls() -class UuidType2(pa.ExtensionType): +class ExampleUuidType2(pa.PyExtensionType): def __init__(self): super().__init__(pa.binary(16), 'pyarrow.tests.UuidType2') @@ -250,8 +253,8 @@ def ipc_read_batch(buf): def test_ext_type_basics(): - ty = UuidType() - assert ty.extension_name == "pyarrow.tests.UuidType" + ty = ExampleUuidType() + assert ty.extension_name == "arrow.py_extension_type" def test_ext_type_str(): @@ -266,17 +269,17 @@ def test_ext_type_repr(): assert repr(ty) == "IntegerType(DataType(int64))" -def test_ext_type_lifetime(): - ty = UuidType() +def test_ext_type__lifetime(): + ty = ExampleUuidType() wr = weakref.ref(ty) del ty assert wr() is None -def test_ext_type_storage_type(): - ty = UuidType() +def test_ext_type__storage_type(): + ty = ExampleUuidType() assert ty.storage_type == pa.binary(16) - assert ty.__class__ is UuidType + assert ty.__class__ is ExampleUuidType ty = ParamExtType(5) assert ty.storage_type == pa.binary(5) assert ty.__class__ is ParamExtType @@ -309,7 +312,7 @@ def test_ext_type_bit_width(): def test_ext_type_as_py(): - ty = UuidType() + ty = ExampleUuidType() expected = uuid4() scalar = pa.ExtensionScalar.from_storage(ty, expected.bytes) assert scalar.as_py() == expected @@ -342,7 +345,7 @@ def test_ext_type_as_py(): def test_uuid_type_pickle(pickle_module): for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1): - ty = UuidType() + ty = ExampleUuidType() ser = pickle_module.dumps(ty, protocol=proto) del ty ty = pickle_module.loads(ser) @@ -358,8 +361,8 @@ def test_ext_type_equality(): c = ParamExtType(6) assert a != b assert b == c - d = UuidType() - e = UuidType() + d = ExampleUuidType() + e = ExampleUuidType() assert a != d assert d == e @@ -403,7 +406,7 @@ def test_ext_array_equality(): storage1 = pa.array([b"0123456789abcdef"], type=pa.binary(16)) storage2 = pa.array([b"0123456789abcdef"], type=pa.binary(16)) storage3 = pa.array([], type=pa.binary(16)) - ty1 = UuidType() + ty1 = ExampleUuidType() ty2 = ParamExtType(16) a = pa.ExtensionArray.from_storage(ty1, storage1) @@ -451,9 +454,9 @@ def test_ext_scalar_from_array(): data = [b"0123456789abcdef", b"0123456789abcdef", b"zyxwvutsrqponmlk", None] storage = pa.array(data, type=pa.binary(16)) - ty1 = UuidType() + ty1 = ExampleUuidType() ty2 = ParamExtType(16) - ty3 = UuidType2() + ty3 = ExampleUuidType2() a = pa.ExtensionArray.from_storage(ty1, storage) b = pa.ExtensionArray.from_storage(ty2, storage) @@ -462,9 +465,9 @@ def test_ext_scalar_from_array(): scalars_a = list(a) assert len(scalars_a) == 4 - assert ty1.__arrow_ext_scalar_class__() == UuidScalarType - assert isinstance(a[0], UuidScalarType) - assert isinstance(scalars_a[0], UuidScalarType) + assert ty1.__arrow_ext_scalar_class__() == ExampleUuidScalarType + assert isinstance(a[0], ExampleUuidScalarType) + assert isinstance(scalars_a[0], ExampleUuidScalarType) for s, val in zip(scalars_a, data): assert isinstance(s, pa.ExtensionScalar) @@ -505,7 +508,7 @@ def test_ext_scalar_from_array(): def test_ext_scalar_from_storage(): - ty = UuidType() + ty = ExampleUuidType() s = pa.ExtensionScalar.from_storage(ty, None) assert isinstance(s, pa.ExtensionScalar) @@ -706,14 +709,14 @@ def test_cast_between_extension_types(): tiny_int_arr.cast(pa.int64()).cast(IntegerType()) # Between the same extension types is okay - array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(UuidType()) - out = array.cast(UuidType()) - assert out.type == UuidType() + array = pa.array([b'1' * 16, b'2' * 16], pa.binary(16)).cast(ExampleUuidType()) + out = array.cast(ExampleUuidType()) + assert out.type == ExampleUuidType() # Will still fail casting between extensions who share storage type, # can only cast between exactly the same extension types. with pytest.raises(TypeError, match='Casting from *'): - array.cast(UuidType2()) + array.cast(ExampleUuidType2()) def test_cast_to_extension_with_extension_storage(): @@ -744,10 +747,10 @@ def test_cast_nested_extension_types(data, type_factory): def test_casting_dict_array_to_extension_type(): storage = pa.array([b"0123456789abcdef"], type=pa.binary(16)) - arr = pa.ExtensionArray.from_storage(UuidType(), storage) + arr = pa.ExtensionArray.from_storage(ExampleUuidType(), storage) dict_arr = pa.DictionaryArray.from_arrays(pa.array([0, 0], pa.int32()), arr) - out = dict_arr.cast(UuidType()) + out = dict_arr.cast(ExampleUuidType()) assert isinstance(out, pa.ExtensionArray) assert out.to_pylist() == [UUID('30313233-3435-3637-3839-616263646566'), UUID('30313233-3435-3637-3839-616263646566')] @@ -1364,6 +1367,34 @@ def test_cpp_extension_in_python(tmpdir): assert reconstructed_array == array +def test_uuid_extension(): + data = [b"0123456789abcdef", b"0123456789abcdef", + b"zyxwvutsrqponmlk", None] + + uuid_type = pa.uuid() + assert uuid_type.extension_name == "arrow.uuid" + assert uuid_type.storage_type == pa.binary(16) + assert uuid_type.__class__ is pa.UuidType + + storage = pa.array(data, pa.binary(16)) + array = pa.ExtensionArray.from_storage(uuid_type, storage) + assert array.type == uuid_type + + assert array.to_pylist() == [x if x is None else UUID(bytes=x) for x in data] + assert array[0].as_py() == UUID(bytes=data[0]) + assert array[3].as_py() is None + + buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["uuid"])) + + batch = ipc_read_batch(buf) + reconstructed_array = batch.column(0) + assert reconstructed_array.type == uuid_type + assert reconstructed_array == array + + assert uuid_type.__arrow_ext_scalar_class__() == pa.UuidScalar + assert isinstance(array[0], pa.UuidScalar) + + def test_tensor_type(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3]) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 563782f0c26..8087cdfcfa3 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1765,6 +1765,38 @@ cdef class ExtensionType(BaseExtensionType): return ExtensionScalar +cdef class UuidType(BaseExtensionType): + """ + Concrete class for UUID extension type. + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.uuid_ext_type = type.get() + + def __arrow_ext_serialize__(self): + """ + Serialized representation of metadata to reconstruct the type object. + """ + return self.uuid_ext_type.Serialize() + + @classmethod + def __arrow_ext_deserialize__(self, storage_type, serialized): + """ + Return an UuidType instance from the storage type. + """ + return self.uuid_ext_type.Deserialize(storage_type, serialized) + + def __arrow_ext_class__(self): + return UuidArray + + def __reduce__(self): + return uuid, (self.value_type,) + + def __arrow_ext_scalar_class__(self): + return UuidScalar + + cdef class FixedShapeTensorType(BaseExtensionType): """ Concrete class for fixed shape tensor extension type. @@ -5208,6 +5240,21 @@ def run_end_encoded(run_end_type, value_type): return pyarrow_wrap_data_type(ree_type) +def uuid(): + """ + Create UuidType instance. + + Returns + ------- + type : UuidType + """ + + cdef UuidType out = UuidType.__new__(UuidType) + c_uuid_ext_type = GetResultValue(CUuidType.Make()) + out.init(c_uuid_ext_type) + return out + + def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=None): """ Create instance of fixed shape tensor extension type with shape and optional From 7fa2bff5af57723c2212cfe3f3a4f09220bf0f75 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 28 Oct 2023 13:17:23 +0200 Subject: [PATCH 06/22] Correct uuid extension type name --- cpp/src/arrow/c/bridge_test.cc | 2 +- cpp/src/arrow/extension_type_test.cc | 6 +++--- cpp/src/arrow/integration/json_integration_test.cc | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index dbb98344024..e13aee5b47f 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -189,7 +189,7 @@ static const std::string kEncodedUuidMetadata = // NOLINT: runtime/string #if ARROW_LITTLE_ENDIAN std::string {2, 0, 0, 0} + std::string {20, 0, 0, 0} + kExtensionTypeKeyName + - std::string {4, 0, 0, 0} + "uuid" + + std::string {10, 0, 0, 0} + "arrow.uuid" + std::string {24, 0, 0, 0} + kExtensionMetadataKeyName + std::string {15, 0, 0, 0} + "uuid-serialized"; #else diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index c54656e7ed6..7568fedf11e 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -185,7 +185,7 @@ TEST_F(TestExtensionType, ExtensionTypeTest) { auto type_not_exist = GetExtensionType("uuid-unknown"); ASSERT_EQ(type_not_exist, nullptr); - auto registered_type = GetExtensionType("uuid"); + auto registered_type = GetExtensionType("arrow.uuid"); ASSERT_NE(registered_type, nullptr); auto type = uuid(); @@ -235,9 +235,9 @@ TEST_F(TestExtensionType, UnrecognizedExtension) { ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - ASSERT_OK(UnregisterExtensionType("uuid")); + ASSERT_OK(UnregisterExtensionType("arrow.uuid")); auto ext_metadata = - key_value_metadata({{"ARROW:extension:name", "uuid"}, + key_value_metadata({{"ARROW:extension:name", "arrow.uuid"}, {"ARROW:extension:metadata", "uuid-serialized"}}); auto ext_field = field("f0", fixed_size_binary(16), true, ext_metadata); auto batch_no_ext = RecordBatch::Make(schema({ext_field}), 4, {storage_arr}); diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index 53d00e44f8f..f54bb431690 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -468,7 +468,7 @@ static const char* json_example2 = R"example( "nullable": true, "children" : [], "metadata" : [ - {"key": "ARROW:extension:name", "value": "uuid"}, + {"key": "ARROW:extension:name", "value": "arrow.uuid"}, {"key": "ARROW:extension:metadata", "value": "uuid-serialized"} ] }, From 25d75cea4fd0cb0428b267f2ad582e955b54beb8 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sun, 29 Oct 2023 02:09:51 +0200 Subject: [PATCH 07/22] minor change --- dev/archery/archery/integration/datagen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index c96ef2bce88..47310c905a9 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1845,7 +1845,7 @@ def generate_nested_dictionary_case(): def generate_extension_case(): dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - uuid_type = ExtensionType('example-uuid', 'uuid-serialized', + uuid_type = ExtensionType('uuid', 'uuid-serialized', FixedSizeBinaryField('', 16)) dict_ext_type = ExtensionType( 'dict-extension', 'dict-extension-serialized', From 76a9127d48e9d128f71da2161b103c3ddadc16cd Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 30 Oct 2023 06:53:15 +0100 Subject: [PATCH 08/22] minor change --- c_glib/example/extension-type.c | 6 +++--- c_glib/test/test-extension-data-type.rb | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/c_glib/example/extension-type.c b/c_glib/example/extension-type.c index 9aae6b2ee4f..09435eac7e7 100644 --- a/c_glib/example/extension-type.c +++ b/c_glib/example/extension-type.c @@ -59,7 +59,7 @@ G_DEFINE_TYPE(ExampleUUIDDataType, static gchar * example_uuid_data_type_get_extension_name(GArrowExtensionDataType *data_type) { - return g_strdup("example-uuid"); + return g_strdup("arrow.uuid"); } static gboolean @@ -70,7 +70,7 @@ example_uuid_data_type_equal(GArrowExtensionDataType *data_type, return TRUE; } -static const gchar *example_uuid_data_type_serialize_id = "example-uuid-serialized"; +static const gchar *example_uuid_data_type_serialize_id = "uuid-serialized"; static ExampleUUIDDataType *example_uuid_data_type_new(void); static GArrowDataType * @@ -221,7 +221,7 @@ main(int argc, char **argv) /* Create a record batch to serialize the created UUID extension array. */ GList *fields = NULL; fields = g_list_append(fields, - garrow_field_new("example-uuid", + garrow_field_new("arrow.uuid", GARROW_DATA_TYPE(uuid_data_type))); GArrowSchema *schema = garrow_schema_new(fields); g_list_free_full(fields, g_object_unref); diff --git a/c_glib/test/test-extension-data-type.rb b/c_glib/test/test-extension-data-type.rb index b3a3ce107ff..0018fb3298c 100644 --- a/c_glib/test/test-extension-data-type.rb +++ b/c_glib/test/test-extension-data-type.rb @@ -53,7 +53,7 @@ def test_name def test_to_s omit("gobject-introspection gem doesn't support implementing methods for GLib object yet") data_type = UUIDDataType.new - assert_equal("extension", data_type.to_s) + assert_equal("extension", data_type.to_s) end def test_storage_data_type @@ -65,7 +65,7 @@ def test_storage_data_type def test_extension_name omit("gobject-introspection gem doesn't support implementing methods for GLib object yet") data_type = UUIDDataType.new - assert_equal("example-uuid", data_type.extension_name) + assert_equal("arrow.uuid", data_type.extension_name) end def test_wrap_array From d97c1fd9988f947d4d35ff903130c8acec83515a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 11 Dec 2023 16:05:12 +0100 Subject: [PATCH 09/22] Review feedback --- cpp/src/arrow/acero/util_test.cc | 1 - cpp/src/arrow/c/bridge_test.cc | 8 +++++++- python/pyarrow/src/arrow/python/gdb.cc | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/acero/util_test.cc b/cpp/src/arrow/acero/util_test.cc index ec0a79d177c..caf3aedf08a 100644 --- a/cpp/src/arrow/acero/util_test.cc +++ b/cpp/src/arrow/acero/util_test.cc @@ -17,7 +17,6 @@ #include "arrow/acero/hash_join_node.h" #include "arrow/acero/schema_util.h" -#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index e13aee5b47f..1e1e93d9403 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -2166,10 +2166,15 @@ TEST_F(TestSchemaImport, RunEndEncoded) { #endif TEST_F(TestSchemaImport, Dictionary) { + FillPrimitive("w:16"); + c_struct_.metadata = kEncodedUuidMetadata.c_str(); + auto expected = uuid(); + CheckImport(expected); + FillPrimitive(AddChild(), "u"); FillPrimitive("c"); FillDictionary(); - auto expected = dictionary(int8(), utf8()); + expected = dictionary(int8(), utf8()); CheckImport(expected); FillPrimitive(AddChild(), "u"); @@ -3712,6 +3717,7 @@ std::shared_ptr GetStorageWithMetadata(const std::string& field_name, } TEST_F(TestSchemaRoundtrip, UnregisteredExtension) { + TestWithTypeFactory(uuid, []() { return uuid(); }); TestWithTypeFactory(complex128, []() { return struct_({::arrow::field("real", float64(), /*nullable=*/false), ::arrow::field("imag", float64(), /*nullable=*/false)}); diff --git a/python/pyarrow/src/arrow/python/gdb.cc b/python/pyarrow/src/arrow/python/gdb.cc index 6941769e4ef..aa06d0eb154 100644 --- a/python/pyarrow/src/arrow/python/gdb.cc +++ b/python/pyarrow/src/arrow/python/gdb.cc @@ -76,7 +76,7 @@ class UuidType : public ExtensionType { return Status::NotImplemented(""); } - std::string Serialize() const override { return "uuid-serialized"; } + std::string Serialize() const override { return ""; } }; std::shared_ptr SliceArrayFromJSON(const std::shared_ptr& ty, From ad71226f29900fb1f74b191f39e390762de99f02 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 11 Dec 2023 16:13:14 +0100 Subject: [PATCH 10/22] Review feedback --- cpp/src/arrow/extension/uuid_test.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc index bbeaf453e1e..bca40e86b0f 100644 --- a/cpp/src/arrow/extension/uuid_test.cc +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -23,9 +23,7 @@ #include "arrow/array/array_primitive.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" -#include "arrow/record_batch.h" #include "arrow/testing/gtest_util.h" -#include "arrow/util/key_value_metadata.h" #include "arrow/testing/extension_type.h" @@ -33,9 +31,7 @@ namespace arrow { using extension::uuid; -class TestUuuidExtensionType : public ::testing::Test {}; - -TEST_F(TestUuuidExtensionType, ExtensionTypeTest) { +TEST(TestUuuidExtensionType, ExtensionTypeTest) { auto type = uuid(); ASSERT_EQ(type->id(), Type::EXTENSION); From 34e96cdb389c9df4b51526ab372968e952430402 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 11 Dec 2023 16:33:23 +0100 Subject: [PATCH 11/22] c_glib rename uuid example test --- c_glib/example/extension-type.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/c_glib/example/extension-type.c b/c_glib/example/extension-type.c index 09435eac7e7..390d21a89f7 100644 --- a/c_glib/example/extension-type.c +++ b/c_glib/example/extension-type.c @@ -59,7 +59,7 @@ G_DEFINE_TYPE(ExampleUUIDDataType, static gchar * example_uuid_data_type_get_extension_name(GArrowExtensionDataType *data_type) { - return g_strdup("arrow.uuid"); + return g_strdup("example.uuid"); } static gboolean @@ -221,7 +221,7 @@ main(int argc, char **argv) /* Create a record batch to serialize the created UUID extension array. */ GList *fields = NULL; fields = g_list_append(fields, - garrow_field_new("arrow.uuid", + garrow_field_new("example.uuid", GARROW_DATA_TYPE(uuid_data_type))); GArrowSchema *schema = garrow_schema_new(fields); g_list_free_full(fields, g_object_unref); From 70d409d1dae38cb423221b4847e1f12b55a51dde Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 11 Dec 2023 16:50:02 +0100 Subject: [PATCH 12/22] python changes --- cpp/src/arrow/c/bridge_test.cc | 14 +++++++------- python/pyarrow/tests/test_extension_type.py | 12 ++++++------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 1e1e93d9403..6ff1aa99d4a 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -2166,15 +2166,10 @@ TEST_F(TestSchemaImport, RunEndEncoded) { #endif TEST_F(TestSchemaImport, Dictionary) { - FillPrimitive("w:16"); - c_struct_.metadata = kEncodedUuidMetadata.c_str(); - auto expected = uuid(); - CheckImport(expected); - FillPrimitive(AddChild(), "u"); FillPrimitive("c"); FillDictionary(); - expected = dictionary(int8(), utf8()); + auto expected = dictionary(int8(), utf8()); CheckImport(expected); FillPrimitive(AddChild(), "u"); @@ -2199,11 +2194,16 @@ TEST_F(TestSchemaImport, Dictionary) { } TEST_F(TestSchemaImport, UnregisteredExtension) { + FillPrimitive("w:16"); + c_struct_.metadata = kEncodedUuidMetadata.c_str(); + auto expected = uuid(); + CheckImport(expected); + FillPrimitive(AddChild(), "u"); FillPrimitive("c"); FillDictionary(); c_struct_.metadata = kEncodedDictExtensionMetadata.c_str(); - auto expected = dictionary(int8(), utf8()); + expected = dictionary(int8(), utf8()); CheckImport(expected); } diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 361783a9e1c..326b3cb803c 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -100,10 +100,10 @@ def as_py(self): return None if self.value is None else UUID(bytes=self.value.as_py()) -class ExampleUuidType(pa.PyExtensionType): +class ExampleUuidType(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), 'pyarrow.tests.UuidType') + super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType') def __reduce__(self): return ExampleUuidType, () @@ -119,10 +119,10 @@ def __arrow_ext_deserialize__(cls, storage_type, serialized): return cls() -class ExampleUuidType2(pa.PyExtensionType): +class ExampleUuidType2(pa.ExtensionType): def __init__(self): - super().__init__(pa.binary(16), 'pyarrow.tests.UuidType2') + super().__init__(pa.binary(16), 'pyarrow.tests.ExampleUuidType2') def __arrow_ext_serialize__(self): return b'' @@ -254,7 +254,7 @@ def ipc_read_batch(buf): def test_ext_type_basics(): ty = ExampleUuidType() - assert ty.extension_name == "arrow.py_extension_type" + assert ty.extension_name == "pyarrow.tests.ExampleUuidType" def test_ext_type_str(): @@ -350,7 +350,7 @@ def test_uuid_type_pickle(pickle_module): del ty ty = pickle_module.loads(ser) wr = weakref.ref(ty) - assert ty.extension_name == "pyarrow.tests.UuidType" + assert ty.extension_name == "pyarrow.tests.ExampleUuidType" del ty assert wr() is None From e444e16f77117b13f8ce77bdabc049de9fbb77bb Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 11 Dec 2023 18:23:26 +0100 Subject: [PATCH 13/22] integration test issue --- dev/archery/archery/integration/datagen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 47310c905a9..38338bdc210 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1845,7 +1845,7 @@ def generate_nested_dictionary_case(): def generate_extension_case(): dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - uuid_type = ExtensionType('uuid', 'uuid-serialized', + uuid_type = ExtensionType('arrow.uuid', '', FixedSizeBinaryField('', 16)) dict_ext_type = ExtensionType( 'dict-extension', 'dict-extension-serialized', From 3724f54be681ec70989de757911a55978ec1f037 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Mon, 11 Dec 2023 20:04:50 +0100 Subject: [PATCH 14/22] integration tests --- dev/archery/archery/integration/datagen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 38338bdc210..d6a37a5e5b2 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1845,8 +1845,8 @@ def generate_nested_dictionary_case(): def generate_extension_case(): dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - uuid_type = ExtensionType('arrow.uuid', '', - FixedSizeBinaryField('', 16)) + uuid_type = ExtensionType('arrow-uuid', '', + FixedSizeBinaryField('uuids', 16)) dict_ext_type = ExtensionType( 'dict-extension', 'dict-extension-serialized', DictionaryField('str_dict', get_field('', 'int8'), dict0)) From b61a2c05775e84af26d52d41f69aff9207dc587c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 12 Dec 2023 02:03:10 +0100 Subject: [PATCH 15/22] remove uui-serialized string --- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/c/bridge_test.cc | 25 +++++++------------ cpp/src/arrow/extension/uuid.cc | 4 +-- cpp/src/arrow/extension/uuid.h | 2 +- cpp/src/arrow/extension_type_test.cc | 5 ++-- .../integration/json_integration_test.cc | 2 +- dev/archery/archery/integration/datagen.py | 4 +-- docs/source/format/Integration.rst | 2 +- python/pyarrow/array.pxi | 2 +- python/pyarrow/includes/libarrow.pxd | 2 +- 10 files changed, 20 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index a37226477d4..4d3b635eb07 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -376,7 +376,6 @@ set(ARROW_SRCS extension_type.cc extension/bool8.cc extension/uuid.cc - memory_pool.cc pretty_print.cc record_batch.cc result.cc diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 6ff1aa99d4a..417b02fd205 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -191,13 +191,13 @@ static const std::string kEncodedUuidMetadata = // NOLINT: runtime/string std::string {20, 0, 0, 0} + kExtensionTypeKeyName + std::string {10, 0, 0, 0} + "arrow.uuid" + std::string {24, 0, 0, 0} + kExtensionMetadataKeyName + - std::string {15, 0, 0, 0} + "uuid-serialized"; + std::string {0, 0, 0, 0} + ""; #else std::string {0, 0, 0, 2} + std::string {0, 0, 0, 20} + kExtensionTypeKeyName + - std::string {0, 0, 0, 4} + "uuid" + + std::string {0, 0, 0, 10} + "arrow.uuid" + std::string {0, 0, 0, 24} + kExtensionMetadataKeyName + - std::string {0, 0, 0, 15} + "uuid-serialized"; + std::string {0, 0, 0, 0} + ""; #endif static const std::string kEncodedDictExtensionMetadata = // NOLINT: runtime/string @@ -2198,13 +2198,6 @@ TEST_F(TestSchemaImport, UnregisteredExtension) { c_struct_.metadata = kEncodedUuidMetadata.c_str(); auto expected = uuid(); CheckImport(expected); - - FillPrimitive(AddChild(), "u"); - FillPrimitive("c"); - FillDictionary(); - c_struct_.metadata = kEncodedDictExtensionMetadata.c_str(); - expected = dictionary(int8(), utf8()); - CheckImport(expected); } TEST_F(TestSchemaImport, RegisteredExtension) { @@ -2339,12 +2332,12 @@ TEST_F(TestSchemaImport, ExtensionError) { c_struct_.metadata = kEncodedUuidMetadata.c_str(); CheckImportError(); - // Invalid serialization - std::string bogus_metadata = kEncodedUuidMetadata; - bogus_metadata[bogus_metadata.size() - 5] += 1; - FillPrimitive("w:16"); - c_struct_.metadata = bogus_metadata.c_str(); - CheckImportError(); + // // Invalid serialization + // std::string bogus_metadata = kEncodedUuidMetadata; + // bogus_metadata[bogus_metadata.size() - 5] += 1; + // FillPrimitive("w:16"); + // c_struct_.metadata = bogus_metadata.c_str(); + // CheckImportError(); } TEST_F(TestSchemaImport, RecursionError) { diff --git a/cpp/src/arrow/extension/uuid.cc b/cpp/src/arrow/extension/uuid.cc index f3c1d9e1dd5..80fee5cf105 100644 --- a/cpp/src/arrow/extension/uuid.cc +++ b/cpp/src/arrow/extension/uuid.cc @@ -36,8 +36,8 @@ std::shared_ptr UuidType::MakeArray(std::shared_ptr data) cons Result> UuidType::Deserialize( std::shared_ptr storage_type, const std::string& serialized) const { - if (serialized != "uuid-serialized") { - return Status::Invalid("Type identifier did not match: '", serialized, "'"); + if (!serialized.empty()) { + return Status::Invalid("Unexpected serialized metadata: '", serialized, "'"); } if (!storage_type->Equals(*fixed_size_binary(16))) { return Status::Invalid("Invalid storage type for UuidType: ", diff --git a/cpp/src/arrow/extension/uuid.h b/cpp/src/arrow/extension/uuid.h index 6fd8d526b78..3848c3d8a90 100644 --- a/cpp/src/arrow/extension/uuid.h +++ b/cpp/src/arrow/extension/uuid.h @@ -43,7 +43,7 @@ class ARROW_EXPORT UuidType : public ExtensionType { std::shared_ptr storage_type, const std::string& serialized) const override; - std::string Serialize() const override { return "uuid-serialized"; } + std::string Serialize() const override { return ""; } /// \brief Create a UuidType instance static Result> Make() { return std::make_shared(); } diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index 7568fedf11e..125d2b61583 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -236,9 +236,8 @@ TEST_F(TestExtensionType, UnrecognizedExtension) { ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); ASSERT_OK(UnregisterExtensionType("arrow.uuid")); - auto ext_metadata = - key_value_metadata({{"ARROW:extension:name", "arrow.uuid"}, - {"ARROW:extension:metadata", "uuid-serialized"}}); + auto ext_metadata = key_value_metadata( + {{"ARROW:extension:name", "arrow.uuid"}, {"ARROW:extension:metadata", ""}}); auto ext_field = field("f0", fixed_size_binary(16), true, ext_metadata); auto batch_no_ext = RecordBatch::Make(schema({ext_field}), 4, {storage_arr}); diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index f54bb431690..b71f52a965f 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -469,7 +469,7 @@ static const char* json_example2 = R"example( "children" : [], "metadata" : [ {"key": "ARROW:extension:name", "value": "arrow.uuid"}, - {"key": "ARROW:extension:metadata", "value": "uuid-serialized"} + {"key": "ARROW:extension:metadata", "value": ""} ] }, { diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index d6a37a5e5b2..38338bdc210 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1845,8 +1845,8 @@ def generate_nested_dictionary_case(): def generate_extension_case(): dict0 = Dictionary(0, StringField('dictionary0'), size=5, name='DICT0') - uuid_type = ExtensionType('arrow-uuid', '', - FixedSizeBinaryField('uuids', 16)) + uuid_type = ExtensionType('arrow.uuid', '', + FixedSizeBinaryField('', 16)) dict_ext_type = ExtensionType( 'dict-extension', 'dict-extension-serialized', DictionaryField('str_dict', get_field('', 'int8'), dict0)) diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 0ab5b832ad0..80bf2038f46 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -403,7 +403,7 @@ FixedSizeBinary(16) storage, here is how a "uuid" field would be represented:: "children" : [], "metadata" : [ {"key": "ARROW:extension:name", "value": "uuid"}, - {"key": "ARROW:extension:metadata", "value": "uuid-serialized"} + {"key": "ARROW:extension:metadata", "value": ""} ] } diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index e80c6b52e89..1587de0e6b7 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -4344,7 +4344,7 @@ class UuidArray(ExtensionArray): """ -class FixedShapeTensorArray(ExtensionArray): +cdef class FixedShapeTensorArray(ExtensionArray): """ Concrete class for fixed shape tensor extension arrays. diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 2c7399039b3..9dd74e769de 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2878,7 +2878,7 @@ cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension": const shared_ptr[CDataType] value_type() -cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension": +cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: cdef cppclass CFixedShapeTensorType \ " arrow::extension::FixedShapeTensorType"(CExtensionType): From 5a64d04683fd6d5337f5ba490ec43387e7110f8e Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 4 Apr 2024 22:51:50 +0200 Subject: [PATCH 16/22] lint --- c_glib/example/extension-type.c | 9 +++++---- cpp/src/arrow/testing/gtest_util.cc | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/c_glib/example/extension-type.c b/c_glib/example/extension-type.c index 390d21a89f7..1b639b3d793 100644 --- a/c_glib/example/extension-type.c +++ b/c_glib/example/extension-type.c @@ -71,7 +71,8 @@ example_uuid_data_type_equal(GArrowExtensionDataType *data_type, } static const gchar *example_uuid_data_type_serialize_id = "uuid-serialized"; -static ExampleUUIDDataType *example_uuid_data_type_new(void); +static ExampleUUIDDataType * +example_uuid_data_type_new(void); static GArrowDataType * example_uuid_data_type_deserialize(GArrowExtensionDataType *data_type, @@ -220,9 +221,9 @@ main(int argc, char **argv) /* Create a record batch to serialize the created UUID extension array. */ GList *fields = NULL; - fields = g_list_append(fields, - garrow_field_new("example.uuid", - GARROW_DATA_TYPE(uuid_data_type))); + fields = + g_list_append(fields, + garrow_field_new("example.uuid", GARROW_DATA_TYPE(uuid_data_type))); GArrowSchema *schema = garrow_schema_new(fields); g_list_free_full(fields, g_object_unref); GList *columns = NULL; diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index bd019133945..f86167056c0 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -52,9 +52,9 @@ #include "arrow/datum.h" #include "arrow/io/memory.h" #include "arrow/ipc/json_simple.h" -#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep #include "arrow/ipc/reader.h" #include "arrow/ipc/writer.h" +#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" From ffea27aad578c996b4d504c9da0f378a20348e13 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Tue, 16 Apr 2024 16:38:52 +0200 Subject: [PATCH 17/22] CMakeLists and other changes --- c_glib/example/extension-type.c | 5 ++--- c_glib/test/test-extension-data-type.rb | 4 ++-- cpp/src/arrow/CMakeLists.txt | 2 +- cpp/src/arrow/c/bridge_test.cc | 12 ++++++------ cpp/src/arrow/extension_type.cc | 1 - 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/c_glib/example/extension-type.c b/c_glib/example/extension-type.c index 1b639b3d793..1861943d7e5 100644 --- a/c_glib/example/extension-type.c +++ b/c_glib/example/extension-type.c @@ -59,7 +59,7 @@ G_DEFINE_TYPE(ExampleUUIDDataType, static gchar * example_uuid_data_type_get_extension_name(GArrowExtensionDataType *data_type) { - return g_strdup("example.uuid"); + return g_strdup("uuid"); } static gboolean @@ -222,8 +222,7 @@ main(int argc, char **argv) /* Create a record batch to serialize the created UUID extension array. */ GList *fields = NULL; fields = - g_list_append(fields, - garrow_field_new("example.uuid", GARROW_DATA_TYPE(uuid_data_type))); + g_list_append(fields, garrow_field_new("uuid", GARROW_DATA_TYPE(uuid_data_type))); GArrowSchema *schema = garrow_schema_new(fields); g_list_free_full(fields, g_object_unref); GList *columns = NULL; diff --git a/c_glib/test/test-extension-data-type.rb b/c_glib/test/test-extension-data-type.rb index 0018fb3298c..6c114b81e2c 100644 --- a/c_glib/test/test-extension-data-type.rb +++ b/c_glib/test/test-extension-data-type.rb @@ -53,7 +53,7 @@ def test_name def test_to_s omit("gobject-introspection gem doesn't support implementing methods for GLib object yet") data_type = UUIDDataType.new - assert_equal("extension", data_type.to_s) + assert_equal("extension", data_type.to_s) end def test_storage_data_type @@ -65,7 +65,7 @@ def test_storage_data_type def test_extension_name omit("gobject-introspection gem doesn't support implementing methods for GLib object yet") data_type = UUIDDataType.new - assert_equal("arrow.uuid", data_type.extension_name) + assert_equal("uuid", data_type.extension_name) end def test_wrap_array diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 4d3b635eb07..4d3c9016e8b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -1227,6 +1227,7 @@ add_subdirectory(testing) add_subdirectory(array) add_subdirectory(c) add_subdirectory(compute) +add_subdirectory(extension) add_subdirectory(io) add_subdirectory(tensor) add_subdirectory(util) @@ -1269,7 +1270,6 @@ endif() if(ARROW_JSON) add_subdirectory(json) - add_subdirectory(extension) endif() if(ARROW_ORC) diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 417b02fd205..eee43a060c2 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -2332,12 +2332,12 @@ TEST_F(TestSchemaImport, ExtensionError) { c_struct_.metadata = kEncodedUuidMetadata.c_str(); CheckImportError(); - // // Invalid serialization - // std::string bogus_metadata = kEncodedUuidMetadata; - // bogus_metadata[bogus_metadata.size() - 5] += 1; - // FillPrimitive("w:16"); - // c_struct_.metadata = bogus_metadata.c_str(); - // CheckImportError(); + // Invalid serialization + std::string bogus_metadata = kEncodedUuidMetadata; + bogus_metadata[bogus_metadata.size() - 2] += 1; + FillPrimitive("w:16"); + c_struct_.metadata = bogus_metadata.c_str(); + CheckImportError(); } TEST_F(TestSchemaImport, RecursionError) { diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index 5a297bda98d..fc220f73a6b 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -155,7 +155,6 @@ static void CreateGlobalRegistry() { ext_types.push_back(extension::opaque(null(), "", "")); #endif - // Register canonical extension types for (const auto& ext_type : ext_types) { ARROW_CHECK_OK( g_registry->RegisterType(checked_pointer_cast(ext_type))); From 7936e992b84b90a17120679af05d9db7c30b2f09 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 17 Apr 2024 02:52:36 +0200 Subject: [PATCH 18/22] Change cpp/src/arrow/extension/CMakeLists.txt --- cpp/src/arrow/c/bridge_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index eee43a060c2..43204e61726 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -2334,7 +2334,7 @@ TEST_F(TestSchemaImport, ExtensionError) { // Invalid serialization std::string bogus_metadata = kEncodedUuidMetadata; - bogus_metadata[bogus_metadata.size() - 2] += 1; + bogus_metadata[bogus_metadata.size() - 4] += 1; FillPrimitive("w:16"); c_struct_.metadata = bogus_metadata.c_str(); CheckImportError(); From 2fb3e08829e04070bd5319480f0275de997b8fb2 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 17 Apr 2024 22:37:09 +0200 Subject: [PATCH 19/22] Minor change --- cpp/src/arrow/c/bridge_test.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 43204e61726..50987c578b5 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -189,15 +189,15 @@ static const std::string kEncodedUuidMetadata = // NOLINT: runtime/string #if ARROW_LITTLE_ENDIAN std::string {2, 0, 0, 0} + std::string {20, 0, 0, 0} + kExtensionTypeKeyName + - std::string {10, 0, 0, 0} + "arrow.uuid" + + std::string {4, 0, 0, 0} + "uuid" + std::string {24, 0, 0, 0} + kExtensionMetadataKeyName + - std::string {0, 0, 0, 0} + ""; + std::string {15, 0, 0, 0} + "uuid-serialized"; #else std::string {0, 0, 0, 2} + std::string {0, 0, 0, 20} + kExtensionTypeKeyName + - std::string {0, 0, 0, 10} + "arrow.uuid" + + std::string {0, 0, 0, 4} + "uuid" + std::string {0, 0, 0, 24} + kExtensionMetadataKeyName + - std::string {0, 0, 0, 0} + ""; + std::string {0, 0, 0, 15} + "uuid-serialized"; #endif static const std::string kEncodedDictExtensionMetadata = // NOLINT: runtime/string From a06a212b3dcdcb1f31d978f4cf9249a9da77d9cb Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Wed, 17 Apr 2024 23:29:27 +0200 Subject: [PATCH 20/22] Reverting some testing changes --- cpp/src/arrow/acero/hash_join_node_test.cc | 1 - cpp/src/arrow/acero/util_test.cc | 4 +- cpp/src/arrow/c/bridge_test.cc | 19 +++++---- cpp/src/arrow/extension/uuid_test.cc | 2 - cpp/src/arrow/extension_type_test.cc | 23 +++++++---- .../test_integration_client.cc | 1 + .../integration/json_integration_test.cc | 39 +++++++++---------- cpp/src/arrow/ipc/read_write_test.cc | 4 +- cpp/src/arrow/ipc/test_common.cc | 7 +--- cpp/src/arrow/scalar_test.cc | 7 +--- cpp/src/arrow/testing/extension_type.h | 25 ++++++++++++ cpp/src/arrow/testing/gtest_util.cc | 27 ++++++++++++- docs/source/format/Integration.rst | 2 +- python/pyarrow/src/arrow/python/gdb.cc | 27 ++----------- python/pyarrow/tests/test_gdb.py | 8 ++-- 15 files changed, 108 insertions(+), 88 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 6fed1cdf803..76ad9c7d650 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -55,7 +55,6 @@ using compute::SortIndices; using compute::SortKey; using compute::Take; using compute::internal::RowEncoder; -using extension::uuid; namespace acero { diff --git a/cpp/src/arrow/acero/util_test.cc b/cpp/src/arrow/acero/util_test.cc index caf3aedf08a..a291075a0a9 100644 --- a/cpp/src/arrow/acero/util_test.cc +++ b/cpp/src/arrow/acero/util_test.cc @@ -17,15 +17,13 @@ #include "arrow/acero/hash_join_node.h" #include "arrow/acero/schema_util.h" +#include "arrow/testing/extension_type.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/matchers.h" -#include "arrow/extension/uuid.h" - using testing::Eq; namespace arrow { -using extension::uuid; namespace acero { const char* kLeftSuffix = ".left"; diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 50987c578b5..03a85a44c38 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -31,7 +31,6 @@ #include "arrow/c/bridge.h" #include "arrow/c/helpers.h" #include "arrow/c/util_internal.h" -#include "arrow/extension/uuid.h" #include "arrow/ipc/json_simple.h" #include "arrow/memory_pool.h" #include "arrow/testing/builder.h" @@ -2196,12 +2195,13 @@ TEST_F(TestSchemaImport, Dictionary) { TEST_F(TestSchemaImport, UnregisteredExtension) { FillPrimitive("w:16"); c_struct_.metadata = kEncodedUuidMetadata.c_str(); - auto expected = uuid(); + auto expected = fixed_size_binary(16); CheckImport(expected); } TEST_F(TestSchemaImport, RegisteredExtension) { { + ExtensionTypeGuard guard(uuid()); FillPrimitive("w:16"); c_struct_.metadata = kEncodedUuidMetadata.c_str(); auto expected = uuid(); @@ -2327,6 +2327,8 @@ TEST_F(TestSchemaImport, DictionaryError) { } TEST_F(TestSchemaImport, ExtensionError) { + ExtensionTypeGuard guard(uuid()); + // Storage type doesn't match FillPrimitive("w:15"); c_struct_.metadata = kEncodedUuidMetadata.c_str(); @@ -2334,7 +2336,7 @@ TEST_F(TestSchemaImport, ExtensionError) { // Invalid serialization std::string bogus_metadata = kEncodedUuidMetadata; - bogus_metadata[bogus_metadata.size() - 4] += 1; + bogus_metadata[bogus_metadata.size() - 5] += 1; FillPrimitive("w:16"); c_struct_.metadata = bogus_metadata.c_str(); CheckImportError(); @@ -3710,11 +3712,7 @@ std::shared_ptr GetStorageWithMetadata(const std::string& field_name, } TEST_F(TestSchemaRoundtrip, UnregisteredExtension) { - TestWithTypeFactory(uuid, []() { return uuid(); }); - TestWithTypeFactory(complex128, []() { - return struct_({::arrow::field("real", float64(), /*nullable=*/false), - ::arrow::field("imag", float64(), /*nullable=*/false)}); - }); + TestWithTypeFactory(uuid, []() { return fixed_size_binary(16); }); TestWithTypeFactory(dict_extension_type, []() { return dictionary(int8(), utf8()); }); // Inside nested type. @@ -3726,7 +3724,7 @@ TEST_F(TestSchemaRoundtrip, UnregisteredExtension) { } TEST_F(TestSchemaRoundtrip, RegisteredExtension) { - ExtensionTypeGuard guard({dict_extension_type(), complex128()}); + ExtensionTypeGuard guard({uuid(), dict_extension_type(), complex128()}); TestWithTypeFactory(uuid); TestWithTypeFactory(dict_extension_type); TestWithTypeFactory(complex128); @@ -4085,7 +4083,7 @@ TEST_F(TestArrayRoundtrip, Dictionary) { } TEST_F(TestArrayRoundtrip, RegisteredExtension) { - ExtensionTypeGuard guard({smallint(), complex128(), dict_extension_type()}); + ExtensionTypeGuard guard({smallint(), complex128(), dict_extension_type(), uuid()}); TestWithArrayFactory(ExampleSmallint); TestWithArrayFactory(ExampleUuid); @@ -4131,6 +4129,7 @@ TEST_F(TestArrayRoundtrip, UnregisteredExtension) { }; TestWithArrayFactory(ExampleSmallint, StorageExtractor(ExampleSmallint)); + TestWithArrayFactory(ExampleUuid, StorageExtractor(ExampleUuid)); TestWithArrayFactory(ExampleComplex128, StorageExtractor(ExampleComplex128)); TestWithArrayFactory(ExampleDictExtension, StorageExtractor(ExampleDictExtension)); } diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc index bca40e86b0f..6b146fee1f2 100644 --- a/cpp/src/arrow/extension/uuid_test.cc +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -29,8 +29,6 @@ namespace arrow { -using extension::uuid; - TEST(TestUuuidExtensionType, ExtensionTypeTest) { auto type = uuid(); ASSERT_EQ(type->id(), Type::EXTENSION); diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index 125d2b61583..a9ec70dbd56 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -26,7 +26,6 @@ #include "arrow/array/array_nested.h" #include "arrow/array/util.h" -#include "arrow/extension/uuid.h" #include "arrow/extension_type.h" #include "arrow/io/memory.h" #include "arrow/ipc/options.h" @@ -42,8 +41,6 @@ namespace arrow { -using extension::uuid; - class Parametric1Array : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -179,13 +176,22 @@ class ExtStructType : public ExtensionType { std::string Serialize() const override { return "ext-struct-type-unique-code"; } }; -class TestExtensionType : public ::testing::Test {}; +class TestExtensionType : public ::testing::Test { + public: + void SetUp() { ASSERT_OK(RegisterExtensionType(std::make_shared())); } + + void TearDown() { + if (GetExtensionType("uuid")) { + ASSERT_OK(UnregisterExtensionType("uuid")); + } + } +}; TEST_F(TestExtensionType, ExtensionTypeTest) { auto type_not_exist = GetExtensionType("uuid-unknown"); ASSERT_EQ(type_not_exist, nullptr); - auto registered_type = GetExtensionType("arrow.uuid"); + auto registered_type = GetExtensionType("uuid"); ASSERT_NE(registered_type, nullptr); auto type = uuid(); @@ -235,9 +241,10 @@ TEST_F(TestExtensionType, UnrecognizedExtension) { ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - ASSERT_OK(UnregisterExtensionType("arrow.uuid")); - auto ext_metadata = key_value_metadata( - {{"ARROW:extension:name", "arrow.uuid"}, {"ARROW:extension:metadata", ""}}); + ASSERT_OK(UnregisterExtensionType("uuid")); + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", "uuid"}, + {"ARROW:extension:metadata", "uuid-serialized"}}); auto ext_field = field("f0", fixed_size_binary(16), true, ext_metadata); auto batch_no_ext = RecordBatch::Make(schema({ext_field}), 4, {storage_arr}); diff --git a/cpp/src/arrow/flight/integration_tests/test_integration_client.cc b/cpp/src/arrow/flight/integration_tests/test_integration_client.cc index 75f0780486a..c5c3f10576d 100644 --- a/cpp/src/arrow/flight/integration_tests/test_integration_client.cc +++ b/cpp/src/arrow/flight/integration_tests/test_integration_client.cc @@ -145,6 +145,7 @@ class IntegrationTestScenario : public Scenario { Status RunClient(std::unique_ptr client) override { // Make sure the required extension types are registered. + ExtensionTypeGuard uuid_ext_guard(uuid()); ExtensionTypeGuard dict_ext_guard(dict_extension_type()); FlightDescriptor descr{FlightDescriptor::PATH, "", {FLAGS_path}}; diff --git a/cpp/src/arrow/integration/json_integration_test.cc b/cpp/src/arrow/integration/json_integration_test.cc index b71f52a965f..0e84ea6124d 100644 --- a/cpp/src/arrow/integration/json_integration_test.cc +++ b/cpp/src/arrow/integration/json_integration_test.cc @@ -31,7 +31,6 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" -#include "arrow/extension/uuid.h" #include "arrow/integration/json_integration.h" #include "arrow/integration/json_internal.h" #include "arrow/io/file.h" @@ -68,14 +67,11 @@ DEFINE_bool(validate_times, true, namespace arrow::internal::integration { -using extension::uuid; -using internal::TemporaryDir; -using ipc::DictionaryFieldMapper; -using ipc::DictionaryMemo; -using ipc::IpcWriteOptions; -using ipc::MetadataVersion; - -namespace testing { +using ::arrow::internal::TemporaryDir; +using ::arrow::ipc::DictionaryFieldMapper; +using ::arrow::ipc::DictionaryMemo; +using ::arrow::ipc::IpcWriteOptions; +using ::arrow::ipc::MetadataVersion; using namespace ::arrow::ipc::test; // NOLINT @@ -228,7 +224,7 @@ Status RunCommand(const std::string& json_path, const std::string& arrow_path, const std::string& command) { // Make sure the required extension types are registered, as they will be // referenced in test data. - ExtensionTypeGuard ext_guard({dict_extension_type()}); + ExtensionTypeGuard ext_guard({uuid(), dict_extension_type()}); if (json_path == "") { return Status::Invalid("Must specify json file name"); @@ -468,8 +464,8 @@ static const char* json_example2 = R"example( "nullable": true, "children" : [], "metadata" : [ - {"key": "ARROW:extension:name", "value": "arrow.uuid"}, - {"key": "ARROW:extension:metadata", "value": ""} + {"key": "ARROW:extension:name", "value": "uuid"}, + {"key": "ARROW:extension:metadata", "value": "uuid-serialized"} ] }, { @@ -1031,10 +1027,12 @@ TEST(TestJsonFileReadWrite, JsonExample1) { TEST(TestJsonFileReadWrite, JsonExample2) { // Example 2: two extension types (one registered, one unregistered) - auto uuid_type = arrow::extension::uuid(); + auto uuid_type = uuid(); auto buffer = Buffer::Wrap(json_example2, strlen(json_example2)); { + ExtensionTypeGuard ext_guard(uuid_type); + ASSERT_OK_AND_ASSIGN(auto reader, IntegrationJsonReader::Open(buffer)); // The second field is an unregistered extension and will be read as // its underlying storage. @@ -1048,11 +1046,13 @@ TEST(TestJsonFileReadWrite, JsonExample2) { auto storage_array = ArrayFromJSON(fixed_size_binary(16), R"(["0123456789abcdef", null])"); - AssertArraysEqual(*batch->column(0), - arrow::extension::UuidArray(uuid_type, storage_array)); + AssertArraysEqual(*batch->column(0), ExampleUuidArray(uuid_type, storage_array)); AssertArraysEqual(*batch->column(1), NullArray(2)); } + + // Should fail now that the Uuid extension is unregistered + ASSERT_RAISES(KeyError, IntegrationJsonReader::Open(buffer)); } TEST(TestJsonFileReadWrite, JsonExample3) { @@ -1119,7 +1119,7 @@ class TestJsonRoundTrip : public ::testing::TestWithParam { }; void CheckRoundtrip(const RecordBatch& batch) { - ExtensionTypeGuard guard({dict_extension_type(), complex128()}); + ExtensionTypeGuard guard({uuid(), dict_extension_type(), complex128()}); TestSchemaRoundTrip(batch.schema()); @@ -1176,7 +1176,7 @@ const std::vector kBatchCases = { INSTANTIATE_TEST_SUITE_P(TestJsonRoundTrip, TestJsonRoundTrip, ::testing::ValuesIn(kBatchCases)); -} // namespace testing +} // namespace arrow::internal::integration int main(int argc, char** argv) { gflags::ParseCommandLineFlags(&argc, &argv, true); @@ -1184,8 +1184,8 @@ int main(int argc, char** argv) { int ret = 0; if (FLAGS_integration) { - arrow::Status result = arrow::internal::integration::testing::RunCommand( - FLAGS_json, FLAGS_arrow, FLAGS_mode); + arrow::Status result = + arrow::internal::integration::RunCommand(FLAGS_json, FLAGS_arrow, FLAGS_mode); if (!result.ok()) { std::cout << "Error message: " << result.ToString() << std::endl; ret = 1; @@ -1197,4 +1197,3 @@ int main(int argc, char** argv) { gflags::ShutDownCommandLineFlags(); return ret; } -} // namespace arrow::internal::integration diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 649289fb149..ff7838cc39d 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -31,7 +31,6 @@ #include "arrow/array.h" #include "arrow/array/builder_primitive.h" #include "arrow/buffer_builder.h" -#include "arrow/extension/uuid.h" #include "arrow/io/file.h" #include "arrow/io/memory.h" #include "arrow/io/test_common.h" @@ -60,7 +59,6 @@ namespace arrow { -using extension::uuid; using internal::checked_cast; using internal::checked_pointer_cast; using internal::TemporaryDir; @@ -409,7 +407,7 @@ static int g_file_number = 0; class ExtensionTypesMixin { public: // Register the extension types required to ensure roundtripping - ExtensionTypesMixin() : ext_guard_({dict_extension_type(), complex128()}) {} + ExtensionTypesMixin() : ext_guard_({uuid(), dict_extension_type(), complex128()}) {} protected: ExtensionTypeGuard ext_guard_; diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 56c16f1b47d..72a481660e6 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -28,7 +28,6 @@ #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" #include "arrow/array/builder_time.h" -#include "arrow/extension/uuid.h" #include "arrow/ipc/test_common.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" @@ -51,8 +50,6 @@ namespace arrow { -using extension::uuid; -using extension::UuidArray; using internal::checked_cast; namespace ipc { @@ -1091,9 +1088,9 @@ Status MakeUuid(std::shared_ptr* out) { auto f1 = field("f1", uuid_type, /*nullable=*/false); auto schema = ::arrow::schema({f0, f1}); - auto a0 = std::make_shared( + auto a0 = std::make_shared( uuid_type, ArrayFromJSON(storage_type, R"(["0123456789abcdef", null])")); - auto a1 = std::make_shared( + auto a1 = std::make_shared( uuid_type, ArrayFromJSON(storage_type, R"(["ZYXWVUTSRQPONMLK", "JIHGFEDBA9876543"])")); diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index b638b672e54..e9ec13e98b4 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -31,7 +31,6 @@ #include "arrow/array/util.h" #include "arrow/buffer.h" #include "arrow/compute/cast.h" -#include "arrow/extension/uuid.h" #include "arrow/memory_pool.h" #include "arrow/scalar.h" #include "arrow/status.h" @@ -44,8 +43,6 @@ namespace arrow { using compute::Cast; using compute::CastOptions; -using extension::uuid; -using extension::UuidType; using internal::checked_cast; using internal::checked_pointer_cast; @@ -2040,7 +2037,7 @@ class TestExtensionScalar : public ::testing::Test { void SetUp() { type_ = uuid(); storage_type_ = fixed_size_binary(16); - uuid_type_ = checked_cast(type_.get()); + uuid_type_ = checked_cast(type_.get()); } protected: @@ -2051,7 +2048,7 @@ class TestExtensionScalar : public ::testing::Test { } std::shared_ptr type_, storage_type_; - const UuidType* uuid_type_{nullptr}; + const ExampleUuidType* uuid_type_{nullptr}; const std::string_view uuid_string1_{UUID_STRING1}; const std::string_view uuid_string2_{UUID_STRING2}; diff --git a/cpp/src/arrow/testing/extension_type.h b/cpp/src/arrow/testing/extension_type.h index 7c368c91634..a4526e31c2b 100644 --- a/cpp/src/arrow/testing/extension_type.h +++ b/cpp/src/arrow/testing/extension_type.h @@ -27,6 +27,28 @@ namespace arrow { +class ARROW_TESTING_EXPORT ExampleUuidArray : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +class ARROW_TESTING_EXPORT ExampleUuidType : public ExtensionType { + public: + ExampleUuidType() : ExtensionType(fixed_size_binary(16)) {} + + std::string extension_name() const override { return "uuid"; } + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const override; + + std::string Serialize() const override { return "uuid-serialized"; } +}; + class ARROW_TESTING_EXPORT SmallintArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; @@ -153,6 +175,9 @@ class ARROW_TESTING_EXPORT Complex128Type : public ExtensionType { std::string Serialize() const override { return "complex128-serialized"; } }; +ARROW_TESTING_EXPORT +std::shared_ptr uuid(); + ARROW_TESTING_EXPORT std::shared_ptr smallint(); diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index f86167056c0..f896f52c15e 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -17,7 +17,6 @@ #include "arrow/testing/gtest_util.h" -#include "arrow/extension/uuid.h" #include "arrow/testing/extension_type.h" #ifdef _WIN32 @@ -866,6 +865,28 @@ Future<> SleepABitAsync() { /////////////////////////////////////////////////////////////////////////// // Extension types +bool ExampleUuidType::ExtensionEquals(const ExtensionType& other) const { + return (other.extension_name() == this->extension_name()); +} + +std::shared_ptr ExampleUuidType::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("uuid", static_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> ExampleUuidType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized) const { + if (serialized != "uuid-serialized") { + return Status::Invalid("Type identifier did not match: '", serialized, "'"); + } + if (!storage_type->Equals(*fixed_size_binary(16))) { + return Status::Invalid("Invalid storage type for UuidType: ", + storage_type->ToString()); + } + return std::make_shared(); +} + bool SmallintType::ExtensionEquals(const ExtensionType& other) const { return (other.extension_name() == this->extension_name()); } @@ -979,6 +1000,8 @@ Result> Complex128Type::Deserialize( return std::make_shared(); } +std::shared_ptr uuid() { return std::make_shared(); } + std::shared_ptr smallint() { return std::make_shared(); } std::shared_ptr tinyint() { return std::make_shared(); } @@ -1006,7 +1029,7 @@ std::shared_ptr ExampleUuid() { auto arr = ArrayFromJSON( fixed_size_binary(16), "[null, \"abcdefghijklmno0\", \"abcdefghijklmno1\", \"abcdefghijklmno2\"]"); - return ExtensionType::WrapArray(extension::uuid(), arr); + return ExtensionType::WrapArray(uuid(), arr); } std::shared_ptr ExampleSmallint() { diff --git a/docs/source/format/Integration.rst b/docs/source/format/Integration.rst index 80bf2038f46..0ab5b832ad0 100644 --- a/docs/source/format/Integration.rst +++ b/docs/source/format/Integration.rst @@ -403,7 +403,7 @@ FixedSizeBinary(16) storage, here is how a "uuid" field would be represented:: "children" : [], "metadata" : [ {"key": "ARROW:extension:name", "value": "uuid"}, - {"key": "ARROW:extension:metadata", "value": ""} + {"key": "ARROW:extension:metadata", "value": "uuid-serialized"} ] } diff --git a/python/pyarrow/src/arrow/python/gdb.cc b/python/pyarrow/src/arrow/python/gdb.cc index aa06d0eb154..7c58bae3342 100644 --- a/python/pyarrow/src/arrow/python/gdb.cc +++ b/python/pyarrow/src/arrow/python/gdb.cc @@ -22,7 +22,7 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" #include "arrow/datum.h" -#include "arrow/extension_type.h" +#include "arrow/extension/uuid.h" #include "arrow/ipc/json_simple.h" #include "arrow/python/gdb.h" #include "arrow/record_batch.h" @@ -37,6 +37,8 @@ namespace arrow { +using extension::uuid; +using extension::UuidType; using ipc::internal::json::ArrayFromJSON; using ipc::internal::json::ChunkedArrayFromJSON; using ipc::internal::json::ScalarFromJSON; @@ -56,29 +58,6 @@ class CustomStatusDetail : public StatusDetail { std::string ToString() const override { return "This is a detail"; } }; -class UuidType : public ExtensionType { - public: - UuidType() : ExtensionType(fixed_size_binary(16)) {} - - std::string extension_name() const override { return "uuid"; } - - bool ExtensionEquals(const ExtensionType& other) const override { - return (other.extension_name() == this->extension_name()); - } - - std::shared_ptr MakeArray(std::shared_ptr data) const override { - return std::make_shared(data); - } - - Result> Deserialize( - std::shared_ptr storage_type, - const std::string& serialized) const override { - return Status::NotImplemented(""); - } - - std::string Serialize() const override { return ""; } -}; - std::shared_ptr SliceArrayFromJSON(const std::shared_ptr& ty, std::string_view json, int64_t offset = 0, int64_t length = -1) { diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 0d12d710dcf..2ac2f55754f 100644 --- a/python/pyarrow/tests/test_gdb.py +++ b/python/pyarrow/tests/test_gdb.py @@ -409,7 +409,7 @@ def test_types_stack(gdb_arrow): check_stack_repr( gdb_arrow, "uuid_type", - ('arrow::ExtensionType "extension" ' + ('arrow::ExtensionType "extension" ' 'with storage type arrow::fixed_size_binary(16)')) @@ -447,7 +447,7 @@ def test_types_heap(gdb_arrow): check_heap_repr( gdb_arrow, "heap_uuid_type", - ('arrow::ExtensionType "extension" ' + ('arrow::ExtensionType "extension" ' 'with storage type arrow::fixed_size_binary(16)')) @@ -716,12 +716,12 @@ def test_scalars_stack(gdb_arrow): check_stack_repr( gdb_arrow, "extension_scalar", - ('arrow::ExtensionScalar of type "extension", ' + ('arrow::ExtensionScalar of type "extension", ' 'value arrow::FixedSizeBinaryScalar of size 16, ' 'value "0123456789abcdef"')) check_stack_repr( gdb_arrow, "extension_scalar_null", - 'arrow::ExtensionScalar of type "extension", null value') + 'arrow::ExtensionScalar of type "extension", null value') def test_scalars_heap(gdb_arrow): From 8007ef5fc1492a20b5d3ecb2adf3b3189571f66d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 9 May 2024 00:51:32 +0200 Subject: [PATCH 21/22] Add status info to docs --- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/c/bridge_test.cc | 1 - docs/source/format/CanonicalExtensions.rst | 2 ++ docs/source/status.rst | 2 +- python/pyarrow/tests/test_extension_type.py | 4 ++-- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 4d3c9016e8b..6b0ac8c23c7 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -908,7 +908,6 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON - extension/bool8.cc extension/fixed_shape_tensor.cc extension/opaque.cc json/options.cc diff --git a/cpp/src/arrow/c/bridge_test.cc b/cpp/src/arrow/c/bridge_test.cc index 03a85a44c38..09bb524adbd 100644 --- a/cpp/src/arrow/c/bridge_test.cc +++ b/cpp/src/arrow/c/bridge_test.cc @@ -53,7 +53,6 @@ namespace arrow { -using extension::uuid; using internal::ArrayDeviceExportTraits; using internal::ArrayDeviceStreamExportTraits; using internal::ArrayExportGuard; diff --git a/docs/source/format/CanonicalExtensions.rst b/docs/source/format/CanonicalExtensions.rst index 5658f949cee..1106f8aaffd 100644 --- a/docs/source/format/CanonicalExtensions.rst +++ b/docs/source/format/CanonicalExtensions.rst @@ -272,6 +272,8 @@ JSON In the future, additional fields may be added, but they are not required to interpret the array. +.. _uuid_extension: + UUID ==== diff --git a/docs/source/status.rst b/docs/source/status.rst index 5e2c2cc19c8..b685d4bbf8a 100644 --- a/docs/source/status.rst +++ b/docs/source/status.rst @@ -121,7 +121,7 @@ Data Types +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | JSON | | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ -| UUID | | | ✓ | | | | | | +| UUID | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ | 8-bit Boolean | ✓ | | ✓ | | | | | | +-----------------------+-------+-------+-------+------------+-------+-------+-------+-------+ diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 326b3cb803c..56ef9c42722 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -287,7 +287,7 @@ def test_ext_type__storage_type(): def test_ext_type_byte_width(): # Test for fixed-size binary types - ty = UuidType() + ty = pa.uuid() assert ty.byte_width == 16 ty = ParamExtType(5) assert ty.byte_width == 5 @@ -300,7 +300,7 @@ def test_ext_type_byte_width(): def test_ext_type_bit_width(): # Test for fixed-size binary types - ty = UuidType() + ty = pa.uuid() assert ty.bit_width == 128 ty = ParamExtType(5) assert ty.bit_width == 40 From 6ac6b59277c543e0460863608768b8d157fea83c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 22 Aug 2024 13:30:48 +0200 Subject: [PATCH 22/22] Review feedback --- .../extension/fixed_shape_tensor_test.cc | 2 ++ cpp/src/arrow/extension/uuid.cc | 14 ++++++--- cpp/src/arrow/extension/uuid.h | 21 ++++++++----- cpp/src/arrow/extension/uuid_test.cc | 31 +++++++++++++++++-- cpp/src/arrow/extension_type_test.cc | 3 ++ cpp/src/arrow/ipc/test_common.cc | 31 ++++++++++++++----- cpp/src/arrow/ipc/test_common.h | 3 ++ cpp/src/arrow/testing/gtest_util.cc | 14 --------- cpp/src/arrow/testing/gtest_util.h | 3 -- python/pyarrow/includes/libarrow.pxd | 9 ++---- python/pyarrow/public-api.pxi | 9 +++--- python/pyarrow/tests/test_extension_type.py | 14 +++++++-- python/pyarrow/types.pxi | 15 +-------- 13 files changed, 105 insertions(+), 64 deletions(-) diff --git a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc index 361c7ed39f3..842a78e1a4f 100644 --- a/cpp/src/arrow/extension/fixed_shape_tensor_test.cc +++ b/cpp/src/arrow/extension/fixed_shape_tensor_test.cc @@ -23,6 +23,7 @@ #include "arrow/array/array_primitive.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" +#include "arrow/ipc/test_common.h" #include "arrow/record_batch.h" #include "arrow/tensor.h" #include "arrow/testing/gtest_util.h" @@ -32,6 +33,7 @@ namespace arrow { using FixedShapeTensorType = extension::FixedShapeTensorType; +using arrow::ipc::test::RoundtripBatch; using extension::fixed_shape_tensor; using extension::FixedShapeTensorArray; diff --git a/cpp/src/arrow/extension/uuid.cc b/cpp/src/arrow/extension/uuid.cc index 80fee5cf105..43b917a17f8 100644 --- a/cpp/src/arrow/extension/uuid.cc +++ b/cpp/src/arrow/extension/uuid.cc @@ -15,13 +15,14 @@ // specific language governing permissions and limitations // under the License. +#include + #include "arrow/extension_type.h" #include "arrow/util/logging.h" #include "arrow/extension/uuid.h" -namespace arrow { -namespace extension { +namespace arrow::extension { bool UuidType::ExtensionEquals(const ExtensionType& other) const { return (other.extension_name() == this->extension_name()); @@ -46,7 +47,12 @@ Result> UuidType::Deserialize( return std::make_shared(); } +std::string UuidType::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() << ">"; + return ss.str(); +} + std::shared_ptr uuid() { return std::make_shared(); } -} // namespace extension -} // namespace arrow +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/uuid.h b/cpp/src/arrow/extension/uuid.h index 3848c3d8a90..42bb21cf0b2 100644 --- a/cpp/src/arrow/extension/uuid.h +++ b/cpp/src/arrow/extension/uuid.h @@ -19,24 +19,30 @@ #include "arrow/extension_type.h" -namespace arrow { -namespace extension { +namespace arrow::extension { +/// \brief UuidArray stores array of UUIDs. Underlying storage type is +/// FixedSizeBinary(16). class ARROW_EXPORT UuidArray : public ExtensionArray { public: using ExtensionArray::ExtensionArray; }; +/// \brief UuidType is a canonical arrow extension type for UUIDs. +/// UUIDs are stored as FixedSizeBinary(16) with big-endian notation and this +/// does not interpret the bytes in any way. Specific UUID version is not +/// required or guaranteed. class ARROW_EXPORT UuidType : public ExtensionType { public: + /// \brief Construct a UuidType. UuidType() : ExtensionType(fixed_size_binary(16)) {} std::string extension_name() const override { return "arrow.uuid"; } - - const std::shared_ptr value_type() const { return fixed_size_binary(16); } + std::string ToString(bool show_metadata = false) const override; bool ExtensionEquals(const ExtensionType& other) const override; + /// Create a UuidArray from ArrayData std::shared_ptr MakeArray(std::shared_ptr data) const override; Result> Deserialize( @@ -49,8 +55,7 @@ class ARROW_EXPORT UuidType : public ExtensionType { static Result> Make() { return std::make_shared(); } }; -ARROW_EXPORT -std::shared_ptr uuid(); +/// \brief Return a UuidType instance. +ARROW_EXPORT std::shared_ptr uuid(); -} // namespace extension -} // namespace arrow +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/uuid_test.cc b/cpp/src/arrow/extension/uuid_test.cc index 6b146fee1f2..3bbb6eeb4ae 100644 --- a/cpp/src/arrow/extension/uuid_test.cc +++ b/cpp/src/arrow/extension/uuid_test.cc @@ -19,16 +19,18 @@ #include "arrow/testing/matchers.h" -#include "arrow/array/array_nested.h" -#include "arrow/array/array_primitive.h" #include "arrow/io/memory.h" #include "arrow/ipc/reader.h" +#include "arrow/ipc/test_common.h" #include "arrow/testing/gtest_util.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/testing/extension_type.h" namespace arrow { +using arrow::ipc::test::RoundtripBatch; + TEST(TestUuuidExtensionType, ExtensionTypeTest) { auto type = uuid(); ASSERT_EQ(type->id(), Type::EXTENSION); @@ -42,4 +44,29 @@ TEST(TestUuuidExtensionType, ExtensionTypeTest) { ASSERT_FALSE(deserialized->Equals(*fixed_size_binary(16))); } +TEST(TestUuuidExtensionType, RoundtripBatch) { + auto ext_type = extension::uuid(); + auto exact_ext_type = internal::checked_pointer_cast(ext_type); + auto arr = ArrayFromJSON(fixed_size_binary(16), R"(["abcdefghijklmnop", null])"); + auto ext_arr = ExtensionType::WrapArray(ext_type, arr); + + // Pass extension array, expect getting back extension array + std::shared_ptr read_batch; + auto ext_field = field(/*name=*/"f0", /*type=*/ext_type); + auto batch = RecordBatch::Make(schema({ext_field}), ext_arr->length(), {ext_arr}); + RoundtripBatch(batch, &read_batch); + CompareBatch(*batch, *read_batch, /*compare_metadata=*/true); + + // Pass extension metadata and storage array, expect getting back extension array + std::shared_ptr read_batch2; + auto ext_metadata = + key_value_metadata({{"ARROW:extension:name", exact_ext_type->extension_name()}, + {"ARROW:extension:metadata", ""}}); + ext_field = field(/*name=*/"f0", /*type=*/exact_ext_type->storage_type(), + /*nullable=*/true, /*metadata=*/ext_metadata); + auto batch2 = RecordBatch::Make(schema({ext_field}), arr->length(), {arr}); + RoundtripBatch(batch2, &read_batch2); + CompareBatch(*batch, *read_batch2, /*compare_metadata=*/true); +} + } // namespace arrow diff --git a/cpp/src/arrow/extension_type_test.cc b/cpp/src/arrow/extension_type_test.cc index a9ec70dbd56..f49ffc5cba5 100644 --- a/cpp/src/arrow/extension_type_test.cc +++ b/cpp/src/arrow/extension_type_test.cc @@ -30,6 +30,7 @@ #include "arrow/io/memory.h" #include "arrow/ipc/options.h" #include "arrow/ipc/reader.h" +#include "arrow/ipc/test_common.h" #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -41,6 +42,8 @@ namespace arrow { +using arrow::ipc::test::RoundtripBatch; + class Parametric1Array : public ExtensionArray { public: using ExtensionArray::ExtensionArray; diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 72a481660e6..fb4f6bd8ead 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -27,8 +27,10 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_primitive.h" -#include "arrow/array/builder_time.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" #include "arrow/ipc/test_common.h" +#include "arrow/ipc/writer.h" #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" @@ -242,11 +244,11 @@ Status MakeRandomBooleanArray(const int length, bool include_nulls, std::shared_ptr* out) { std::vector values(length); random_null_bytes(length, 0.5, values.data()); - ARROW_ASSIGN_OR_RAISE(auto data, internal::BytesToBits(values)); + ARROW_ASSIGN_OR_RAISE(auto data, arrow::internal::BytesToBits(values)); if (include_nulls) { std::vector valid_bytes(length); - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(valid_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, arrow::internal::BytesToBits(valid_bytes)); random_null_bytes(length, 0.1, valid_bytes.data()); *out = std::make_shared(length, data, null_bitmap, -1); } else { @@ -596,7 +598,7 @@ Status MakeStruct(std::shared_ptr* out) { std::shared_ptr no_nulls(new StructArray(type, list_batch->num_rows(), columns)); std::vector null_bytes(list_batch->num_rows(), 1); null_bytes[0] = 0; - ARROW_ASSIGN_OR_RAISE(auto null_bitmap, internal::BytesToBits(null_bytes)); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, arrow::internal::BytesToBits(null_bytes)); std::shared_ptr with_nulls( new StructArray(type, list_batch->num_rows(), columns, null_bitmap, 1)); @@ -1176,12 +1178,13 @@ enable_if_t::value, void> FillRandomData( Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed) { - const auto& element_type = internal::checked_cast(*type); + const auto& element_type = arrow::internal::checked_cast(*type); std::vector strides; if (row_major_p) { - RETURN_NOT_OK(internal::ComputeRowMajorStrides(element_type, shape, &strides)); + RETURN_NOT_OK(arrow::internal::ComputeRowMajorStrides(element_type, shape, &strides)); } else { - RETURN_NOT_OK(internal::ComputeColumnMajorStrides(element_type, shape, &strides)); + RETURN_NOT_OK( + arrow::internal::ComputeColumnMajorStrides(element_type, shape, &strides)); } const int64_t element_size = element_type.bit_width() / CHAR_BIT; @@ -1233,6 +1236,20 @@ Status MakeRandomTensor(const std::shared_ptr& type, return Tensor::Make(type, buf, shape, strides).Value(out); } +void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out) { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(out)); +} + } // namespace test } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index db8613cbb1e..9b7e7f13e3a 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -184,6 +184,9 @@ Status MakeRandomTensor(const std::shared_ptr& type, const std::vector& shape, bool row_major_p, std::shared_ptr* out, uint32_t seed = 0); +ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr& batch, + std::shared_ptr* out); + } // namespace test } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/testing/gtest_util.cc b/cpp/src/arrow/testing/gtest_util.cc index f896f52c15e..ae2e53b30a3 100644 --- a/cpp/src/arrow/testing/gtest_util.cc +++ b/cpp/src/arrow/testing/gtest_util.cc @@ -590,20 +590,6 @@ void ApproxCompareBatch(const RecordBatch& left, const RecordBatch& right, }); } -void RoundtripBatch(const std::shared_ptr& batch, - std::shared_ptr* out) { - ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); - ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), - out_stream.get())); - - ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); - - io::BufferReader reader(complete_ipc_stream); - std::shared_ptr batch_reader; - ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); - ASSERT_OK(batch_reader->ReadNext(out)); -} - std::shared_ptr TweakValidityBit(const std::shared_ptr& array, int64_t index, bool validity) { auto data = array->data()->Copy(); diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index e77694d12a0..85b4c1f1f01 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -309,9 +309,6 @@ ARROW_TESTING_EXPORT void ApproxCompareBatch( const RecordBatch& left, const RecordBatch& right, bool compare_metadata = true, const EqualOptions& options = TestingEqualOptions()); -ARROW_TESTING_EXPORT void RoundtripBatch(const std::shared_ptr& batch, - std::shared_ptr* out); - // Check if the padding of the buffers of the array is zero. // Also cause valgrind warnings if the padding bytes are uninitialized. ARROW_TESTING_EXPORT void AssertZeroPadded(const Array& array); diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9dd74e769de..c2346750a19 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2865,17 +2865,14 @@ cdef extern from "arrow/extension_type.h" namespace "arrow": shared_ptr[CArray] storage() -cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension": +cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil: cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType): @staticmethod CResult[shared_ptr[CDataType]] Make() - CResult[shared_ptr[CDataType]] Deserialize(const c_string& serialized_data) const - - c_string Serialize() const - - const shared_ptr[CDataType] value_type() + cdef cppclass CUuidArray" arrow::extension::UuidArray"(CExtensionArray): + pass cdef extern from "arrow/extension/fixed_shape_tensor.h" namespace "arrow::extension" nogil: diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi index 83d9e9e79b6..d3e2ff2e99d 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -120,15 +120,16 @@ cdef api object pyarrow_wrap_data_type( elif type.get().id() == _Type_EXTENSION: ext_type = type.get() cpy_ext_type = dynamic_cast[_CPyExtensionTypePtr](ext_type) + extension_name = ext_type.extension_name() if cpy_ext_type != nullptr: return cpy_ext_type.GetInstance() - elif ext_type.extension_name() == b"arrow.bool8": + elif extension_name == b"arrow.bool8": out = Bool8Type.__new__(Bool8Type) - elif ext_type.extension_name() == b"arrow.fixed_shape_tensor": + elif extension_name == b"arrow.fixed_shape_tensor": out = FixedShapeTensorType.__new__(FixedShapeTensorType) - elif ext_type.extension_name() == b"arrow.opaque": + elif extension_name == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) - elif ext_type.extension_name() == b"arrow.uuid": + elif extension_name == b"arrow.uuid": out = UuidType.__new__(UuidType) else: out = BaseExtensionType.__new__(BaseExtensionType) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 56ef9c42722..aacbd2cb6e7 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -269,14 +269,14 @@ def test_ext_type_repr(): assert repr(ty) == "IntegerType(DataType(int64))" -def test_ext_type__lifetime(): +def test_ext_type_lifetime(): ty = ExampleUuidType() wr = weakref.ref(ty) del ty assert wr() is None -def test_ext_type__storage_type(): +def test_ext_type_storage_type(): ty = ExampleUuidType() assert ty.storage_type == pa.binary(16) assert ty.__class__ is ExampleUuidType @@ -354,6 +354,16 @@ def test_uuid_type_pickle(pickle_module): del ty assert wr() is None + for proto in range(0, pickle_module.HIGHEST_PROTOCOL + 1): + ty = pa.uuid() + ser = pickle_module.dumps(ty, protocol=proto) + del ty + ty = pickle_module.loads(ser) + wr = weakref.ref(ty) + assert ty.extension_name == "arrow.uuid" + del ty + assert wr() is None + def test_ext_type_equality(): a = ParamExtType(5) diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 8087cdfcfa3..f83ecc3aa43 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1774,24 +1774,11 @@ cdef class UuidType(BaseExtensionType): BaseExtensionType.init(self, type) self.uuid_ext_type = type.get() - def __arrow_ext_serialize__(self): - """ - Serialized representation of metadata to reconstruct the type object. - """ - return self.uuid_ext_type.Serialize() - - @classmethod - def __arrow_ext_deserialize__(self, storage_type, serialized): - """ - Return an UuidType instance from the storage type. - """ - return self.uuid_ext_type.Deserialize(storage_type, serialized) - def __arrow_ext_class__(self): return UuidArray def __reduce__(self): - return uuid, (self.value_type,) + return uuid, () def __arrow_ext_scalar_class__(self): return UuidScalar