diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index fd601b673c4..4e5a825969b 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -132,6 +132,8 @@ parquet::ArrowReaderProperties MakeArrowReaderProperties(
   arrow_properties.set_io_context(
       parquet_scan_options.arrow_reader_properties->io_context());
   arrow_properties.set_use_threads(options.use_threads);
+  arrow_properties.set_arrow_extensions_enabled(
+      parquet_scan_options.arrow_reader_properties->get_arrow_extensions_enabled());
   return arrow_properties;
 }
 
diff --git a/cpp/src/arrow/extension/uuid.cc b/cpp/src/arrow/extension/uuid.cc
index 2f36eb3e7d1..b24f6895d0c 100644
--- a/cpp/src/arrow/extension/uuid.cc
+++ b/cpp/src/arrow/extension/uuid.cc
@@ -40,7 +40,7 @@ Result<std::shared_ptr<DataType>> UuidType::Deserialize(
   if (!serialized.empty()) {
     return Status::Invalid("Unexpected serialized metadata: '", serialized, "'");
   }
-  if (!storage_type->Equals(*fixed_size_binary(16))) {
+  if (!IsSupportedStorageType(storage_type)) {
     return Status::Invalid("Invalid storage type for UuidType: ",
                            storage_type->ToString());
   }
@@ -55,4 +55,8 @@ std::string UuidType::ToString(bool show_metadata) const {
 
 std::shared_ptr<DataType> uuid() { return std::make_shared<UuidType>(); }
 
+bool UuidType::IsSupportedStorageType(const std::shared_ptr<DataType>& storage_type) {
+  return storage_type->Equals(*fixed_size_binary(16));
+}
+
 }  // namespace arrow::extension
diff --git a/cpp/src/arrow/extension/uuid.h b/cpp/src/arrow/extension/uuid.h
index 42bb21cf0b2..8c9660c463b 100644
--- a/cpp/src/arrow/extension/uuid.h
+++ b/cpp/src/arrow/extension/uuid.h
@@ -53,6 +53,8 @@ class ARROW_EXPORT UuidType : public ExtensionType {
 
   /// \brief Create a UuidType instance
   static Result<std::shared_ptr<DataType>> Make() { return std::make_shared<UuidType>(); }
+
+  static bool IsSupportedStorageType(const std::shared_ptr<DataType>& storage_type);
 };
 
 /// \brief Return a UuidType instance.
diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 0ccc541d1ae..66fd049f6a3 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -33,6 +33,7 @@
 #include "arrow/array.h"
 #include "arrow/extension/json.h"
+#include "arrow/extension/uuid.h"
 #include "arrow/ipc/writer.h"
 #include "arrow/testing/gtest_util.h"
 #include "arrow/type.h"
@@ -945,7 +946,7 @@ TEST_F(TestConvertParquetSchema, ParquetVariant) {
   }
 }
 
-TEST_F(TestConvertParquetSchema, ParquetSchemaArrowExtensions) {
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowJsonExtension) {
   std::vector<NodePtr> parquet_fields;
   parquet_fields.push_back(PrimitiveNode::Make(
       "json_1", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, ConvertedType::JSON));
@@ -1027,6 +1028,68 @@
   }
 }
 
+TEST_F(TestConvertParquetSchema, ParquetSchemaArrowUuidExtension) {
+  std::vector<NodePtr> parquet_fields;
+  parquet_fields.push_back(PrimitiveNode::Make("uuid", Repetition::OPTIONAL,
+                                               LogicalType::UUID(),
+                                               ParquetType::FIXED_LEN_BYTE_ARRAY, 16));
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // By default, the field should be treated as fixed_size_binary(16) in Arrow.
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("uuid", ::arrow::fixed_size_binary(16))});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file does not contain Arrow schema.
+    // If Arrow extensions are enabled, the field will be interpreted as a uuid()
+    // extension field.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    auto arrow_schema =
+        ::arrow::schema({::arrow::field("uuid", ::arrow::extension::uuid())});
+    std::shared_ptr<KeyValueMetadata> metadata{};
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // uuid will be interpreted as a uuid() field even though extensions are not
+    // enabled.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(false);
+    std::shared_ptr<const KeyValueMetadata> field_metadata =
+        ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+    auto arrow_schema = ::arrow::schema({::arrow::field(
+        "uuid", ::arrow::extension::uuid(), /*nullable=*/true, field_metadata)});
+
+    std::shared_ptr<KeyValueMetadata> metadata;
+    ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata));
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema, true /* check_metadata */);
+  }
+
+  {
+    // Parquet file contains Arrow schema.
+    // uuid will also be interpreted as a uuid() field when extensions *are* enabled.
+    ArrowReaderProperties props;
+    props.set_arrow_extensions_enabled(true);
+    std::shared_ptr<const KeyValueMetadata> field_metadata =
+        ::arrow::key_value_metadata({"foo", "bar"}, {"biz", "baz"});
+    auto arrow_schema = ::arrow::schema({::arrow::field(
+        "uuid", ::arrow::extension::uuid(), /*nullable=*/true, field_metadata)});
+
+    std::shared_ptr<KeyValueMetadata> metadata;
+    ASSERT_OK(ArrowSchemaToParquetMetadata(arrow_schema, metadata));
+    ASSERT_OK(ConvertSchema(parquet_fields, metadata, props));
+    CheckFlatSchema(arrow_schema, true /* check_metadata */);
+  }
+}
+
 class TestConvertArrowSchema : public ::testing::Test {
  public:
   virtual void SetUp() {}
diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc
index 646ec05d00d..6b27bb8f55c 100644
--- a/cpp/src/parquet/arrow/schema.cc
+++ b/cpp/src/parquet/arrow/schema.cc
@@ -22,6 +22,7 @@
 #include <vector>
 
 #include "arrow/extension/json.h"
+#include "arrow/extension/uuid.h"
 #include "arrow/extension_type.h"
 #include "arrow/io/memory.h"
 #include "arrow/ipc/api.h"
@@ -454,12 +455,18 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
         type = ParquetType::BYTE_ARRAY;
         logical_type = LogicalType::JSON();
         break;
+      } else if (ext_type->extension_name() == std::string("arrow.uuid")) {
+        type = ParquetType::FIXED_LEN_BYTE_ARRAY;
+        logical_type = LogicalType::UUID();
+        length = 16;
+        break;
       } else if (ext_type->extension_name() == std::string("parquet.variant")) {
         auto variant_type =
             std::static_pointer_cast<VariantExtensionType>(field->type());
         return VariantToNode(variant_type, name, field->nullable(), field_id,
                              properties, arrow_properties, out);
       }
+
       std::shared_ptr<::arrow::Field> storage_field = ::arrow::field(
           name, ext_type->storage_type(), field->nullable(), field->metadata());
       return FieldToNode(name, storage_field, properties, arrow_properties, out);
@@ -1052,60 +1059,62 @@ Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred) {
   bool modified = false;
 
   auto& origin_type = origin_field.type();
-  const auto& inferred_type = inferred->field->type();
 
+  // The origin was an extension type. This occurs when the ARROW:extension:name field
+  // was present when the schema was written and that extension is registered when
+  // the schema is read.
   if (origin_type->id() == ::arrow::Type::EXTENSION) {
-    const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*origin_type);
-    if (inferred_type->id() != ::arrow::Type::EXTENSION &&
-        ex_type.extension_name() == std::string("arrow.json") &&
-        ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
-            inferred_type->id())) {
-      // Schema mismatch.
-      //
-      // Arrow extensions are DISABLED in Parquet.
-      // origin_type is ::arrow::extension::json()
-      // inferred_type is ::arrow::utf8()
-      //
-      // Origin type is restored as Arrow should be considered the source of truth.
-      inferred->field = inferred->field->WithType(origin_type);
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred));
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("arrow.json")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is arrow::extension::json(...)
-      // inferred_type is arrow::extension::json(arrow::utf8())
-      auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred));
-      inferred->field = inferred->field->WithType(origin_type);
-    } else if (inferred_type->id() == ::arrow::Type::EXTENSION &&
-               ex_type.extension_name() == std::string("parquet.variant")) {
-      // Potential schema mismatch.
-      //
-      // Arrow extensions are ENABLED in Parquet.
-      // origin_type is parquet::arrow::variant(...)
-      // inferred_type is
-      //   parquet::arrow::variant(struct(arrow::binary(),arrow::binary()))
-      auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred));
-      inferred->field = inferred->field->WithType(origin_type);
+    const auto& origin_extension_type =
+        checked_cast<const ::arrow::ExtensionType&>(*origin_type);
+
+    // (Recursively) apply the original storage metadata from the original storage
+    // field. This applies extension types to child elements, if any.
+    auto origin_storage_field =
+        origin_field.WithType(origin_extension_type.storage_type());
+    RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred));
+
+    // Use the inferred type after child updates for the checks below to see if
+    // we can restore an extension type on the output.
+    const auto& inferred_type = inferred->field->type();
+
+    // Whether or not the inferred type is also an extension type. This can occur when
+    // arrow_extensions_enabled is true in the ArrowReaderProperties. Extension types
+    // are not currently inferred for any other reason.
+    bool arrow_extension_inferred = inferred_type->id() == ::arrow::Type::EXTENSION;
+
+    // Check if the inferred storage type is compatible with the extension type
+    // we're hoping to apply. We assume that if an extension type was inferred,
+    // it was constructed with a valid storage type. Otherwise, we check with
+    // extension types that we know about for valid storage, falling back to
+    // storage type equality for extension types that we don't know about.
+    std::string origin_extension_name = origin_extension_type.extension_name();
+    bool extension_supports_inferred_storage;
+
+    if (origin_extension_name == "arrow.json") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          ::arrow::extension::JsonExtensionType::IsSupportedStorageType(
+              inferred_type->id());
+    } else if (origin_extension_name == "arrow.uuid") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          ::arrow::extension::UuidType::IsSupportedStorageType(inferred_type);
+    } else if (origin_extension_name == "parquet.variant") {
+      extension_supports_inferred_storage =
+          arrow_extension_inferred ||
+          VariantExtensionType::IsSupportedStorageType(inferred_type);
     } else {
-      auto origin_storage_field = origin_field.WithType(ex_type.storage_type());
-
-      // Apply metadata recursively to storage type
-      RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred));
-
-      // Restore extension type, if the storage type is the same as inferred
-      // from the Parquet type
-      if (ex_type.storage_type()->Equals(*inferred->field->type())) {
-        inferred->field = inferred->field->WithType(origin_type);
-      }
+      extension_supports_inferred_storage =
+          origin_extension_type.storage_type()->Equals(*inferred_type);
     }
+
+    // If the origin extension of the metadata we are about to apply supports
+    // the Arrow storage type we would otherwise return, we restore the extension
+    // type to the output.
+    if (extension_supports_inferred_storage) {
+      inferred->field = inferred->field->WithType(origin_type);
+    }
+
     modified = true;
   } else {
     ARROW_ASSIGN_OR_RAISE(modified,
                           ApplyOriginalStorageMetadata(origin_field, inferred));
diff --git a/cpp/src/parquet/arrow/schema_internal.cc b/cpp/src/parquet/arrow/schema_internal.cc
index 004f42ea869..8ddac98ec3c 100644
--- a/cpp/src/parquet/arrow/schema_internal.cc
+++ b/cpp/src/parquet/arrow/schema_internal.cc
@@ -18,6 +18,7 @@
 #include "parquet/arrow/schema_internal.h"
 
 #include "arrow/extension/json.h"
+#include "arrow/extension/uuid.h"
 #include "arrow/type.h"
 
 #include "parquet/properties.h"
@@ -134,8 +135,9 @@ Result<std::shared_ptr<ArrowType>> FromByteArray(
   }
 }
 
-Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type,
-                                            int32_t physical_length) {
+Result<std::shared_ptr<ArrowType>> FromFLBA(
+    const LogicalType& logical_type, int32_t physical_length,
+    const ArrowReaderProperties& reader_properties) {
   switch (logical_type.type()) {
     case LogicalType::Type::DECIMAL:
       return MakeArrowDecimal(logical_type);
@@ -143,11 +145,15 @@ Result<std::shared_ptr<ArrowType>> FromFLBA(
       return ::arrow::float16();
     case LogicalType::Type::NONE:
     case LogicalType::Type::INTERVAL:
+      return ::arrow::fixed_size_binary(physical_length);
     case LogicalType::Type::UUID:
+      if (physical_length == 16 && reader_properties.get_arrow_extensions_enabled()) {
+        return ::arrow::extension::uuid();
+      }
+
       return ::arrow::fixed_size_binary(physical_length);
     default:
-      return Status::NotImplemented("Unhandled logical logical_type ",
-                                    logical_type.ToString(),
+      return Status::NotImplemented("Unhandled logical_type ", logical_type.ToString(),
                                     " for fixed-length binary array");
   }
 }
@@ -216,7 +222,7 @@ Result<std::shared_ptr<ArrowType>> GetArrowType(
     case ParquetType::BYTE_ARRAY:
       return FromByteArray(logical_type, reader_properties);
     case ParquetType::FIXED_LEN_BYTE_ARRAY:
-      return FromFLBA(logical_type, type_length);
+      return FromFLBA(logical_type, type_length, reader_properties);
     default: {
       // PARQUET-1565: This can occur if the file is corrupt
       return Status::IOError("Invalid physical column type: ",
diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index 45db755d903..107f17aca67 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -703,7 +703,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
     cache_options : pyarrow.CacheOptions, default None
         Cache options used when pre_buffer is enabled. The default values should
         be good for most use cases. You may want to adjust these for example if
-        you have exceptionally high latency to the file system. 
+        you have exceptionally high latency to the file system.
     thrift_string_size_limit : int, default None
         If not None, override the maximum total string size allocated
         when decoding Thrift structures. The default limit should be
@@ -720,6 +720,10 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
         Parquet file.
     page_checksum_verification : bool, default False
         If True, verify the page checksum for each page read from the file.
+    arrow_extensions_enabled : bool, default False
+        If True, read Parquet logical types as Arrow extension types where
+        possible (e.g., read JSON as the canonical `arrow.json` extension type
+        or UUID as the canonical `arrow.uuid` extension type).
     """
 
     # Avoid mistakingly creating attributes
@@ -733,7 +737,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
                   thrift_container_size_limit=None,
                   decryption_config=None,
                   decryption_properties=None,
-                  bint page_checksum_verification=False):
+                  bint page_checksum_verification=False,
+                  bint arrow_extensions_enabled=False):
         self.init(shared_ptr[CFragmentScanOptions](
             new CParquetFragmentScanOptions()))
         self.use_buffered_stream = use_buffered_stream
@@ -752,6 +757,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
         if decryption_properties is not None:
             self.decryption_properties = decryption_properties
         self.page_checksum_verification = page_checksum_verification
+        self.arrow_extensions_enabled = arrow_extensions_enabled
 
     cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp):
         FragmentScanOptions.init(self, sp)
@@ -868,6 +874,14 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
     def page_checksum_verification(self, bint page_checksum_verification):
         self.reader_properties().set_page_checksum_verification(page_checksum_verification)
 
+    @property
+    def arrow_extensions_enabled(self):
+        return self.arrow_reader_properties().get_arrow_extensions_enabled()
+
+    @arrow_extensions_enabled.setter
+    def arrow_extensions_enabled(self, bint arrow_extensions_enabled):
+        self.arrow_reader_properties().set_arrow_extensions_enabled(arrow_extensions_enabled)
+
     def equals(self, ParquetFragmentScanOptions other):
         """
         Parameters
@@ -881,11 +895,12 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
         attrs = (
             self.use_buffered_stream, self.buffer_size, self.pre_buffer,
             self.cache_options, self.thrift_string_size_limit,
             self.thrift_container_size_limit,
-            self.page_checksum_verification)
+            self.page_checksum_verification, self.arrow_extensions_enabled)
         other_attrs = (
             other.use_buffered_stream, other.buffer_size, other.pre_buffer,
             other.cache_options, other.thrift_string_size_limit,
-            other.thrift_container_size_limit, other.page_checksum_verification)
+            other.thrift_container_size_limit, other.page_checksum_verification,
+            other.arrow_extensions_enabled)
         return attrs == other_attrs
 
     @staticmethod
@@ -902,7 +917,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions):
             cache_options=self.cache_options,
            thrift_string_size_limit=self.thrift_string_size_limit,
            thrift_container_size_limit=self.thrift_container_size_limit,
-            page_checksum_verification=self.page_checksum_verification
+            page_checksum_verification=self.page_checksum_verification,
+            arrow_extensions_enabled=self.arrow_extensions_enabled
         )
         return ParquetFragmentScanOptions._reconstruct, (kwargs,)
 
diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index e6de9712f83..020a1220c6a 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -404,6 +404,8 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
         CCacheOptions cache_options() const
         void set_coerce_int96_timestamp_unit(TimeUnit unit)
         TimeUnit coerce_int96_timestamp_unit() const
+        void set_arrow_extensions_enabled(c_bool extensions_enabled)
+        c_bool get_arrow_extensions_enabled() const
 
     ArrowReaderProperties default_arrow_reader_properties()
 
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 55c2866243b..116a7ba3c5d 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -1452,7 +1452,8 @@ cdef class ParquetReader(_Weakrefable):
              FileDecryptionProperties decryption_properties=None,
              thrift_string_size_limit=None,
              thrift_container_size_limit=None,
-             page_checksum_verification=False):
+             page_checksum_verification=False,
+             arrow_extensions_enabled=False):
         """
         Open a parquet file for reading.
 
@@ -1469,6 +1470,7 @@ cdef class ParquetReader(_Weakrefable):
         thrift_string_size_limit : int, optional
         thrift_container_size_limit : int, optional
         page_checksum_verification : bool, default False
+        arrow_extensions_enabled : bool, default False
         """
         cdef:
             shared_ptr[CFileMetaData] c_metadata
@@ -1518,6 +1520,8 @@ cdef class ParquetReader(_Weakrefable):
             arrow_props.set_coerce_int96_timestamp_unit(
                 string_to_timeunit(coerce_int96_timestamp_unit))
 
+        arrow_props.set_arrow_extensions_enabled(arrow_extensions_enabled)
+
         self.source = source
         get_reader(source, use_memory_map, &self.rd_handle)
 
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 8d3dec96a6b..71a27d4be7f 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -254,6 +254,10 @@ class ParquetFile:
         it will be parsed as an URI to determine the filesystem.
     page_checksum_verification : bool, default False
         If True, verify the checksum for each page read from the file.
+    arrow_extensions_enabled : bool, default False
+        If True, read Parquet logical types as Arrow extension types where
+        possible (e.g., read JSON as the canonical `arrow.json` extension type
+        or UUID as the canonical `arrow.uuid` extension type).
 
     Examples
     --------
@@ -302,7 +306,7 @@ def __init__(self, source, *, metadata=None, common_metadata=None,
                  pre_buffer=False, coerce_int96_timestamp_unit=None,
                  decryption_properties=None, thrift_string_size_limit=None,
                  thrift_container_size_limit=None, filesystem=None,
-                 page_checksum_verification=False):
+                 page_checksum_verification=False, arrow_extensions_enabled=False):
 
         self._close_source = getattr(source, 'closed', True)
 
@@ -322,6 +326,7 @@ def __init__(self, source, *, metadata=None, common_metadata=None,
             thrift_string_size_limit=thrift_string_size_limit,
             thrift_container_size_limit=thrift_container_size_limit,
             page_checksum_verification=page_checksum_verification,
+            arrow_extensions_enabled=arrow_extensions_enabled,
         )
         self.common_metadata = common_metadata
         self._nested_paths_by_prefix = self._build_nested_paths()
@@ -1264,6 +1269,10 @@
         sufficient for most Parquet files.
     page_checksum_verification : bool, default False
         If True, verify the page checksum for each page read from the file.
+    arrow_extensions_enabled : bool, default False
+        If True, read Parquet logical types as Arrow extension types where
+        possible (e.g., read JSON as the canonical `arrow.json` extension type
+        or UUID as the canonical `arrow.uuid` extension type).
 
     Examples
     --------
@@ -1276,7 +1285,8 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None,
                  coerce_int96_timestamp_unit=None,
                  decryption_properties=None, thrift_string_size_limit=None,
                  thrift_container_size_limit=None,
-                 page_checksum_verification=False):
+                 page_checksum_verification=False,
+                 arrow_extensions_enabled=False):
 
         import pyarrow.dataset as ds
 
@@ -1287,6 +1297,7 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None,
             "thrift_string_size_limit": thrift_string_size_limit,
             "thrift_container_size_limit": thrift_container_size_limit,
             "page_checksum_verification": page_checksum_verification,
+            "arrow_extensions_enabled": arrow_extensions_enabled,
         }
         if buffer_size:
             read_options.update(use_buffered_stream=True,
@@ -1674,6 +1685,10 @@ def partitioning(self):
     sufficient for most Parquet files.
 page_checksum_verification : bool, default False
     If True, verify the checksum for each page read from the file.
+arrow_extensions_enabled : bool, default False
+    If True, read Parquet logical types as Arrow extension types where
+    possible (e.g., read JSON as the canonical `arrow.json` extension type
+    or UUID as the canonical `arrow.uuid` extension type).
 
 Returns
 -------
@@ -1768,7 +1783,8 @@ def read_table(source, *, columns=None, use_threads=True,
                pre_buffer=True, coerce_int96_timestamp_unit=None,
                decryption_properties=None, thrift_string_size_limit=None,
                thrift_container_size_limit=None,
-               page_checksum_verification=False):
+               page_checksum_verification=False,
+               arrow_extensions_enabled=False):
 
     try:
         dataset = ParquetDataset(
@@ -1787,6 +1803,7 @@ def read_table(source, *, columns=None, use_threads=True,
             thrift_string_size_limit=thrift_string_size_limit,
             thrift_container_size_limit=thrift_container_size_limit,
             page_checksum_verification=page_checksum_verification,
+            arrow_extensions_enabled=arrow_extensions_enabled,
         )
     except ImportError:
         # fall back on ParquetFile for simple cases when pyarrow.dataset
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index 12124acc5f4..e7922194918 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -520,7 +520,43 @@ def test_json_extension_type(storage_type):
 
     table = pa.table([arr], names=["ext"])
 
-    _simple_table_roundtrip(table)
+    # With defaults, this should roundtrip (because store_schema=True)
+    _check_roundtrip(table, table)
+
+    # When store_schema is False, we get a string back by default
+    _check_roundtrip(
+        table,
+        pa.table({"ext": pa.array(data, pa.string())}),
+        store_schema=False)
+
+    # With arrow_extensions_enabled=True on read, we get an arrow.json back
+    # (but with string() storage)
+    _check_roundtrip(
+        table,
+        pa.table({"ext": pa.array(data, pa.json_(pa.string()))}),
+        read_table_kwargs={"arrow_extensions_enabled": True},
+        store_schema=False)
+
+
+def test_uuid_extension_type():
+    data = [
+        b'\xe4`\xf9p\x83QGN\xac\x7f\xa4g>K\xa8\xcb',
+        b'\x1et\x14\x95\xee\xd5C\xea\x9b\xd7s\xdc\x91BK\xaf',
+        None
+    ]
+    arr = pa.array(data, type=pa.uuid())
+
+    table = pa.table([arr], names=["ext"])
+
+    _check_roundtrip(table, table)
+    _check_roundtrip(
+        table,
+        pa.table({"ext": pa.array(data, pa.binary(16))}),
+        store_schema=False)
+    _check_roundtrip(
+        table,
+        table,
+        read_table_kwargs={"arrow_extensions_enabled": True},
+        store_schema=False)
 
 
 def test_undefined_logical_type(parquet_test_datadir):
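
Usage note (illustrative, not part of the patch): the sketch below exercises the
new arrow_extensions_enabled read option added above, end to end. It assumes a
pyarrow build that includes this change; the file name and data are arbitrary.

    import pyarrow as pa
    import pyarrow.parquet as pq

    # A column of the canonical arrow.uuid extension type.
    table = pa.table({"id": pa.array([b"\x00" * 16, None], type=pa.uuid())})

    # Write without the serialized Arrow schema so that reading relies purely
    # on the Parquet UUID logical type.
    pq.write_table(table, "uuids.parquet", store_schema=False)

    # Default behavior: the UUID logical type maps to fixed_size_binary(16).
    print(pq.read_table("uuids.parquet").schema)

    # With the new flag, it maps to the arrow.uuid extension type instead.
    print(pq.read_table("uuids.parquet", arrow_extensions_enabled=True).schema)

When store_schema is left at its default (True), the extension type round-trips
regardless of the flag, because the serialized Arrow schema in the file metadata
is treated as the source of truth, as the ApplyOriginalMetadata changes above show.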