-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-43807: [C++][Python] Add UUID extension type conversion support to/from Parquet #45866
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a365b61
4cee020
3d1a4ec
dc696cf
300f5eb
af8227c
bd9c9a9
f187586
d11fcd8
f7a965e
b4d0730
2193fd4
a81d5c1
62f92b3
d1b37e8
4bd607b
7529947
807a48f
ced4d6b
d2d98f2
f3894b8
a101c8b
4c66a75
234475e
8e8d5ee
26afc40
4b63ec7
b48f7f5
555ed75
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ | |
| #include <vector> | ||
|
|
||
| #include "arrow/extension/json.h" | ||
| #include "arrow/extension/uuid.h" | ||
| #include "arrow/extension_type.h" | ||
| #include "arrow/io/memory.h" | ||
| #include "arrow/ipc/api.h" | ||
|
|
@@ -454,12 +455,18 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field, | |
| type = ParquetType::BYTE_ARRAY; | ||
| logical_type = LogicalType::JSON(); | ||
| break; | ||
| } else if (ext_type->extension_name() == std::string("arrow.uuid")) { | ||
| type = ParquetType::FIXED_LEN_BYTE_ARRAY; | ||
| logical_type = LogicalType::UUID(); | ||
| length = 16; | ||
| break; | ||
| } else if (ext_type->extension_name() == std::string("parquet.variant")) { | ||
| auto variant_type = std::static_pointer_cast<VariantExtensionType>(field->type()); | ||
|
|
||
| return VariantToNode(variant_type, name, field->nullable(), field_id, properties, | ||
| arrow_properties, out); | ||
| } | ||
|
|
||
| std::shared_ptr<::arrow::Field> storage_field = ::arrow::field( | ||
| name, ext_type->storage_type(), field->nullable(), field->metadata()); | ||
| return FieldToNode(name, storage_field, properties, arrow_properties, out); | ||
|
|
@@ -1052,60 +1059,62 @@ Result<bool> ApplyOriginalMetadata(const Field& origin_field, SchemaField* infer | |
| bool modified = false; | ||
|
|
||
| auto& origin_type = origin_field.type(); | ||
| const auto& inferred_type = inferred->field->type(); | ||
|
|
||
| // The origin was an extension type. This occurs when the ARROW:extension:name field | ||
| // was present when the schema was written and that extension is registered when | ||
| // the schema is read. | ||
| if (origin_type->id() == ::arrow::Type::EXTENSION) { | ||
| const auto& ex_type = checked_cast<const ::arrow::ExtensionType&>(*origin_type); | ||
| if (inferred_type->id() != ::arrow::Type::EXTENSION && | ||
| ex_type.extension_name() == std::string("arrow.json") && | ||
| ::arrow::extension::JsonExtensionType::IsSupportedStorageType( | ||
| inferred_type->id())) { | ||
| // Schema mismatch. | ||
| // | ||
| // Arrow extensions are DISABLED in Parquet. | ||
| // origin_type is ::arrow::extension::json() | ||
| // inferred_type is ::arrow::utf8() | ||
| // | ||
| // Origin type is restored as Arrow should be considered the source of truth. | ||
| inferred->field = inferred->field->WithType(origin_type); | ||
| RETURN_NOT_OK(ApplyOriginalStorageMetadata(origin_field, inferred)); | ||
| } else if (inferred_type->id() == ::arrow::Type::EXTENSION && | ||
| ex_type.extension_name() == std::string("arrow.json")) { | ||
| // Potential schema mismatch. | ||
| // | ||
| // Arrow extensions are ENABLED in Parquet. | ||
| // origin_type is arrow::extension::json(...) | ||
| // inferred_type is arrow::extension::json(arrow::utf8()) | ||
| auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); | ||
|
|
||
| // Apply metadata recursively to storage type | ||
| RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); | ||
| inferred->field = inferred->field->WithType(origin_type); | ||
| } else if (inferred_type->id() == ::arrow::Type::EXTENSION && | ||
| ex_type.extension_name() == std::string("parquet.variant")) { | ||
| // Potential schema mismatch. | ||
| // | ||
| // Arrow extensions are ENABLED in Parquet. | ||
| // origin_type is parquet::arrow::variant(...) | ||
| // inferred_type is | ||
| // parquet::arrow::variant(struct(arrow::binary(),arrow::binary())) | ||
| auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); | ||
|
|
||
| // Apply metadata recursively to storage type | ||
| RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); | ||
| inferred->field = inferred->field->WithType(origin_type); | ||
| const auto& origin_extension_type = | ||
| checked_cast<const ::arrow::ExtensionType&>(*origin_type); | ||
|
|
||
| // (Recursively) Apply the original storage metadata from the original storage field | ||
| // This applies extension types to child elements, if any. | ||
| auto origin_storage_field = | ||
| origin_field.WithType(origin_extension_type.storage_type()); | ||
| RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); | ||
|
|
||
| // Use the inferred type after child updates for below checks to see if | ||
| // we can restore an extension type on the output. | ||
| const auto& inferred_type = inferred->field->type(); | ||
|
|
||
| // Whether or not the inferred type is also an extension type. This can occur when | ||
| // arrow_extensions_enabled is true in the ArrowReaderProperties. Extension types | ||
| // are not currently inferred for any other reason. | ||
| bool arrow_extension_inferred = inferred_type->id() == ::arrow::Type::EXTENSION; | ||
|
|
||
| // Check if the inferred storage type is compatible with the extension type | ||
| // we're hoping to apply. We assume that if an extension type was inferred | ||
| // that it was constructed with a valid storage type. Otherwise, we check with | ||
| // extension types that we know about for valid storage, falling back to | ||
| // storage type equality for extension types that we don't know about. | ||
| std::string origin_extension_name = origin_extension_type.extension_name(); | ||
| bool extension_supports_inferred_storage; | ||
|
|
||
| if (origin_extension_name == "arrow.json") { | ||
| extension_supports_inferred_storage = | ||
| arrow_extension_inferred || | ||
| ::arrow::extension::JsonExtensionType::IsSupportedStorageType( | ||
| inferred_type->id()); | ||
| } else if (origin_extension_name == "arrow.uuid") { | ||
| extension_supports_inferred_storage = | ||
| arrow_extension_inferred || | ||
| ::arrow::extension::UuidType::IsSupportedStorageType(inferred_type); | ||
| } else if (origin_extension_name == "parquet.variant") { | ||
| extension_supports_inferred_storage = | ||
| arrow_extension_inferred || | ||
| VariantExtensionType::IsSupportedStorageType(inferred_type); | ||
| } else { | ||
| auto origin_storage_field = origin_field.WithType(ex_type.storage_type()); | ||
|
|
||
| // Apply metadata recursively to storage type | ||
| RETURN_NOT_OK(ApplyOriginalStorageMetadata(*origin_storage_field, inferred)); | ||
| extension_supports_inferred_storage = | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similar to the above comment, do we need to check this? It will go to the else branch at line 1108 and the same check is performed there too. |
||
| origin_extension_type.storage_type()->Equals(*inferred_type); | ||
| } | ||
|
|
||
| // Restore extension type, if the storage type is the same as inferred | ||
| // from the Parquet type | ||
| if (ex_type.storage_type()->Equals(*inferred->field->type())) { | ||
| inferred->field = inferred->field->WithType(origin_type); | ||
| } | ||
| // If the origin extension of the metadata we are about to apply supports | ||
| // the Arrow storage type we would otherwise return, we restore the extension | ||
| // type to the output. | ||
| if (extension_supports_inferred_storage) { | ||
| inferred->field = inferred->field->WithType(origin_type); | ||
| } | ||
|
|
||
| modified = true; | ||
| } else { | ||
| ARROW_ASSIGN_OR_RAISE(modified, ApplyOriginalStorageMetadata(origin_field, inferred)); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
| #include "parquet/arrow/schema_internal.h" | ||
|
|
||
| #include "arrow/extension/json.h" | ||
| #include "arrow/extension/uuid.h" | ||
| #include "arrow/type.h" | ||
|
|
||
| #include "parquet/properties.h" | ||
|
|
@@ -134,20 +135,25 @@ Result<std::shared_ptr<ArrowType>> FromByteArray( | |
| } | ||
| } | ||
|
|
||
| Result<std::shared_ptr<ArrowType>> FromFLBA(const LogicalType& logical_type, | ||
| int32_t physical_length) { | ||
| Result<std::shared_ptr<ArrowType>> FromFLBA( | ||
| const LogicalType& logical_type, int32_t physical_length, | ||
| const ArrowReaderProperties& reader_properties) { | ||
| switch (logical_type.type()) { | ||
| case LogicalType::Type::DECIMAL: | ||
| return MakeArrowDecimal(logical_type); | ||
| case LogicalType::Type::FLOAT16: | ||
| return ::arrow::float16(); | ||
| case LogicalType::Type::NONE: | ||
| case LogicalType::Type::INTERVAL: | ||
| return ::arrow::fixed_size_binary(physical_length); | ||
| case LogicalType::Type::UUID: | ||
| if (physical_length == 16 && reader_properties.get_arrow_extensions_enabled()) { | ||
| return ::arrow::extension::uuid(); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to check |
||
| } | ||
|
|
||
| return ::arrow::fixed_size_binary(physical_length); | ||
| default: | ||
| return Status::NotImplemented("Unhandled logical logical_type ", | ||
| logical_type.ToString(), | ||
| return Status::NotImplemented("Unhandled logical_type ", logical_type.ToString(), | ||
| " for fixed-length binary array"); | ||
| } | ||
| } | ||
|
|
@@ -216,7 +222,7 @@ Result<std::shared_ptr<ArrowType>> GetArrowType( | |
| case ParquetType::BYTE_ARRAY: | ||
| return FromByteArray(logical_type, reader_properties); | ||
| case ParquetType::FIXED_LEN_BYTE_ARRAY: | ||
| return FromFLBA(logical_type, type_length); | ||
| return FromFLBA(logical_type, type_length, reader_properties); | ||
| default: { | ||
| // PARQUET-1565: This can occur if the file is corrupt | ||
| return Status::IOError("Invalid physical column type: ", | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.