-
Notifications
You must be signed in to change notification settings - Fork 3k
AWS: support force register table in GlueCatalog #6742
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
73ecc5d
4ac803c
3509f3a
3c25e64
8e5349b
043b8a6
4bdb165
b38a928
dd9fdc4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -187,6 +187,18 @@ public class AwsProperties implements Serializable { | |
| */ | ||
| public static final String GLUE_CATALOG_ENDPOINT = "glue.endpoint"; | ||
|
|
||
| /** | ||
| * If set, Glue will always update the catalog table if the table already exists in the Glue | ||
| * catalog. By default, the Glue catalog can only create a new table and will throw | ||
| * AlreadyExistsException when registering an existing table name. | ||
| */ | ||
| public static final String GLUE_CATALOG_FORCE_REGISTER_TABLE = "glue.force-register-table"; | ||
|
|
||
| public static final boolean GLUE_CATALOG_FORCE_REGISTER_TABLE_DEFAULT = false; | ||
|
|
||
| /** Configure the Glue Catalog S3 FileIO Region to allow cross region s3 access */ | ||
| public static final String GLUE_CATALOG_FILE_IO_REGION = "glue.catalog-file-io-region"; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So we already have |
||
|
|
||
| /** | ||
| * Number of threads to use for uploading parts to S3 (shared pool across all output streams), | ||
| * default to {@link Runtime#availableProcessors()} | ||
|
|
@@ -911,6 +923,8 @@ public class AwsProperties implements Serializable { | |
| private boolean glueCatalogSkipArchive; | ||
| private boolean glueCatalogSkipNameValidation; | ||
| private boolean glueLakeFormationEnabled; | ||
| private boolean glueCatalogForceRegisterTable; | ||
| private String glueCatalogFileIORegion; | ||
|
|
||
| private String dynamoDbTableName; | ||
| private String dynamoDbEndpoint; | ||
|
|
@@ -970,6 +984,8 @@ public AwsProperties() { | |
| this.glueCatalogSkipArchive = GLUE_CATALOG_SKIP_ARCHIVE_DEFAULT; | ||
| this.glueCatalogSkipNameValidation = GLUE_CATALOG_SKIP_NAME_VALIDATION_DEFAULT; | ||
| this.glueLakeFormationEnabled = GLUE_LAKEFORMATION_ENABLED_DEFAULT; | ||
| this.glueCatalogForceRegisterTable = GLUE_CATALOG_FORCE_REGISTER_TABLE_DEFAULT; | ||
| this.glueCatalogFileIORegion = null; | ||
|
|
||
| this.dynamoDbEndpoint = null; | ||
| this.dynamoDbTableName = DYNAMODB_TABLE_NAME_DEFAULT; | ||
|
|
@@ -1030,6 +1046,13 @@ public AwsProperties(Map<String, String> properties) { | |
| this.glueLakeFormationEnabled = | ||
| PropertyUtil.propertyAsBoolean( | ||
| properties, GLUE_LAKEFORMATION_ENABLED, GLUE_LAKEFORMATION_ENABLED_DEFAULT); | ||
| this.glueCatalogForceRegisterTable = | ||
| PropertyUtil.propertyAsBoolean( | ||
| properties, | ||
| GLUE_CATALOG_FORCE_REGISTER_TABLE, | ||
| GLUE_CATALOG_FORCE_REGISTER_TABLE_DEFAULT); | ||
| this.glueCatalogFileIORegion = properties.get(GLUE_CATALOG_FILE_IO_REGION); | ||
|
|
||
| this.s3FileIoMultipartUploadThreads = | ||
| PropertyUtil.propertyAsInt( | ||
| properties, | ||
|
|
@@ -1252,6 +1275,24 @@ public void setGlueLakeFormationEnabled(boolean glueLakeFormationEnabled) { | |
| this.glueLakeFormationEnabled = glueLakeFormationEnabled; | ||
| } | ||
|
|
||
|
|
||
| public boolean glueCatalogForceRegisterTable() { | ||
| return glueCatalogForceRegisterTable; | ||
| } | ||
|
|
||
| public void setGlueCatalogForceRegisterTable(boolean glueCatalogForceRegisterTable) { | ||
| this.glueCatalogForceRegisterTable = glueCatalogForceRegisterTable; | ||
| } | ||
|
|
||
| public String getGlueCatalogFileIORegion() { | ||
| return glueCatalogFileIORegion; | ||
| } | ||
|
|
||
| public void setGlueCatalogFileIORegion(String glueCatalogFileIORegion) { | ||
| this.glueCatalogFileIORegion = glueCatalogFileIORegion; | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * @deprecated will be removed in 1.4.0, use {@link org.apache.iceberg.aws.s3.S3FileIOProperties} | ||
| * instead | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,7 +20,9 @@ | |
|
|
||
| import java.io.Closeable; | ||
| import java.io.IOException; | ||
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Locale; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.stream.Collectors; | ||
|
|
@@ -31,7 +33,9 @@ | |
| import org.apache.iceberg.CatalogUtil; | ||
| import org.apache.iceberg.LockManager; | ||
| import org.apache.iceberg.TableMetadata; | ||
| import org.apache.iceberg.TableMetadataParser; | ||
| import org.apache.iceberg.TableOperations; | ||
| import org.apache.iceberg.aws.AssumeRoleAwsClientFactory; | ||
| import org.apache.iceberg.aws.AwsClientFactories; | ||
| import org.apache.iceberg.aws.AwsClientFactory; | ||
| import org.apache.iceberg.aws.AwsProperties; | ||
|
|
@@ -49,6 +53,7 @@ | |
| import org.apache.iceberg.hadoop.Configurable; | ||
| import org.apache.iceberg.io.CloseableGroup; | ||
| import org.apache.iceberg.io.FileIO; | ||
| import org.apache.iceberg.io.InputFile; | ||
| import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; | ||
|
|
@@ -78,6 +83,7 @@ | |
| import software.amazon.awssdk.services.glue.model.Table; | ||
| import software.amazon.awssdk.services.glue.model.TableInput; | ||
| import software.amazon.awssdk.services.glue.model.UpdateDatabaseRequest; | ||
| import software.amazon.awssdk.services.glue.model.UpdateTableRequest; | ||
|
|
||
| public class GlueCatalog extends BaseMetastoreCatalog | ||
| implements Closeable, SupportsNamespaces, Configurable<Configuration> { | ||
|
|
@@ -111,7 +117,8 @@ public GlueCatalog() {} | |
|
|
||
| @Override | ||
| public void initialize(String name, Map<String, String> properties) { | ||
| this.catalogProperties = ImmutableMap.copyOf(properties); | ||
| this.catalogProperties = new HashMap<>(); | ||
| catalogProperties.putAll(properties); | ||
|
Comment on lines
-114
to
+121
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Making CatalogProperties non-immutable in order to effectively side-load information into the AWS client factory post-initialization is a dangerous precedent to set. In addition, it doesn't seem to actually accomplish anything besides allowing the GlueCatalog to suddenly switch regions post-initialization, which is likely to introduce some dangerous side effects. |
||
| AwsClientFactory awsClientFactory; | ||
| FileIO catalogFileIO; | ||
| if (PropertyUtil.propertyAsBoolean( | ||
|
|
@@ -437,6 +444,81 @@ public void renameTable(TableIdentifier from, TableIdentifier to) { | |
| LOG.info("Successfully renamed table from {} to {}", from, to); | ||
| } | ||
|
|
||
| @Override | ||
| public org.apache.iceberg.Table registerTable( | ||
jackye1995 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| TableIdentifier identifier, String metadataFileLocation) { | ||
| Preconditions.checkArgument( | ||
| isValidIdentifier(identifier), "Table identifier to register is invalid: " + identifier); | ||
| Preconditions.checkArgument( | ||
| metadataFileLocation != null && !metadataFileLocation.isEmpty(), | ||
| "Cannot register an empty metadata file location as a table"); | ||
|
|
||
| // keep the original behavior when force-register-table flag is off | ||
| if (!awsProperties.glueCatalogForceRegisterTable()) { | ||
| return super.registerTable(identifier, metadataFileLocation); | ||
| } | ||
|
|
||
| String factoryImpl = | ||
| PropertyUtil.propertyAsString(catalogProperties, AwsProperties.CLIENT_FACTORY, null); | ||
| if (factoryImpl != null && factoryImpl.equals(AssumeRoleAwsClientFactory.class.getName())) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It hurts extensibility to make logic specific to a particular implementation class. If, for example, a customer needs to extend |
||
| // overwrite client assume_role_region for file IO to make cross region call | ||
| String catalogFileIORegion = awsProperties.getGlueCatalogFileIORegion(); | ||
| if (catalogFileIORegion != null) { | ||
| catalogProperties.put(AwsProperties.CLIENT_ASSUME_ROLE_REGION, catalogFileIORegion); | ||
| } | ||
|
Comment on lines
+465
to
+468
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't this logic and the associated parameter be removed and replaced with just setting |
||
| } | ||
|
|
||
| TableOperations ops = newTableOps(identifier); | ||
| InputFile metadataFile = ops.io().newInputFile(metadataFileLocation); | ||
| TableMetadata metadata = TableMetadataParser.read(ops.io(), metadataFile); | ||
|
|
||
| Map<String, String> tableParameters = | ||
| ImmutableMap.of( | ||
| BaseMetastoreTableOperations.TABLE_TYPE_PROP, | ||
| BaseMetastoreTableOperations.ICEBERG_TABLE_TYPE_VALUE.toLowerCase(Locale.ENGLISH), | ||
| BaseMetastoreTableOperations.METADATA_LOCATION_PROP, | ||
| metadataFileLocation); | ||
|
|
||
| String databaseName = | ||
| IcebergToGlueConverter.getDatabaseName( | ||
| identifier, awsProperties.glueCatalogSkipNameValidation()); | ||
| String tableName = | ||
| IcebergToGlueConverter.getTableName( | ||
| identifier, awsProperties.glueCatalogSkipNameValidation()); | ||
|
|
||
| TableInput tableInput = | ||
| TableInput.builder() | ||
| .applyMutation( | ||
| builder -> | ||
| IcebergToGlueConverter.setTableInputInformation( | ||
| builder, metadata, tableParameters)) | ||
| .name(tableName) | ||
| .tableType(GlueTableOperations.GLUE_EXTERNAL_TABLE_TYPE) | ||
| .parameters(tableParameters) | ||
| .build(); | ||
|
|
||
| try { | ||
| glue.createTable( | ||
| CreateTableRequest.builder().databaseName(databaseName).tableInput(tableInput).build()); | ||
| } catch (software.amazon.awssdk.services.glue.model.AlreadyExistsException e) { | ||
| GetTableResponse response = | ||
| glue.getTable( | ||
| GetTableRequest.builder().databaseName(databaseName).name(tableName).build()); | ||
| String versionId = response.table().versionId(); | ||
| glue.updateTable( | ||
| UpdateTableRequest.builder() | ||
| .databaseName(databaseName) | ||
| .tableInput(tableInput) | ||
| .versionId(versionId) | ||
| .build()); | ||
| } catch (EntityNotFoundException e) { | ||
|
Comment on lines
+471
to
+514
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So this logic is a combination of a fork of the logic in If the concern is the creation of an extra metadata file, it looks like |
||
| throw new NoSuchNamespaceException( | ||
| e, "Namespace %s is not found in Glue", identifier.namespace()); | ||
|
Comment on lines
+515
to
+516
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The exception handling seems off; can't EntityNotFoundException also be thrown when the table is not found?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This exception handling is meant for catching exceptions for |
||
| } | ||
|
|
||
| return loadTable(identifier); | ||
| } | ||
|
|
||
| @Override | ||
| public void createNamespace(Namespace namespace, Map<String, String> metadata) { | ||
| try { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this assertion removed? It looks like it's for rename
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This assertion was removed because of the logic change in renameTable, which allows the renamed table to use the previous table's Iceberg properties (metadata location); with that change, the related integration test would always fail at this assertion.