-
Notifications
You must be signed in to change notification settings - Fork 3k
AWS: Implement SupportsRecoveryOperations for S3FileIO #10721
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,8 +20,10 @@ | |
|
|
||
| import java.util.Arrays; | ||
| import java.util.Collection; | ||
| import java.util.Comparator; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Optional; | ||
| import java.util.Set; | ||
| import java.util.concurrent.ExecutionException; | ||
| import java.util.concurrent.ExecutorService; | ||
|
|
@@ -37,6 +39,7 @@ | |
| import org.apache.iceberg.io.FileInfo; | ||
| import org.apache.iceberg.io.InputFile; | ||
| import org.apache.iceberg.io.OutputFile; | ||
| import org.apache.iceberg.io.SupportsRecoveryOperations; | ||
| import org.apache.iceberg.metrics.MetricsContext; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Joiner; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
|
|
@@ -52,6 +55,7 @@ | |
| import org.apache.iceberg.util.ThreadPools; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
| import software.amazon.awssdk.core.exception.SdkException; | ||
| import software.amazon.awssdk.services.s3.S3Client; | ||
| import software.amazon.awssdk.services.s3.model.Delete; | ||
| import software.amazon.awssdk.services.s3.model.DeleteObjectRequest; | ||
|
|
@@ -61,10 +65,12 @@ | |
| import software.amazon.awssdk.services.s3.model.GetObjectTaggingResponse; | ||
| import software.amazon.awssdk.services.s3.model.ListObjectsV2Request; | ||
| import software.amazon.awssdk.services.s3.model.ObjectIdentifier; | ||
| import software.amazon.awssdk.services.s3.model.ObjectVersion; | ||
| import software.amazon.awssdk.services.s3.model.PutObjectTaggingRequest; | ||
| import software.amazon.awssdk.services.s3.model.S3Exception; | ||
| import software.amazon.awssdk.services.s3.model.Tag; | ||
| import software.amazon.awssdk.services.s3.model.Tagging; | ||
| import software.amazon.awssdk.services.s3.paginators.ListObjectVersionsIterable; | ||
|
|
||
| /** | ||
| * FileIO implementation backed by S3. | ||
|
|
@@ -73,7 +79,7 @@ | |
| * schemes s3a, s3n, https are also treated as s3 file paths. Using this FileIO with other schemes | ||
| * will result in {@link org.apache.iceberg.exceptions.ValidationException}. | ||
| */ | ||
| public class S3FileIO implements CredentialSupplier, DelegateFileIO { | ||
| public class S3FileIO implements CredentialSupplier, DelegateFileIO, SupportsRecoveryOperations { | ||
| private static final Logger LOG = LoggerFactory.getLogger(S3FileIO.class); | ||
| private static final String DEFAULT_METRICS_IMPL = | ||
| "org.apache.iceberg.hadoop.HadoopMetricsContext"; | ||
|
|
@@ -420,4 +426,46 @@ protected void finalize() throws Throwable { | |
| } | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public boolean recoverFile(String path) { | ||
| S3URI location = new S3URI(path, s3FileIOProperties.bucketToAccessPointMapping()); | ||
| ListObjectVersionsIterable response = | ||
| client() | ||
| .listObjectVersionsPaginator( | ||
| builder -> builder.bucket(location.bucket()).prefix(location.key())); | ||
|
|
||
| // Recover to the last modified version, not isLatest, | ||
| // since isLatest is true for deletion markers. | ||
| Optional<ObjectVersion> recoverVersion = | ||
| response.versions().stream().max(Comparator.comparing(ObjectVersion::lastModified)); | ||
|
|
||
| return recoverVersion.map(version -> recoverObject(version, location.bucket())).orElse(false); | ||
| } | ||
|
|
||
| private boolean recoverObject(ObjectVersion version, String bucket) { | ||
| if (version.isLatest()) { | ||
| return true; | ||
| } | ||
|
Comment on lines
+447
to
+449
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. [optional] we can have an integ test trying to restore an existing object doesn't leads to creating a new version of obj
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missed this, sure I can do a follow on to verify that we skip the recovery if the latest version is not a deletion marker. |
||
|
|
||
| LOG.info("Attempting to recover object {}", version.key()); | ||
| try { | ||
| // Perform a copy instead of deleting the delete marker | ||
| // so that recovery does not rely on delete permissions | ||
| client() | ||
| .copyObject( | ||
| builder -> | ||
| builder | ||
| .sourceBucket(bucket) | ||
| .sourceKey(version.key()) | ||
| .sourceVersionId(version.versionId()) | ||
| .destinationBucket(bucket) | ||
| .destinationKey(version.key())); | ||
| } catch (SdkException e) { | ||
| LOG.warn("Failed to recover object {}", version.key(), e); | ||
| return false; | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The simplest way for integration tests seemed to be just to enable bucket versioning for the bucket and then on cleanup, ensure deletion of every object version to make sure no garbage is left over.