From ac9c7a2540ef2abcdc4b49b6ffbf40c801aaa3a7 Mon Sep 17 00:00:00 2001 From: Amogh Jahagirdar Date: Mon, 29 Aug 2022 09:27:25 -0700 Subject: [PATCH] Bug Fix for Expire Snapshots: Ancestors for cleanup of files should use the ancestors of the only existing branch Until reachability analysis for remove snapshots is implemented --- .../org/apache/iceberg/RemoveSnapshots.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/RemoveSnapshots.java b/core/src/main/java/org/apache/iceberg/RemoveSnapshots.java index b996822aaf03..bca257f8ea8d 100644 --- a/core/src/main/java/org/apache/iceberg/RemoveSnapshots.java +++ b/core/src/main/java/org/apache/iceberg/RemoveSnapshots.java @@ -51,6 +51,7 @@ import org.apache.iceberg.io.CloseableIterable; import org.apache.iceberg.relocated.com.google.common.base.Joiner; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Iterables; import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.iceberg.relocated.com.google.common.collect.Sets; @@ -366,11 +367,19 @@ private void removeExpiredFiles( // Reads and deletes are done using Tasks.foreach(...).suppressFailureWhenFinished to complete // as much of the delete work as possible and avoid orphaned data or manifest files. - // this is the set of ancestors of the current table state. when removing snapshots, this must - // only remove files that were deleted in an ancestor of the current table state to avoid + // ToDo: This will be removed when reachability analysis is done so files across multiple + // branches can be removed + SnapshotRef branchToCleanup = Iterables.getFirst(base.refs().values(), null); + if (branchToCleanup == null) { + return; + } + + Snapshot branchTip = base.snapshot(branchToCleanup.snapshotId()); + + // this is the set of ancestors of the branch to cleanup. when removing snapshots, this must + // only remove files that were deleted in an ancestor of the branch to cleanup to avoid // physically deleting files that were logically deleted in a commit that was rolled back. - Set ancestorIds = - Sets.newHashSet(SnapshotUtil.ancestorIds(base.currentSnapshot(), base::snapshot)); + Set ancestorIds = Sets.newHashSet(SnapshotUtil.ancestorIds(branchTip, base::snapshot)); Set pickedAncestorSnapshotIds = Sets.newHashSet(); for (long snapshotId : ancestorIds) {