diff --git a/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java b/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java index 114a5444caff..81a7fbfad9a3 100644 --- a/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java +++ b/core/src/main/java/org/apache/iceberg/ManifestFilterManager.java @@ -350,36 +350,35 @@ private boolean canContainDeletedFiles(ManifestFile manifest) { return canContainExpressionDeletes || canContainDroppedPartitions || canContainDroppedFiles || canContainDropBySeq; } - @SuppressWarnings({"CollectionUndefinedEquality", "checkstyle:CyclomaticComplexity"}) + @SuppressWarnings("CollectionUndefinedEquality") private boolean manifestHasDeletedFiles(PartitionAndMetricsEvaluator evaluator, ManifestReader reader) { boolean isDelete = reader.isDeleteManifestReader(); - boolean hasDeletedFiles = false; + for (ManifestEntry entry : reader.liveEntries()) { F file = entry.file(); boolean markedForDelete = deletePaths.contains(file.path()) || dropPartitions.contains(file.specId(), file.partition()) || (isDelete && entry.sequenceNumber() > 0 && entry.sequenceNumber() < minSequenceNumber); - boolean nonMatchingDeleteFile = !file.content().equals(FileContent.DATA) && !evaluator.rowsMustMatch(file); - if (!markedForDelete && nonMatchingDeleteFile) { - // not all DeleteFiles removal can be handled by metadata operation, skip in this case - continue; - } - if (markedForDelete || evaluator.rowsMightMatch(file)) { + boolean allRowsMatch = markedForDelete || evaluator.rowsMustMatch(file); ValidationException.check( - markedForDelete || evaluator.rowsMustMatch(file), + allRowsMatch || isDelete, // ignore delete files where some records may not match the expression "Cannot delete file where some, but not all, rows match filter %s: %s", this.deleteExpression, file.path()); - hasDeletedFiles = true; - if (failAnyDelete) { - throw new DeleteException(reader.spec().partitionToPath(file.partition())); + if (allRowsMatch) { + if (failAnyDelete) { + throw new DeleteException(reader.spec().partitionToPath(file.partition())); + } + + // as soon as a deleted file is detected, stop scanning + return true; } - break; // as soon as a deleted file is detected, stop scanning } } - return hasDeletedFiles; + + return false; } @SuppressWarnings({"CollectionUndefinedEquality", "checkstyle:CyclomaticComplexity"}) @@ -396,29 +395,33 @@ private ManifestFile filterManifestWithDeletedFiles(PartitionAndMetricsEvaluator try { reader.entries().forEach(entry -> { F file = entry.file(); - boolean fileDelete = deletePaths.contains(file.path()) || + boolean markedForDelete = deletePaths.contains(file.path()) || dropPartitions.contains(file.specId(), file.partition()) || (isDelete && entry.sequenceNumber() > 0 && entry.sequenceNumber() < minSequenceNumber); if (entry.status() != ManifestEntry.Status.DELETED) { - if (fileDelete || evaluator.rowsMightMatch(file)) { + if (markedForDelete || evaluator.rowsMightMatch(file)) { + boolean allRowsMatch = markedForDelete || evaluator.rowsMustMatch(file); ValidationException.check( - fileDelete || evaluator.rowsMustMatch(file), + allRowsMatch || isDelete, // ignore delete files where some records may not match the expression "Cannot delete file where some, but not all, rows match filter %s: %s", this.deleteExpression, file.path()); - writer.delete(entry); - - CharSequenceWrapper wrapper = CharSequenceWrapper.wrap(entry.file().path()); - if (deletedPaths.contains(wrapper)) { - LOG.warn("Deleting a duplicate path from manifest {}: {}", - manifest.path(), wrapper.get()); - duplicateDeleteCount += 1; + if (allRowsMatch) { + writer.delete(entry); + + CharSequenceWrapper wrapper = CharSequenceWrapper.wrap(entry.file().path()); + if (deletedPaths.contains(wrapper)) { + LOG.warn("Deleting a duplicate path from manifest {}: {}", manifest.path(), wrapper.get()); + duplicateDeleteCount += 1; + } else { + // only add the file to deletes if it is a new delete + // this keeps the snapshot summary accurate for non-duplicate data + deletedFiles.add(entry.file().copyWithoutStats()); + } + deletedPaths.add(wrapper); } else { - // only add the file to deletes if it is a new delete - // this keeps the snapshot summary accurate for non-duplicate data - deletedFiles.add(entry.file().copyWithoutStats()); + writer.existing(entry); } - deletedPaths.add(wrapper); } else { writer.existing(entry); diff --git a/core/src/test/java/org/apache/iceberg/TestOverwriteWithValidation.java b/core/src/test/java/org/apache/iceberg/TestOverwriteWithValidation.java index 1938ddddc649..899d4b6169d5 100644 --- a/core/src/test/java/org/apache/iceberg/TestOverwriteWithValidation.java +++ b/core/src/test/java/org/apache/iceberg/TestOverwriteWithValidation.java @@ -164,6 +164,8 @@ public class TestOverwriteWithValidation extends TableTestBase { greaterThanOrEqual("id", 5L), lessThanOrEqual("id", 9L)); + private static final Expression EXPRESSION_DAY_2_ANOTHER_ID_RANGE = greaterThanOrEqual("id", 10L); + @Parameterized.Parameters(name = "formatVersion = {0}") public static Object[] parameters() { return new Object[] { 1, 2 }; @@ -995,4 +997,29 @@ public void testOverwriteCaseSensitivity() { .validateNoConflictingData() .commit()); } + + @Test + public void testMetadataOnlyDeleteWithPositionDeletes() { + Assume.assumeTrue(formatVersion == 2); + + Assert.assertNull("Should be empty table", table.currentSnapshot()); + + table.newAppend() + .appendFile(FILE_DAY_2) + .appendFile(FILE_DAY_2_ANOTHER_RANGE) + .commit(); + + table.newRowDelta() + .addDeletes(FILE_DAY_2_POS_DELETES) + .addDeletes(FILE_DAY_2_ANOTHER_RANGE_EQ_DELETES) + .commit(); + + table.newOverwrite() + .overwriteByRowFilter(EXPRESSION_DAY_2_ANOTHER_ID_RANGE) + .addFile(FILE_DAY_2_MODIFIED) + .commit(); + + validateTableFiles(table, FILE_DAY_2, FILE_DAY_2_MODIFIED); + validateTableDeleteFiles(table, FILE_DAY_2_POS_DELETES); + } }