Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 31 additions & 28 deletions core/src/main/java/org/apache/iceberg/ManifestFilterManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -350,36 +350,35 @@ private boolean canContainDeletedFiles(ManifestFile manifest) {
return canContainExpressionDeletes || canContainDroppedPartitions || canContainDroppedFiles || canContainDropBySeq;
}

@SuppressWarnings({"CollectionUndefinedEquality", "checkstyle:CyclomaticComplexity"})
@SuppressWarnings("CollectionUndefinedEquality")
private boolean manifestHasDeletedFiles(PartitionAndMetricsEvaluator evaluator, ManifestReader<F> reader) {
boolean isDelete = reader.isDeleteManifestReader();
boolean hasDeletedFiles = false;

for (ManifestEntry<F> entry : reader.liveEntries()) {
F file = entry.file();
boolean markedForDelete = deletePaths.contains(file.path()) ||
dropPartitions.contains(file.specId(), file.partition()) ||
(isDelete && entry.sequenceNumber() > 0 && entry.sequenceNumber() < minSequenceNumber);

boolean nonMatchingDeleteFile = !file.content().equals(FileContent.DATA) && !evaluator.rowsMustMatch(file);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should avoid extra calls to evaluator if possible. Also, there was isDelete var already defined.

if (!markedForDelete && nonMatchingDeleteFile) {
// not all DeleteFiles removal can be handled by metadata operation, skip in this case
continue;
}

if (markedForDelete || evaluator.rowsMightMatch(file)) {
boolean allRowsMatch = markedForDelete || evaluator.rowsMustMatch(file);
ValidationException.check(
markedForDelete || evaluator.rowsMustMatch(file),
allRowsMatch || isDelete, // ignore delete files where some records may not match the expression
"Cannot delete file where some, but not all, rows match filter %s: %s",
this.deleteExpression, file.path());

hasDeletedFiles = true;
if (failAnyDelete) {
throw new DeleteException(reader.spec().partitionToPath(file.partition()));
if (allRowsMatch) {
if (failAnyDelete) {
throw new DeleteException(reader.spec().partitionToPath(file.partition()));
}

// as soon as a deleted file is detected, stop scanning
return true;
}
break; // as soon as a deleted file is detected, stop scanning
}
}
return hasDeletedFiles;

return false;
}

@SuppressWarnings({"CollectionUndefinedEquality", "checkstyle:CyclomaticComplexity"})
Expand All @@ -396,29 +395,33 @@ private ManifestFile filterManifestWithDeletedFiles(PartitionAndMetricsEvaluator
try {
reader.entries().forEach(entry -> {
F file = entry.file();
boolean fileDelete = deletePaths.contains(file.path()) ||
boolean markedForDelete = deletePaths.contains(file.path()) ||
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Renamed for consistency.

dropPartitions.contains(file.specId(), file.partition()) ||
(isDelete && entry.sequenceNumber() > 0 && entry.sequenceNumber() < minSequenceNumber);
if (entry.status() != ManifestEntry.Status.DELETED) {
if (fileDelete || evaluator.rowsMightMatch(file)) {
if (markedForDelete || evaluator.rowsMightMatch(file)) {
boolean allRowsMatch = markedForDelete || evaluator.rowsMustMatch(file);
ValidationException.check(
fileDelete || evaluator.rowsMustMatch(file),
allRowsMatch || isDelete, // ignore delete files where some records may not match the expression
"Cannot delete file where some, but not all, rows match filter %s: %s",
this.deleteExpression, file.path());

writer.delete(entry);

CharSequenceWrapper wrapper = CharSequenceWrapper.wrap(entry.file().path());
if (deletedPaths.contains(wrapper)) {
LOG.warn("Deleting a duplicate path from manifest {}: {}",
manifest.path(), wrapper.get());
duplicateDeleteCount += 1;
if (allRowsMatch) {
writer.delete(entry);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simply moved the existing logic under if now.


CharSequenceWrapper wrapper = CharSequenceWrapper.wrap(entry.file().path());
if (deletedPaths.contains(wrapper)) {
LOG.warn("Deleting a duplicate path from manifest {}: {}", manifest.path(), wrapper.get());
duplicateDeleteCount += 1;
} else {
// only add the file to deletes if it is a new delete
// this keeps the snapshot summary accurate for non-duplicate data
deletedFiles.add(entry.file().copyWithoutStats());
}
deletedPaths.add(wrapper);
} else {
// only add the file to deletes if it is a new delete
// this keeps the snapshot summary accurate for non-duplicate data
deletedFiles.add(entry.file().copyWithoutStats());
writer.existing(entry);
}
deletedPaths.add(wrapper);

} else {
writer.existing(entry);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@ public class TestOverwriteWithValidation extends TableTestBase {
greaterThanOrEqual("id", 5L),
lessThanOrEqual("id", 9L));

private static final Expression EXPRESSION_DAY_2_ANOTHER_ID_RANGE = greaterThanOrEqual("id", 10L);

@Parameterized.Parameters(name = "formatVersion = {0}")
public static Object[] parameters() {
return new Object[] { 1, 2 };
Expand Down Expand Up @@ -995,4 +997,29 @@ public void testOverwriteCaseSensitivity() {
.validateNoConflictingData()
.commit());
}

@Test
public void testMetadataOnlyDeleteWithPositionDeletes() {
Assume.assumeTrue(formatVersion == 2);

Assert.assertNull("Should be empty table", table.currentSnapshot());

table.newAppend()
.appendFile(FILE_DAY_2)
.appendFile(FILE_DAY_2_ANOTHER_RANGE)
.commit();

table.newRowDelta()
.addDeletes(FILE_DAY_2_POS_DELETES)
.addDeletes(FILE_DAY_2_ANOTHER_RANGE_EQ_DELETES)
.commit();

table.newOverwrite()
.overwriteByRowFilter(EXPRESSION_DAY_2_ANOTHER_ID_RANGE)
.addFile(FILE_DAY_2_MODIFIED)
.commit();

validateTableFiles(table, FILE_DAY_2, FILE_DAY_2_MODIFIED);
validateTableDeleteFiles(table, FILE_DAY_2_POS_DELETES);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would previously fail as we should remove one delete file and one should be kept.

}
}