-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Spark: update delete row reader to read position deletes #2372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
723ed70
ac7cda1
56456ff
ac18d76
a71b57c
e954d82
e28f9e4
369bc34
9d42b70
e160db8
ab58501
dbd5cc2
938e356
c848b9d
ca39ef5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ | |
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Set; | ||
| import java.util.function.Consumer; | ||
| import java.util.function.Predicate; | ||
| import org.apache.iceberg.Accessor; | ||
| import org.apache.iceberg.DeleteFile; | ||
|
|
@@ -67,6 +68,7 @@ public abstract class DeleteFilter<T> { | |
| private final List<DeleteFile> eqDeletes; | ||
| private final Schema requiredSchema; | ||
| private final Accessor<StructLike> posAccessor; | ||
| private Integer deleteMarkerIndex = null; | ||
|
|
||
| private PositionDeleteIndex deleteRowPositions = null; | ||
| private Predicate<T> eqDeleteRows = null; | ||
|
|
@@ -100,6 +102,29 @@ public Schema requiredSchema() { | |
| return requiredSchema; | ||
| } | ||
|
|
||
| protected int deleteMarkerIndex() { | ||
| if (deleteMarkerIndex != null) { | ||
| return deleteMarkerIndex; | ||
| } | ||
|
|
||
| int index = 0; | ||
| for (Types.NestedField field : requiredSchema().columns()) { | ||
| if (field.fieldId() != MetadataColumns.IS_DELETED.fieldId()) { | ||
| index = index + 1; | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
|
|
||
| deleteMarkerIndex = index; | ||
|
|
||
| return deleteMarkerIndex; | ||
| } | ||
|
|
||
| protected abstract Consumer<T> deleteMarker(); | ||
|
|
||
| protected abstract boolean isDeletedRow(T row); | ||
|
|
||
| public boolean hasPosDeletes() { | ||
| return !posDeletes.isEmpty(); | ||
| } | ||
|
|
@@ -124,11 +149,20 @@ public CloseableIterable<T> filter(CloseableIterable<T> records) { | |
| return applyEqDeletes(applyPosDeletes(records)); | ||
| } | ||
|
|
||
| private List<Predicate<T>> applyEqDeletes() { | ||
| List<Predicate<T>> isInDeleteSets = Lists.newArrayList(); | ||
| private Filter<T> deletedRowsSelector() { | ||
| return new Filter<T>() { | ||
| @Override | ||
| protected boolean shouldKeep(T item) { | ||
| return isDeletedRow(item); | ||
| } | ||
| }; | ||
| } | ||
|
|
||
| private Predicate<T> buildEqDeletePredicate() { | ||
| if (eqDeletes.isEmpty()) { | ||
| return isInDeleteSets; | ||
| return null; | ||
| } | ||
| Predicate<T> isDeleted = null; | ||
|
|
||
| Multimap<Set<Integer>, DeleteFile> filesByDeleteIds = Multimaps.newMultimap(Maps.newHashMap(), Lists::newArrayList); | ||
| for (DeleteFile delete : eqDeletes) { | ||
|
|
@@ -156,43 +190,126 @@ private List<Predicate<T>> applyEqDeletes() { | |
| records, record -> new InternalRecordWrapper(deleteSchema.asStruct()).wrap(record)), | ||
| deleteSchema.asStruct()); | ||
|
|
||
| Predicate<T> isInDeleteSet = record -> deleteSet.contains(projectRow.wrap(asStructLike(record))); | ||
| isInDeleteSets.add(isInDeleteSet); | ||
| isDeleted = isDeleted == null ? record -> deleteSet.contains(projectRow.wrap(asStructLike(record))) : | ||
| isDeleted.or(record -> deleteSet.contains(projectRow.wrap(asStructLike(record)))); | ||
| } | ||
|
|
||
| return isDeleted; | ||
| } | ||
|
|
||
| private Predicate<T> buildPosDeletePredicate() { | ||
| if (posDeletes.isEmpty()) { | ||
| return null; | ||
| } | ||
|
|
||
| Predicate<T> pred = null; | ||
|
|
||
| for (DeleteFile posDelete : posDeletes) { | ||
| CloseableIterable<Record> deleteRecords = openPosDeletes(posDelete); | ||
| Set<Long> deleteRecordSet = Deletes.toPositionSet(dataFile.path(), deleteRecords); | ||
| if (!deleteRecordSet.isEmpty()) { | ||
| pred = pred == null ? r -> deleteRecordSet.contains(pos(r)) : pred.or(r -> deleteRecordSet.contains(pos(r))); | ||
| } | ||
| } | ||
|
|
||
| return pred; | ||
| } | ||
|
|
||
| public CloseableIterable<T> keepRowsFromDeletes(CloseableIterable<T> records) { | ||
| Predicate<T> isDeletedFromPosDeletes = buildPosDeletePredicate(); | ||
| if (isDeletedFromPosDeletes == null) { | ||
| return keepRowsFromEqualityDeletes(records); | ||
| } | ||
|
|
||
| Predicate<T> isDeletedFromEqDeletes = buildEqDeletePredicate(); | ||
| if (isDeletedFromEqDeletes == null) { | ||
| return keepRowsFromPosDeletes(records); | ||
| } | ||
|
|
||
| return isInDeleteSets; | ||
| CloseableIterable<T> markedRecords; | ||
|
|
||
| if (posDeletes.stream().mapToLong(DeleteFile::recordCount).sum() < setFilterThreshold) { | ||
| markedRecords = CloseableIterable.transform(records, record -> { | ||
| if (isDeletedFromPosDeletes.test(record) || isDeletedFromEqDeletes.test(record)) { | ||
| deleteMarker().accept(record); | ||
| } | ||
| return record; | ||
| }); | ||
|
|
||
| } else { | ||
| List<CloseableIterable<Record>> deletes = Lists.transform(posDeletes, this::openPosDeletes); | ||
| markedRecords = CloseableIterable.transform(Deletes.streamingDeletedRowMarker(records, this::pos, | ||
| Deletes.deletePositions(dataFile.path(), deletes), deleteMarker()), record -> { | ||
| if (!isDeletedRow(record) && isDeletedFromEqDeletes.test(record)) { | ||
| deleteMarker().accept(record); | ||
| } | ||
| return record; | ||
| }); | ||
| } | ||
| return deletedRowsSelector().filter(markedRecords); | ||
| } | ||
|
|
||
| public CloseableIterable<T> findEqualityDeleteRows(CloseableIterable<T> records) { | ||
| private CloseableIterable<T> selectRowsFromDeletes(CloseableIterable<T> records, Predicate<T> isDeleted) { | ||
| CloseableIterable<T> markedRecords = CloseableIterable.transform(records, record -> { | ||
| if (isDeleted.test(record)) { | ||
| deleteMarker().accept(record); | ||
| } | ||
| return record; | ||
| }); | ||
|
|
||
| return deletedRowsSelector().filter(markedRecords); | ||
| } | ||
|
|
||
| public CloseableIterable<T> keepRowsFromEqualityDeletes(CloseableIterable<T> records) { | ||
| // Predicate to test whether a row has been deleted by equality deletions. | ||
| Predicate<T> deletedRows = applyEqDeletes().stream() | ||
| .reduce(Predicate::or) | ||
| .orElse(t -> false); | ||
| Predicate<T> isDeleted = buildEqDeletePredicate(); | ||
| if (isDeleted == null) { | ||
| return CloseableIterable.empty(); | ||
| } | ||
|
|
||
| Filter<T> deletedRowsFilter = new Filter<T>() { | ||
| @Override | ||
| protected boolean shouldKeep(T item) { | ||
| return deletedRows.test(item); | ||
| return selectRowsFromDeletes(records, isDeleted); | ||
| } | ||
|
|
||
| public CloseableIterable<T> keepRowsFromPosDeletes(CloseableIterable<T> records) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a need to make these methods public? Or will rows only be read using
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They are used in |
||
| // if there are fewer deletes than a reasonable number to keep in memory, use a set | ||
| if (posDeletes.stream().mapToLong(DeleteFile::recordCount).sum() < setFilterThreshold) { | ||
| // Predicate to test whether a row has been deleted by equality deletions. | ||
| Predicate<T> isDeleted = buildPosDeletePredicate(); | ||
| if (isDeleted == null) { | ||
| return CloseableIterable.empty(); | ||
| } | ||
| }; | ||
| return deletedRowsFilter.filter(records); | ||
| return selectRowsFromDeletes(records, isDeleted); | ||
| } else { | ||
| List<CloseableIterable<Record>> deletes = Lists.transform(posDeletes, this::openPosDeletes); | ||
| CloseableIterable<T> markedRecords = Deletes.streamingDeletedRowMarker(records, this::pos, | ||
| Deletes.deletePositions(dataFile.path(), deletes), deleteMarker()); | ||
|
|
||
| return deletedRowsSelector().filter(markedRecords); | ||
| } | ||
| } | ||
|
|
||
| private CloseableIterable<T> applyEqDeletes(CloseableIterable<T> records) { | ||
| // Predicate to test whether a row should be visible to user after applying equality deletions. | ||
| Predicate<T> remainingRows = applyEqDeletes().stream() | ||
| .map(Predicate::negate) | ||
| .reduce(Predicate::and) | ||
| .orElse(t -> true); | ||
| Predicate<T> isDeleted = buildEqDeletePredicate(); | ||
| if (isDeleted == null) { | ||
| return records; | ||
| } | ||
|
|
||
| CloseableIterable<T> markedRecords = CloseableIterable.transform(records, record -> { | ||
| if (isDeleted.test(record)) { | ||
| deleteMarker().accept(record); | ||
| } | ||
| return record; | ||
| }); | ||
|
|
||
| Filter<T> remainingRowsFilter = new Filter<T>() { | ||
| @Override | ||
| protected boolean shouldKeep(T item) { | ||
| return remainingRows.test(item); | ||
| return !isDeletedRow(item); | ||
| } | ||
| }; | ||
|
|
||
| return remainingRowsFilter.filter(records); | ||
| return remainingRowsFilter.filter(markedRecords); | ||
| } | ||
|
|
||
| public Predicate<T> eqDeletedRowFilter() { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Personally, it's really strange that we will modify the original row (set
_deletedflag to be true) in ashouldKeepmethod because it should be a pure test method and should not modify certain states of the record. Otherwise, it is easy to cause confusion: after a record has undergone a different sequence of shouldKeep tests, the final result is different, and even the original record is modified.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm trying to add a
DeleteMarkerto add the_is_deletedflag iteratively, pls see the PR #2434