-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: Optimize lookup in DeleteFileIndex without useful bounds #8278
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -50,6 +50,7 @@ | |
| import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Maps; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Multimaps; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ObjectArrays; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Sets; | ||
| import org.apache.iceberg.types.Comparators; | ||
| import org.apache.iceberg.types.Conversions; | ||
|
|
@@ -75,6 +76,7 @@ class DeleteFileIndex { | |
| private final DeleteFileGroup globalDeletes; | ||
| private final Map<Pair<Integer, StructLikeWrapper>, DeleteFileGroup> deletesByPartition; | ||
| private final boolean isEmpty; | ||
| private final boolean useColumnStatsFiltering; | ||
|
|
||
| /** @deprecated since 1.4.0, will be removed in 1.5.0. */ | ||
| @Deprecated | ||
|
|
@@ -83,20 +85,22 @@ class DeleteFileIndex { | |
| long[] globalSeqs, | ||
| DeleteFile[] globalDeletes, | ||
| Map<Pair<Integer, StructLikeWrapper>, Pair<long[], DeleteFile[]>> deletesByPartition) { | ||
| this(specs, index(specs, globalSeqs, globalDeletes), index(specs, deletesByPartition)); | ||
| this(specs, index(specs, globalSeqs, globalDeletes), index(specs, deletesByPartition), true); | ||
| } | ||
|
|
||
| private DeleteFileIndex( | ||
| Map<Integer, PartitionSpec> specs, | ||
| DeleteFileGroup globalDeletes, | ||
| Map<Pair<Integer, StructLikeWrapper>, DeleteFileGroup> deletesByPartition) { | ||
| Map<Pair<Integer, StructLikeWrapper>, DeleteFileGroup> deletesByPartition, | ||
| boolean useColumnStatsFiltering) { | ||
| ImmutableMap.Builder<Integer, Types.StructType> builder = ImmutableMap.builder(); | ||
| specs.forEach((specId, spec) -> builder.put(specId, spec.partitionType())); | ||
| this.partitionTypeById = builder.build(); | ||
| this.wrapperById = wrappers(specs); | ||
| this.globalDeletes = globalDeletes; | ||
| this.deletesByPartition = deletesByPartition; | ||
| this.isEmpty = globalDeletes == null && deletesByPartition.isEmpty(); | ||
| this.useColumnStatsFiltering = useColumnStatsFiltering; | ||
| } | ||
|
|
||
| public boolean isEmpty() { | ||
|
|
@@ -148,7 +152,16 @@ DeleteFile[] forDataFile(long sequenceNumber, DataFile file) { | |
|
|
||
| if (globalDeletes == null && partitionDeletes == null) { | ||
| return NO_DELETES; | ||
| } else if (useColumnStatsFiltering) { | ||
| return limitWithColumnStatsFiltering(sequenceNumber, file, partitionDeletes); | ||
| } else { | ||
| return limitWithoutColumnStatsFiltering(sequenceNumber, partitionDeletes); | ||
| } | ||
| } | ||
|
|
||
| // limits deletes using sequence numbers and checks whether columns stats overlap | ||
| private DeleteFile[] limitWithColumnStatsFiltering( | ||
| long sequenceNumber, DataFile file, DeleteFileGroup partitionDeletes) { | ||
|
|
||
| Stream<IndexedDeleteFile> matchingDeletes; | ||
| if (partitionDeletes == null) { | ||
|
|
@@ -167,6 +180,21 @@ DeleteFile[] forDataFile(long sequenceNumber, DataFile file) { | |
| .toArray(DeleteFile[]::new); | ||
| } | ||
|
|
||
| // limits deletes using sequence numbers but skips expensive column stats filtering | ||
| private DeleteFile[] limitWithoutColumnStatsFiltering( | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See below why a separate method is added.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice find, maybe worth a comment
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added. |
||
| long sequenceNumber, DeleteFileGroup partitionDeletes) { | ||
|
|
||
| if (partitionDeletes == null) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is only invoked if global or partition deletes are not null. If both are null, we return earlier. |
||
| return globalDeletes.filter(sequenceNumber); | ||
| } else if (globalDeletes == null) { | ||
| return partitionDeletes.filter(sequenceNumber); | ||
| } else { | ||
| DeleteFile[] matchingGlobalDeletes = globalDeletes.filter(sequenceNumber); | ||
| DeleteFile[] matchingPartitionDeletes = partitionDeletes.filter(sequenceNumber); | ||
| return ObjectArrays.concat(matchingGlobalDeletes, matchingPartitionDeletes, DeleteFile.class); | ||
aokolnychyi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
|
|
||
| private static boolean canContainDeletesForFile(DataFile dataFile, IndexedDeleteFile deleteFile) { | ||
| switch (deleteFile.content()) { | ||
| case POSITION_DELETES: | ||
|
|
@@ -483,6 +511,8 @@ private Collection<DeleteFile> loadDeleteFiles() { | |
| DeleteFileIndex build() { | ||
| Iterable<DeleteFile> files = deleteFiles != null ? filterDeleteFiles() : loadDeleteFiles(); | ||
|
|
||
| boolean useColumnStatsFiltering = false; | ||
|
|
||
| // build a map from (specId, partition) to delete file entries | ||
| Map<Integer, StructLikeWrapper> wrappersBySpecId = Maps.newHashMap(); | ||
| ListMultimap<Pair<Integer, StructLikeWrapper>, IndexedDeleteFile> deleteFilesByPartition = | ||
|
|
@@ -494,7 +524,13 @@ DeleteFileIndex build() { | |
| wrappersBySpecId | ||
| .computeIfAbsent(specId, id -> StructLikeWrapper.forType(spec.partitionType())) | ||
| .copyFor(file.partition()); | ||
| deleteFilesByPartition.put(Pair.of(specId, wrapper), new IndexedDeleteFile(spec, file)); | ||
| IndexedDeleteFile indexedFile = new IndexedDeleteFile(spec, file); | ||
| deleteFilesByPartition.put(Pair.of(specId, wrapper), indexedFile); | ||
|
|
||
| if (!useColumnStatsFiltering) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am avoiding an extra call to
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea my thought was it translates to: useColumnStatsWithFiltering = useColumnStatsWithFiltering || indexFile.hasLowerAndUpperBound, which shouldn't evaluate the second one in theory if I understand correctly. So was thinking JVM would do the optimization for us, but not a big deal either way.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The order in which the operands are evaluated is not guaranteed. In a lot of cases, JVM would start evaluate both at the same time. I've seen this while profiling. |
||
| useColumnStatsFiltering = indexedFile.hasLowerAndUpperBounds(); | ||
| } | ||
|
|
||
| ScanMetricsUtil.indexedDeleteFile(scanMetrics, file); | ||
| } | ||
|
|
||
|
|
@@ -535,7 +571,8 @@ DeleteFileIndex build() { | |
| } | ||
| } | ||
|
|
||
| return new DeleteFileIndex(specsById, globalDeletes, sortedDeletesByPartition); | ||
| return new DeleteFileIndex( | ||
| specsById, globalDeletes, sortedDeletesByPartition, useColumnStatsFiltering); | ||
| } | ||
|
|
||
| private Iterable<CloseableIterable<ManifestEntry<DeleteFile>>> deleteManifestReaders() { | ||
|
|
@@ -597,7 +634,28 @@ private static class DeleteFileGroup { | |
| this.files = files; | ||
| } | ||
|
|
||
| public DeleteFile[] filter(long seq) { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would the stream could be costly with
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the |
||
| int start = findStartIndex(seq); | ||
|
|
||
| if (start >= files.length) { | ||
| return NO_DELETES; | ||
| } | ||
|
|
||
| DeleteFile[] matchingFiles = new DeleteFile[files.length - start]; | ||
|
|
||
| for (int index = start; index < files.length; index++) { | ||
| matchingFiles[index - start] = files[index].wrapped(); | ||
| } | ||
|
|
||
| return matchingFiles; | ||
| } | ||
|
|
||
| public Stream<IndexedDeleteFile> limit(long seq) { | ||
| int start = findStartIndex(seq); | ||
| return Arrays.stream(files, start, files.length); | ||
| } | ||
|
|
||
| private int findStartIndex(long seq) { | ||
| int pos = Arrays.binarySearch(seqs, seq); | ||
| int start; | ||
| if (pos < 0) { | ||
|
|
@@ -612,7 +670,7 @@ public Stream<IndexedDeleteFile> limit(long seq) { | |
| } | ||
| } | ||
|
|
||
| return Arrays.stream(files, start, files.length); | ||
| return start; | ||
| } | ||
|
|
||
| public Iterable<DeleteFile> referencedDeleteFiles() { | ||
|
|
||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about filterWithColumnStats, filterWithoutColumnStats ?
Slightly shorter, and not sure if word 'limit' has any different significance than
filterworth calling out?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I kind of wanted to indicate that we both limit by sequence number as well as filter using column stats.