-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: Support combining position deletes during writes #11222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,17 +21,18 @@ | |
| import java.io.IOException; | ||
| import java.util.Collection; | ||
| import java.util.List; | ||
| import java.util.function.Function; | ||
| import java.util.function.Supplier; | ||
| import org.apache.iceberg.DeleteFile; | ||
| import org.apache.iceberg.io.DeleteWriteResult; | ||
| import org.apache.iceberg.io.FileWriter; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
| import org.apache.iceberg.types.Comparators; | ||
| import org.apache.iceberg.util.CharSequenceMap; | ||
| import org.apache.iceberg.util.CharSequenceSet; | ||
| import org.roaringbitmap.longlong.PeekableLongIterator; | ||
| import org.roaringbitmap.longlong.Roaring64Bitmap; | ||
| import org.apache.iceberg.util.ContentFileUtil; | ||
|
|
||
| /** | ||
| * A position delete writer that is capable of handling unordered deletes without rows. | ||
|
|
@@ -41,14 +42,20 @@ | |
| * records are not ordered by file and position as required by the spec. If the incoming deletes are | ||
| * ordered by an external process, use {@link PositionDeleteWriter} instead. | ||
| * | ||
| * <p>If configured, this writer can also load previous deletes using the provided function and | ||
| * merge them with incoming ones prior to flushing the deletes into a file. Callers must ensure only | ||
| * previous file-scoped deletes are loaded because partition-scoped deletes can apply to multiple | ||
| * data files and can't be safely discarded. | ||
| * | ||
| * <p>Note this writer stores only positions. It does not store deleted records. | ||
| */ | ||
| public class SortingPositionOnlyDeleteWriter<T> | ||
| implements FileWriter<PositionDelete<T>, DeleteWriteResult> { | ||
|
|
||
| private final Supplier<FileWriter<PositionDelete<T>, DeleteWriteResult>> writers; | ||
| private final DeleteGranularity granularity; | ||
| private final CharSequenceMap<Roaring64Bitmap> positionsByPath; | ||
| private final CharSequenceMap<PositionDeleteIndex> positionsByPath; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this an area we'd want to explore using Map<String, PositionDeleteIndex> instead of the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, I think it'll probably make more sense to look at that when I do the update to use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We may want to keep this as |
||
| private final Function<CharSequence, PositionDeleteIndex> loadPreviousDeletes; | ||
| private DeleteWriteResult result = null; | ||
|
|
||
| public SortingPositionOnlyDeleteWriter(FileWriter<PositionDelete<T>, DeleteWriteResult> writer) { | ||
|
|
@@ -58,17 +65,26 @@ public SortingPositionOnlyDeleteWriter(FileWriter<PositionDelete<T>, DeleteWrite | |
| public SortingPositionOnlyDeleteWriter( | ||
| Supplier<FileWriter<PositionDelete<T>, DeleteWriteResult>> writers, | ||
| DeleteGranularity granularity) { | ||
| this(writers, granularity, path -> null /* no access to previous deletes */); | ||
| } | ||
|
|
||
| public SortingPositionOnlyDeleteWriter( | ||
| Supplier<FileWriter<PositionDelete<T>, DeleteWriteResult>> writers, | ||
| DeleteGranularity granularity, | ||
| Function<CharSequence, PositionDeleteIndex> loadPreviousDeletes) { | ||
| this.writers = writers; | ||
| this.granularity = granularity; | ||
| this.positionsByPath = CharSequenceMap.create(); | ||
| this.loadPreviousDeletes = loadPreviousDeletes; | ||
| } | ||
|
|
||
| @Override | ||
| public void write(PositionDelete<T> positionDelete) { | ||
| CharSequence path = positionDelete.path(); | ||
| long position = positionDelete.pos(); | ||
| Roaring64Bitmap positions = positionsByPath.computeIfAbsent(path, Roaring64Bitmap::new); | ||
| positions.add(position); | ||
| PositionDeleteIndex positions = | ||
| positionsByPath.computeIfAbsent(path, key -> new BitmapPositionDeleteIndex()); | ||
| positions.delete(position); | ||
| } | ||
|
|
||
| @Override | ||
|
|
@@ -106,14 +122,16 @@ private DeleteWriteResult writePartitionDeletes() throws IOException { | |
| private DeleteWriteResult writeFileDeletes() throws IOException { | ||
| List<DeleteFile> deleteFiles = Lists.newArrayList(); | ||
| CharSequenceSet referencedDataFiles = CharSequenceSet.empty(); | ||
| List<DeleteFile> rewrittenDeleteFiles = Lists.newArrayList(); | ||
|
|
||
| for (CharSequence path : positionsByPath.keySet()) { | ||
| DeleteWriteResult writeResult = writeDeletes(ImmutableList.of(path)); | ||
| deleteFiles.addAll(writeResult.deleteFiles()); | ||
| referencedDataFiles.addAll(writeResult.referencedDataFiles()); | ||
| rewrittenDeleteFiles.addAll(writeResult.rewrittenDeleteFiles()); | ||
| } | ||
|
|
||
| return new DeleteWriteResult(deleteFiles, referencedDataFiles); | ||
| return new DeleteWriteResult(deleteFiles, referencedDataFiles, rewrittenDeleteFiles); | ||
| } | ||
|
|
||
| @SuppressWarnings("CollectionUndefinedEquality") | ||
|
|
@@ -123,22 +141,38 @@ private DeleteWriteResult writeDeletes(Collection<CharSequence> paths) throws IO | |
| } | ||
|
|
||
| FileWriter<PositionDelete<T>, DeleteWriteResult> writer = writers.get(); | ||
| List<DeleteFile> rewrittenDeleteFiles = Lists.newArrayList(); | ||
|
|
||
| try { | ||
| PositionDelete<T> positionDelete = PositionDelete.create(); | ||
| for (CharSequence path : sort(paths)) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Another aspect I'm curious about, have we ever compared with using a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd say |
||
| // the iterator provides values in ascending sorted order | ||
| PeekableLongIterator positions = positionsByPath.get(path).getLongIterator(); | ||
| while (positions.hasNext()) { | ||
| long position = positions.next(); | ||
| writer.write(positionDelete.set(path, position, null /* no row */)); | ||
| PositionDeleteIndex positions = positionsByPath.get(path); | ||
| PositionDeleteIndex previousPositions = loadPreviousDeletes.apply(path); | ||
| if (previousPositions != null && previousPositions.isNotEmpty()) { | ||
| validatePreviousDeletes(previousPositions); | ||
| positions.merge(previousPositions); | ||
| rewrittenDeleteFiles.addAll(previousPositions.deleteFiles()); | ||
| } | ||
| positions.forEach(position -> writer.write(positionDelete.set(path, position))); | ||
| } | ||
| } finally { | ||
| writer.close(); | ||
| } | ||
|
|
||
| return writer.result(); | ||
| DeleteWriteResult writerResult = writer.result(); | ||
| List<DeleteFile> deleteFiles = writerResult.deleteFiles(); | ||
| CharSequenceSet referencedDataFiles = writerResult.referencedDataFiles(); | ||
| return new DeleteWriteResult(deleteFiles, referencedDataFiles, rewrittenDeleteFiles); | ||
| } | ||
|
|
||
| private void validatePreviousDeletes(PositionDeleteIndex index) { | ||
| Preconditions.checkArgument( | ||
| index.deleteFiles().stream().allMatch(this::isFileScoped), | ||
| "Previous deletes must be file-scoped"); | ||
| } | ||
|
|
||
| private boolean isFileScoped(DeleteFile deleteFile) { | ||
| return ContentFileUtil.referencedDataFile(deleteFile) != null; | ||
| } | ||
|
|
||
| private Collection<CharSequence> sort(Collection<CharSequence> paths) { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Purely to avoid breaking the API.