-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: Rename DeleteFileHolder to PendingDeleteFile / Optimize duplicate data/delete file detection #11254
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Core: Rename DeleteFileHolder to PendingDeleteFile / Optimize duplicate data/delete file detection #11254
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,7 +30,6 @@ | |
| import java.util.Map; | ||
| import java.util.Objects; | ||
| import java.util.Set; | ||
| import java.util.stream.Collectors; | ||
| import org.apache.iceberg.encryption.EncryptedOutputFile; | ||
| import org.apache.iceberg.events.CreateSnapshotEvent; | ||
| import org.apache.iceberg.exceptions.ValidationException; | ||
|
|
@@ -42,7 +41,6 @@ | |
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Predicate; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Iterables; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Iterators; | ||
|
|
@@ -82,11 +80,9 @@ abstract class MergingSnapshotProducer<ThisT> extends SnapshotProducer<ThisT> { | |
| private final ManifestFilterManager<DeleteFile> deleteFilterManager; | ||
|
|
||
| // update data | ||
| private final Map<PartitionSpec, List<DataFile>> newDataFilesBySpec = Maps.newHashMap(); | ||
| private final DataFileSet newDataFiles = DataFileSet.create(); | ||
| private final DeleteFileSet newDeleteFiles = DeleteFileSet.create(); | ||
| private final Map<PartitionSpec, DataFileSet> newDataFilesBySpec = Maps.newHashMap(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm... When did we start using `PartitionSpec` as the map key here?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Looks like this was introduced with #9860.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I can follow up on this in a separate PR and change it to `Map<Integer, DataFileSet>` (keyed by spec ID).
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, I think we should. Thanks! |
||
| private Long newDataFilesDataSequenceNumber; | ||
| private final Map<Integer, List<DeleteFileHolder>> newDeleteFilesBySpec = Maps.newHashMap(); | ||
| private final Map<Integer, DeleteFileSet> newDeleteFilesBySpec = Maps.newHashMap(); | ||
| private final List<ManifestFile> appendManifests = Lists.newArrayList(); | ||
| private final List<ManifestFile> rewrittenAppendManifests = Lists.newArrayList(); | ||
| private final SnapshotSummary.Builder addedFilesSummary = SnapshotSummary.builder(); | ||
|
|
@@ -161,12 +157,9 @@ protected Expression rowFilter() { | |
| } | ||
|
|
||
| protected List<DataFile> addedDataFiles() { | ||
| return ImmutableList.copyOf( | ||
| newDataFilesBySpec.values().stream().flatMap(List::stream).collect(Collectors.toList())); | ||
| } | ||
|
|
||
| protected Map<PartitionSpec, List<DataFile>> addedDataFilesBySpec() { | ||
nastra marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return ImmutableMap.copyOf(newDataFilesBySpec); | ||
| return newDataFilesBySpec.values().stream() | ||
| .flatMap(Set::stream) | ||
| .collect(ImmutableList.toImmutableList()); | ||
| } | ||
|
|
||
| protected void failAnyDelete() { | ||
|
|
@@ -236,43 +229,49 @@ protected boolean addsDeleteFiles() { | |
| /** Add a data file to the new snapshot. */ | ||
| protected void add(DataFile file) { | ||
| Preconditions.checkNotNull(file, "Invalid data file: null"); | ||
| if (newDataFiles.add(file)) { | ||
| PartitionSpec fileSpec = ops.current().spec(file.specId()); | ||
| Preconditions.checkArgument( | ||
| fileSpec != null, | ||
| "Cannot find partition spec %s for data file: %s", | ||
| file.specId(), | ||
| file.path()); | ||
|
|
||
| addedFilesSummary.addedFile(fileSpec, file); | ||
| PartitionSpec spec = spec(file.specId()); | ||
| Preconditions.checkArgument( | ||
| spec != null, | ||
| "Cannot find partition spec %s for data file: %s", | ||
| file.specId(), | ||
| file.location()); | ||
|
|
||
| DataFileSet dataFiles = | ||
| newDataFilesBySpec.computeIfAbsent(spec, ignored -> DataFileSet.create()); | ||
| if (dataFiles.add(file)) { | ||
| addedFilesSummary.addedFile(spec, file); | ||
| hasNewDataFiles = true; | ||
| List<DataFile> dataFiles = | ||
| newDataFilesBySpec.computeIfAbsent(fileSpec, ignored -> Lists.newArrayList()); | ||
| dataFiles.add(file); | ||
| } | ||
| } | ||
|
|
||
| private PartitionSpec spec(int specId) { | ||
| return ops.current().spec(specId); | ||
| } | ||
|
|
||
| /** Add a delete file to the new snapshot. */ | ||
| protected void add(DeleteFile file) { | ||
| Preconditions.checkNotNull(file, "Invalid delete file: null"); | ||
| add(new DeleteFileHolder(file)); | ||
| add(new PendingDeleteFile(file)); | ||
| } | ||
|
|
||
| /** Add a delete file to the new snapshot. */ | ||
| protected void add(DeleteFile file, long dataSequenceNumber) { | ||
| Preconditions.checkNotNull(file, "Invalid delete file: null"); | ||
| add(new DeleteFileHolder(file, dataSequenceNumber)); | ||
| add(new PendingDeleteFile(file, dataSequenceNumber)); | ||
| } | ||
|
|
||
| private void add(DeleteFileHolder fileHolder) { | ||
| int specId = fileHolder.deleteFile().specId(); | ||
| PartitionSpec fileSpec = ops.current().spec(specId); | ||
| List<DeleteFileHolder> deleteFiles = | ||
| newDeleteFilesBySpec.computeIfAbsent(specId, s -> Lists.newArrayList()); | ||
|
|
||
| if (newDeleteFiles.add(fileHolder.deleteFile())) { | ||
| deleteFiles.add(fileHolder); | ||
| addedFilesSummary.addedFile(fileSpec, fileHolder.deleteFile()); | ||
| private void add(PendingDeleteFile file) { | ||
| PartitionSpec spec = spec(file.specId()); | ||
| Preconditions.checkArgument( | ||
| spec != null, | ||
| "Cannot find partition spec %s for delete file: %s", | ||
| file.specId(), | ||
| file.location()); | ||
|
|
||
| DeleteFileSet deleteFiles = | ||
| newDeleteFilesBySpec.computeIfAbsent(spec.specId(), ignored -> DeleteFileSet.create()); | ||
| if (deleteFiles.add(file)) { | ||
| addedFilesSummary.addedFile(spec, file); | ||
| hasNewDeleteFiles = true; | ||
| } | ||
| } | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.