-
Notifications
You must be signed in to change notification settings - Fork 3k
Core: support rewrite data files with starting sequence number #3480
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7ba427d
bd51a63
05ea4fa
fd84ddf
f20d20f
95d6a79
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
|
|
||
| import java.util.Set; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.ImmutableSet; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Sets; | ||
|
|
||
| class BaseRewriteFiles extends MergingSnapshotProducer<RewriteFiles> implements RewriteFiles { | ||
|
|
@@ -66,6 +67,12 @@ private void verifyInputAndOutputFiles(Set<DataFile> dataFilesToDelete, Set<Dele | |
| } | ||
| } | ||
|
|
||
| @Override | ||
| public RewriteFiles rewriteFiles(Set<DataFile> filesToDelete, Set<DataFile> filesToAdd, long sequenceNumber) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Slightly unrelated concern: should we be using Not a blocker for this PR though! |
||
| setNewFilesSequenceNumber(sequenceNumber); | ||
| return rewriteFiles(filesToDelete, ImmutableSet.of(), filesToAdd, ImmutableSet.of()); | ||
| } | ||
|
|
||
| @Override | ||
| public RewriteFiles rewriteFiles(Set<DataFile> dataFilesToReplace, Set<DeleteFile> deleteFilesToReplace, | ||
| Set<DataFile> dataFilesToAdd, Set<DeleteFile> deleteFilesToAdd) { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
|
|
||
| import java.io.IOException; | ||
| import java.io.UncheckedIOException; | ||
| import java.util.Arrays; | ||
| import java.util.List; | ||
| import java.util.ListIterator; | ||
| import java.util.Map; | ||
|
|
@@ -80,6 +81,7 @@ abstract class MergingSnapshotProducer<ThisT> extends SnapshotProducer<ThisT> { | |
|
|
||
| // update data | ||
| private final List<DataFile> newFiles = Lists.newArrayList(); | ||
| private Long newFilesSequenceNumber; | ||
| private final Map<Integer, List<DeleteFile>> newDeleteFilesBySpec = Maps.newHashMap(); | ||
| private final List<ManifestFile> appendManifests = Lists.newArrayList(); | ||
| private final List<ManifestFile> rewrittenAppendManifests = Lists.newArrayList(); | ||
|
|
@@ -297,7 +299,8 @@ protected void validateAddedDataFiles(TableMetadata base, Long startingSnapshotI | |
| */ | ||
| protected void validateNoNewDeletesForDataFiles(TableMetadata base, Long startingSnapshotId, | ||
| Iterable<DataFile> dataFiles) { | ||
| validateNoNewDeletesForDataFiles(base, startingSnapshotId, null, dataFiles, true); | ||
| validateNoNewDeletesForDataFiles(base, startingSnapshotId, null, dataFiles, true, | ||
| newFilesSequenceNumber != null); | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -313,6 +316,28 @@ protected void validateNoNewDeletesForDataFiles(TableMetadata base, Long startin | |
| protected void validateNoNewDeletesForDataFiles(TableMetadata base, Long startingSnapshotId, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you copy the javadoc for the previous version? I think it's helpful to have it.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. added back |
||
| Expression dataFilter, Iterable<DataFile> dataFiles, | ||
| boolean caseSensitive) { | ||
| validateNoNewDeletesForDataFiles(base, startingSnapshotId, dataFilter, dataFiles, caseSensitive, false); | ||
| } | ||
|
|
||
| /** | ||
| * Validates that no new delete files that must be applied to the given data files have been added to the table since | ||
| * a starting snapshot, with the option to ignore equality deletes during the validation. | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nbd, but I would add a note here about why we want to ignore equality deletes, just so future readers could understand. |
||
| * <p> | ||
| * For example, in the case of rewriting data files, if the added data files have the same sequence number as the | ||
| * replaced data files, equality deletes added at a higher sequence number are still effective against the added | ||
| * data files, so there is no risk of commit conflict between RewriteFiles and RowDelta. In cases like this, | ||
| * validation against equality delete files can be omitted. | ||
| * | ||
| * @param base table metadata to validate | ||
| * @param startingSnapshotId id of the snapshot current at the start of the operation | ||
| * @param dataFilter a data filter | ||
| * @param dataFiles data files to validate have no new row deletes | ||
| * @param caseSensitive whether expression binding should be case-sensitive | ||
| * @param ignoreEqualityDeletes whether equality deletes should be ignored in validation | ||
| */ | ||
| private void validateNoNewDeletesForDataFiles(TableMetadata base, Long startingSnapshotId, | ||
| Expression dataFilter, Iterable<DataFile> dataFiles, | ||
| boolean caseSensitive, boolean ignoreEqualityDeletes) { | ||
| // if there is no current table state, no files have been added | ||
| if (base.currentSnapshot() == null || base.formatVersion() < 2) { | ||
| return; | ||
|
|
@@ -327,8 +352,14 @@ protected void validateNoNewDeletesForDataFiles(TableMetadata base, Long startin | |
|
|
||
| for (DataFile dataFile : dataFiles) { | ||
| // if any delete is found that applies to files written in or before the starting snapshot, fail | ||
| if (deletes.forDataFile(startingSequenceNumber, dataFile).length > 0) { | ||
| throw new ValidationException("Cannot commit, found new delete for replaced data file: %s", dataFile); | ||
| DeleteFile[] deleteFiles = deletes.forDataFile(startingSequenceNumber, dataFile); | ||
| if (ignoreEqualityDeletes) { | ||
| ValidationException.check( | ||
| Arrays.stream(deleteFiles).noneMatch(deleteFile -> deleteFile.content() == FileContent.POSITION_DELETES), | ||
| "Cannot commit, found new position delete for replaced data file: %s", dataFile); | ||
| } else { | ||
| ValidationException.check(deleteFiles.length == 0, | ||
| "Cannot commit, found new delete for replaced data file: %s", dataFile); | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -360,6 +391,10 @@ protected void validateNoNewDeleteFiles(TableMetadata base, Long startingSnapsho | |
| dataFilter, Iterables.transform(deletes.referencedDeleteFiles(), ContentFile::path)); | ||
| } | ||
|
|
||
| protected void setNewFilesSequenceNumber(long sequenceNumber) { | ||
| this.newFilesSequenceNumber = sequenceNumber; | ||
| } | ||
|
|
||
| private long startingSequenceNumber(TableMetadata metadata, Long staringSnapshotId) { | ||
| if (staringSnapshotId != null && metadata.snapshot(staringSnapshotId) != null) { | ||
| Snapshot startingSnapshot = metadata.snapshot(staringSnapshotId); | ||
|
|
@@ -591,7 +626,11 @@ private ManifestFile newFilesAsManifest() { | |
| try { | ||
| ManifestWriter<DataFile> writer = newManifestWriter(dataSpec()); | ||
| try { | ||
| writer.addAll(newFiles); | ||
| if (newFilesSequenceNumber == null) { | ||
| writer.addAll(newFiles); | ||
| } else { | ||
| newFiles.forEach(f -> writer.add(f, newFilesSequenceNumber)); | ||
| } | ||
| } finally { | ||
| writer.close(); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Now that this is true, do we have to ignore it with V1 Tables?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I was thinking about that right now. Technically I think it has no harm for v1 tables, because the sequence number is always 0, and it is not read or written anywhere. Let me add a unit test for v1. Do you see any place this might affect v1 table?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's all I was thinking, V1 Tables don't have sequence numbers so I just wanted to make sure they don't break if we are trying to set them.