-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Validate parent snapshots in Kafka Connect #14515
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -71,4 +71,32 @@ default ThisT toBranch(String branch) { | |
| "Cannot commit to branch %s: %s does not support branch commits", | ||
| branch, this.getClass().getName())); | ||
| } | ||
|
|
||
| /** | ||
| * Enables snapshot validation with a user-provided function, which must throw a {@link | ||
| * org.apache.iceberg.exceptions.ValidationException} on validation failures. | ||
| * | ||
| * <p>Clients can use this method to validate summary and other metadata of parent snapshots. | ||
| * | ||
| * @param snapshotValidator a user function to validate parent snapshots | ||
| * @return this for method chaining | ||
| */ | ||
| default ThisT validateWith(Consumer<Snapshot> snapshotValidator) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the use of the ancestor iterable is a more elegant solution than making a large refactor and calling the validator in a loop. |
||
| throw new UnsupportedOperationException( | ||
| getClass().getName() + " does not implement validateWith"); | ||
| } | ||
|
|
||
| /** | ||
| * Set the snapshot ID used in any reads for this operation. | ||
| * | ||
| * <p>Validations will check changes after this snapshot ID. If the from snapshot is not set, all | ||
| * ancestor snapshots through the table's initial snapshot are validated. | ||
| * | ||
| * @param snapshotId a snapshot ID | ||
| * @return this for method chaining | ||
| */ | ||
| default ThisT validateFromSnapshot(long snapshotId) { | ||
| throw new UnsupportedOperationException( | ||
| getClass().getName() + " does not implement validateFromSnapshot"); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,7 +27,6 @@ public class BaseReplacePartitions extends MergingSnapshotProducer<ReplacePartit | |
| implements ReplacePartitions { | ||
|
|
||
| private final PartitionSet replacedPartitions; | ||
| private Long startingSnapshotId; | ||
| private boolean validateConflictingData = false; | ||
| private boolean validateConflictingDeletes = false; | ||
|
|
||
|
|
@@ -61,12 +60,6 @@ public ReplacePartitions validateAppendOnly() { | |
| return this; | ||
| } | ||
|
|
||
| @Override | ||
| public ReplacePartitions validateFromSnapshot(long newStartingSnapshotId) { | ||
| this.startingSnapshotId = newStartingSnapshotId; | ||
| return this; | ||
| } | ||
|
|
||
| @Override | ||
| public ReplacePartitions validateNoConflictingDeletes() { | ||
| this.validateConflictingDeletes = true; | ||
|
|
@@ -87,24 +80,26 @@ public BaseReplacePartitions toBranch(String branch) { | |
|
|
||
| @Override | ||
| public void validate(TableMetadata currentMetadata, Snapshot parent) { | ||
| super.validate(currentMetadata, parent); | ||
|
|
||
| if (validateConflictingData) { | ||
| if (dataSpec().isUnpartitioned()) { | ||
| validateAddedDataFiles( | ||
| currentMetadata, startingSnapshotId, Expressions.alwaysTrue(), parent); | ||
| currentMetadata, startingSnapshotId(), Expressions.alwaysTrue(), parent); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a good example of what I mean in the comment above. There are quite a few changes here (and in other classes) to refactor so that |
||
| } else { | ||
| validateAddedDataFiles(currentMetadata, startingSnapshotId, replacedPartitions, parent); | ||
| validateAddedDataFiles(currentMetadata, startingSnapshotId(), replacedPartitions, parent); | ||
| } | ||
| } | ||
|
|
||
| if (validateConflictingDeletes) { | ||
| if (dataSpec().isUnpartitioned()) { | ||
| validateDeletedDataFiles( | ||
| currentMetadata, startingSnapshotId, Expressions.alwaysTrue(), parent); | ||
| currentMetadata, startingSnapshotId(), Expressions.alwaysTrue(), parent); | ||
| validateNoNewDeleteFiles( | ||
| currentMetadata, startingSnapshotId, Expressions.alwaysTrue(), parent); | ||
| currentMetadata, startingSnapshotId(), Expressions.alwaysTrue(), parent); | ||
| } else { | ||
| validateDeletedDataFiles(currentMetadata, startingSnapshotId, replacedPartitions, parent); | ||
| validateNoNewDeleteFiles(currentMetadata, startingSnapshotId, replacedPartitions, parent); | ||
| validateDeletedDataFiles(currentMetadata, startingSnapshotId(), replacedPartitions, parent); | ||
| validateNoNewDeleteFiles(currentMetadata, startingSnapshotId(), replacedPartitions, parent); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,6 +48,7 @@ | |
| import java.util.concurrent.atomic.AtomicReferenceArray; | ||
| import java.util.function.Consumer; | ||
| import java.util.function.Function; | ||
| import javax.annotation.Nullable; | ||
| import org.apache.iceberg.encryption.EncryptedOutputFile; | ||
| import org.apache.iceberg.encryption.EncryptingFileIO; | ||
| import org.apache.iceberg.events.CreateSnapshotEvent; | ||
|
|
@@ -117,6 +118,8 @@ public void accept(String file) { | |
| private TableMetadata base; | ||
| private boolean stageOnly = false; | ||
| private Consumer<String> deleteFunc = defaultDelete; | ||
| @Nullable private Long startingSnapshotId = null; // check all versions by default | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like the original PR's approach to iterate backward through history rather than this approach that iterates through more by default. I also like that the other approach can reuse work by running the validation once with a loop, rather than calling the validation in a loop.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Both the original PR and this approach iterate backward. The original PR loops through the whole history for every validation run. This PR only checks new parent
Like I said above - the other approach runs validations for all parent snapshots in a loop. This PR limits the scope of validations to new Snapshots. |
||
| @Nullable private Consumer<Snapshot> snapshotValidator = null; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As I commented on the original PR, we don't want to leave the exception class to the implementation. A validation should return a boolean to indicate that something was wrong. That way, the exception class is uniform ( |
||
|
|
||
| private ExecutorService workerPool; | ||
| private String targetBranch = SnapshotRef.MAIN_BRANCH; | ||
|
|
@@ -211,6 +214,23 @@ public ThisT deleteWith(Consumer<String> deleteCallback) { | |
| return self(); | ||
| } | ||
|
|
||
| @Override | ||
| public ThisT validateWith(Consumer<Snapshot> validator) { | ||
| this.snapshotValidator = validator; | ||
| return self(); | ||
| } | ||
|
|
||
| @Override | ||
| public ThisT validateFromSnapshot(long startSnapshotId) { | ||
| this.startingSnapshotId = startSnapshotId; | ||
| return self(); | ||
| } | ||
|
|
||
| @Nullable | ||
| protected Long startingSnapshotId() { | ||
| return this.startingSnapshotId; | ||
| } | ||
|
|
||
| /** | ||
| * Clean up any uncommitted manifests that were created. | ||
| * | ||
|
|
@@ -238,7 +258,13 @@ public ThisT deleteWith(Consumer<String> deleteCallback) { | |
| * @param currentMetadata current table metadata to validate | ||
| * @param snapshot ending snapshot on the lineage which is being validated | ||
| */ | ||
| protected void validate(TableMetadata currentMetadata, Snapshot snapshot) {} | ||
| protected void validate(TableMetadata currentMetadata, Snapshot snapshot) { | ||
| if (snapshotValidator != null && snapshot != null) { | ||
| SnapshotUtil.ancestorsBetween( | ||
| snapshot.snapshotId(), startingSnapshotId, currentMetadata::snapshot) | ||
| .forEach(snapshotValidator); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Apply the update's changes to the given metadata and snapshot. Return the new manifest list. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -70,6 +70,8 @@ public StreamingDelete toBranch(String branch) { | |
|
|
||
| @Override | ||
| protected void validate(TableMetadata base, Snapshot parent) { | ||
| super.validate(base, parent); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I considered whether it is a good idea to override |
||
|
|
||
| if (validateFilesToDeleteExist) { | ||
| failMissingDeletePaths(); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should document how the validator is called. Is the latest snapshot first or last? I think that affects how validations are written so it seems important.