Spark Integration to read from Snapshot ref #5150
Changes from all commits
SparkBatchQueryScan
```diff
@@ -67,6 +67,8 @@ class SparkBatchQueryScan extends SparkScan implements SupportsRuntimeFiltering
   private final Long startSnapshotId;
   private final Long endSnapshotId;
   private final Long asOfTimestamp;
+  private final String branch;
+  private final String tag;
   private final List<Expression> runtimeFilterExpressions;

   private Set<Integer> specIds = null; // lazy cache of scanned spec IDs
@@ -88,6 +90,8 @@ class SparkBatchQueryScan extends SparkScan implements SupportsRuntimeFiltering
     this.startSnapshotId = readConf.startSnapshotId();
     this.endSnapshotId = readConf.endSnapshotId();
     this.asOfTimestamp = readConf.asOfTimestamp();
+    this.branch = readConf.branch();
+    this.tag = readConf.tag();
     this.runtimeFilterExpressions = Lists.newArrayList();

     if (scan == null) {
@@ -244,13 +248,22 @@ public Statistics estimateStatistics() {
       Snapshot snapshot = table().snapshot(snapshotIdAsOfTime);
       return estimateStatistics(snapshot);

+    } else if (branch != null) {
+      Snapshot snapshot = table().snapshot(branch);
+      return estimateStatistics(snapshot);
+
+    } else if (tag != null) {
+      Snapshot snapshot = table().snapshot(tag);
+      return estimateStatistics(snapshot);
+
     } else {
       Snapshot snapshot = table().currentSnapshot();
       return estimateStatistics(snapshot);
     }
   }

   @Override
+  @SuppressWarnings("checkstyle:CyclomaticComplexity")
   public boolean equals(Object o) {
     if (this == o) {
       return true;
@@ -269,7 +282,9 @@ && readSchema().equals(that.readSchema())
         && Objects.equals(snapshotId, that.snapshotId)
         && Objects.equals(startSnapshotId, that.startSnapshotId)
         && Objects.equals(endSnapshotId, that.endSnapshotId)
-        && Objects.equals(asOfTimestamp, that.asOfTimestamp);
+        && Objects.equals(asOfTimestamp, that.asOfTimestamp)
+        && Objects.equals(branch, that.branch)
+        && Objects.equals(tag, that.tag);
   }

   @Override
```
Contributor (author), on the `@SuppressWarnings` line: Have to uncomment this, but I'm getting a checkstyle cyclomatic complexity error.

Contributor: Considering it's required for a correct equals implementation of SparkBatchQueryScan, I think it makes the most sense just to suppress the warnings on the method.
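The rationale here is the equals/hashCode contract: every field compared in equals must also feed hashCode, or two scans that compare equal could land in different hash buckets. A minimal illustration with a hypothetical RefScanKey class (not code from this PR):

```java
import java.util.Objects;

// Hypothetical class illustrating the equals/hashCode contract; not PR code.
final class RefScanKey {
  private final String branch;
  private final String tag;

  RefScanKey(String branch, String tag) {
    this.branch = branch;
    this.tag = tag;
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof RefScanKey)) {
      return false;
    }
    RefScanKey that = (RefScanKey) o;
    // Every field compared here...
    return Objects.equals(branch, that.branch) && Objects.equals(tag, that.tag);
  }

  @Override
  public int hashCode() {
    // ...must also appear here, or equal keys may hash differently.
    return Objects.hash(branch, tag);
  }
}
```

The hashCode hunk below adds branch and tag for exactly this reason.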
```diff
@@ -282,7 +297,9 @@ public int hashCode() {
         snapshotId,
         startSnapshotId,
         endSnapshotId,
-        asOfTimestamp);
+        asOfTimestamp,
+        branch,
+        tag);
   }

   @Override
```
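For readers who want to try the new options outside the test suite, a minimal usage sketch, assuming Spark with the Iceberg runtime on the classpath; the table identifier `db.table` and the ref names `audit` and `v1` are placeholders, not names from this PR:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadFromRef {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().getOrCreate();

    // Read the table as of the head of a branch.
    Dataset<Row> fromBranch =
        spark.read().format("iceberg").option("branch", "audit").load("db.table");

    // Or pin the read to an immutable tag.
    Dataset<Row> fromTag =
        spark.read().format("iceberg").option("tag", "v1").load("db.table");

    fromBranch.show();
    fromTag.show();
  }
}
```

The option keys `branch` and `tag` match the string literals exercised in the tests below.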
Snapshot selection tests
Contributor, commenting on testSnapshotSelectionByTag: I think we also need tests to show that the branch and tag options can't be used at the same time, and tests to validate what happens when snapshot or timestamp are set along with branch or tag. It should be easy to make a few tests for those error cases. (The two `*Fails` tests below add those cases.)

```diff
@@ -226,4 +226,148 @@ public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException {
         .hasMessageContaining("Cannot specify both snapshot-id")
         .hasMessageContaining("and as-of-timestamp");
   }
+
+  @Test
+  public void testSnapshotSelectionByTag() throws IOException {
+    String tableLocation = temp.newFolder("iceberg-table").toString();
+
+    HadoopTables tables = new HadoopTables(CONF);
+    PartitionSpec spec = PartitionSpec.unpartitioned();
+    Table table = tables.create(SCHEMA, spec, tableLocation);
+
+    // produce the first snapshot
+    List<SimpleRecord> firstBatchRecords =
+        Lists.newArrayList(
+            new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
+    Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
+    firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
+
+    table.manageSnapshots().createTag("tag", table.currentSnapshot().snapshotId()).commit();
+
+    // produce the second snapshot
+    List<SimpleRecord> secondBatchRecords =
+        Lists.newArrayList(
+            new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f"));
+    Dataset<Row> secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class);
+    secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
+
+    // verify records in the current snapshot by tag
+    Dataset<Row> currentSnapshotResult =
+        spark.read().format("iceberg").option("tag", "tag").load(tableLocation);
+    List<SimpleRecord> currentSnapshotRecords =
+        currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
+    List<SimpleRecord> expectedRecords = Lists.newArrayList();
+    expectedRecords.addAll(firstBatchRecords);
+    Assert.assertEquals(
+        "Current snapshot rows should match", expectedRecords, currentSnapshotRecords);
+  }
+
+  @Test
+  public void testSnapshotSelectionByBranch() throws IOException {
+    String tableLocation = temp.newFolder("iceberg-table").toString();
+
+    HadoopTables tables = new HadoopTables(CONF);
+    PartitionSpec spec = PartitionSpec.unpartitioned();
+    Table table = tables.create(SCHEMA, spec, tableLocation);
+
+    // produce the first snapshot
+    List<SimpleRecord> firstBatchRecords =
+        Lists.newArrayList(
+            new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
+    Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
+    firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
+
+    table.manageSnapshots().createBranch("branch", table.currentSnapshot().snapshotId()).commit();
+
+    // produce the second snapshot
+    List<SimpleRecord> secondBatchRecords =
+        Lists.newArrayList(
+            new SimpleRecord(4, "d"), new SimpleRecord(5, "e"), new SimpleRecord(6, "f"));
+    Dataset<Row> secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class);
+    secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
+
+    // verify records in the current snapshot by branch
+    Dataset<Row> currentSnapshotResult =
+        spark.read().format("iceberg").option("branch", "branch").load(tableLocation);
+    List<SimpleRecord> currentSnapshotRecords =
+        currentSnapshotResult.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
+    List<SimpleRecord> expectedRecords = Lists.newArrayList();
+    expectedRecords.addAll(firstBatchRecords);
+    Assert.assertEquals(
+        "Current snapshot rows should match", expectedRecords, currentSnapshotRecords);
+  }
+
+  @Test
+  public void testSnapshotSelectionByBranchAndTagFails() throws IOException {
+    String tableLocation = temp.newFolder("iceberg-table").toString();
+
+    HadoopTables tables = new HadoopTables(CONF);
+    PartitionSpec spec = PartitionSpec.unpartitioned();
+    Table table = tables.create(SCHEMA, spec, tableLocation);
+
+    // produce the first snapshot
+    List<SimpleRecord> firstBatchRecords =
+        Lists.newArrayList(
+            new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
+    Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
+    firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
+
+    table.manageSnapshots().createBranch("branch", table.currentSnapshot().snapshotId()).commit();
+    table.manageSnapshots().createTag("tag", table.currentSnapshot().snapshotId()).commit();
+
+    Assertions.assertThatThrownBy(
+            () ->
+                spark
+                    .read()
+                    .format("iceberg")
+                    .option(SparkReadOptions.TAG, "tag")
+                    .option(SparkReadOptions.BRANCH, "branch")
+                    .load(tableLocation)
+                    .show())
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageStartingWith("Cannot override ref, already set snapshot id=");
+  }
+
+  @Test
+  public void testSnapshotSelectionByTimestampAndBranchOrTagFails() throws IOException {
+    String tableLocation = temp.newFolder("iceberg-table").toString();
+
+    HadoopTables tables = new HadoopTables(CONF);
+    PartitionSpec spec = PartitionSpec.unpartitioned();
+    Table table = tables.create(SCHEMA, spec, tableLocation);
+
+    List<SimpleRecord> firstBatchRecords =
+        Lists.newArrayList(
+            new SimpleRecord(1, "a"), new SimpleRecord(2, "b"), new SimpleRecord(3, "c"));
+    Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
+    firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
+
+    long timestamp = System.currentTimeMillis();
+    table.manageSnapshots().createBranch("branch", table.currentSnapshot().snapshotId()).commit();
+    table.manageSnapshots().createTag("tag", table.currentSnapshot().snapshotId()).commit();
+
+    Assertions.assertThatThrownBy(
+            () ->
+                spark
+                    .read()
+                    .format("iceberg")
+                    .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp)
+                    .option(SparkReadOptions.BRANCH, "branch")
+                    .load(tableLocation)
+                    .show())
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageStartingWith("Cannot override ref, already set snapshot id=");
+
+    Assertions.assertThatThrownBy(
+            () ->
+                spark
+                    .read()
+                    .format("iceberg")
+                    .option(SparkReadOptions.AS_OF_TIMESTAMP, timestamp)
+                    .option(SparkReadOptions.TAG, "tag")
+                    .load(tableLocation)
+                    .show())
+        .isInstanceOf(IllegalArgumentException.class)
+        .hasMessageStartingWith("Cannot override ref, already set snapshot id=");
+  }
 }
```
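The asserted message suggests that branch, tag, snapshot-id, and as-of-timestamp all resolve to a single snapshot selection, with a guard against setting it twice. A hedged sketch of such a guard, using hypothetical names (the PR's actual validation may live elsewhere, e.g. in SparkReadConf):

```java
// Hypothetical sketch of the single-selection guard implied by the
// "Cannot override ref, already set snapshot id=" assertions above;
// not the PR's actual implementation.
class SnapshotSelection {
  private Long snapshotId = null;

  void useSnapshot(long newSnapshotId) {
    // Each of branch, tag, snapshot-id, and as-of-timestamp would funnel
    // into this method after resolving to a concrete snapshot id, so any
    // second option triggers the IllegalArgumentException the tests expect.
    if (snapshotId != null) {
      throw new IllegalArgumentException(
          "Cannot override ref, already set snapshot id=" + snapshotId);
    }
    snapshotId = newSnapshotId;
  }
}
```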