33 commits
9e8bf34  Spark 3.3 write to branch  (namrathamyske, Jan 23, 2023)
ee4cadb  Spark 3.3 write to branch refactoring by review comments  (namrathamyske, Jan 23, 2023)
3225506  Spark 3.3 write to branch refactoring by review comments  (namrathamyske, Jan 23, 2023)
e1dfa45  Spark 3.3 write to branch data write test  (namrathamyske, Jan 23, 2023)
58b4bf2  spotless  (namrathamyske, Jan 24, 2023)
8677134  checking if snapshot set is branch  (namrathamyske, Jan 24, 2023)
af17f25  Merge branch 'master' of https://github.com/apache/iceberg into spark…  (namrathamyske, Jan 25, 2023)
7642b9e  Spark: address comments for spark branch writes  (amogh-jahagirdar, Feb 1, 2023)
da9dcc0  Merge commit 'refs/pull/25/head' of https://github.com/namrathamyske/…  (namrathamyske, Feb 4, 2023)
ca8e1ff  Merge branch 'master' of https://github.com/apache/iceberg into spark…  (namrathamyske, Feb 7, 2023)
2e4eefe  review comments  (namrathamyske, Feb 11, 2023)
de20c76  review comments  (namrathamyske, Feb 11, 2023)
85d7475  spotless  (namrathamyske, Feb 11, 2023)
bbf57e3  review comments changes  (namrathamyske, Feb 12, 2023)
0e081e1  review comments changes  (namrathamyske, Feb 12, 2023)
51b1052  new line change reversal  (namrathamyske, Feb 12, 2023)
aa42e2e  Spark: Add tests for overwrite case  (amogh-jahagirdar, Feb 12, 2023)
03c962d  Merge pull request #26 from amogh-jahagirdar/spark-branch-writes-more…  (namrathamyske, Feb 17, 2023)
bed5ec3  nit review comments  (namrathamyske, Feb 17, 2023)
332064e  Merge branch 'master' of https://github.com/apache/iceberg into spark…  (namrathamyske, Feb 17, 2023)
6ef5f4e  Merge branch 'spark_writes' of https://github.com/namrathamyske/icebe…  (namrathamyske, Feb 17, 2023)
8ecfdcd  adding write conf back  (namrathamyske, Feb 17, 2023)
6b8f954  Remove SQL Write Conf, fail if write conf is specified for row level …  (amogh-jahagirdar, Feb 22, 2023)
f8b34bd  Merge branch 'master' into spark_writes  (amogh-jahagirdar, Feb 22, 2023)
a8a5d89  Merge branch 'master' into spark_writes  (amogh-jahagirdar, Feb 22, 2023)
7ee1689  Address cleanup  (amogh-jahagirdar, Feb 23, 2023)
64db07e  Allow non-existing branches in catalog#loadTable  (amogh-jahagirdar, Feb 23, 2023)
1b2cd5a  Merge branch 'master' into spark_writes  (amogh-jahagirdar, Feb 23, 2023)
4c94693  Remove Spark branch write option, use identifier in branch, merge/del…  (amogh-jahagirdar, Feb 26, 2023)
2f3d6e1  Add merge tests  (amogh-jahagirdar, Feb 27, 2023)
9bbed3a  Style  (amogh-jahagirdar, Feb 27, 2023)
51a29b3  Remove setting branch in scan  (amogh-jahagirdar, Feb 27, 2023)
b2692fe  Fix for metadata tables  (amogh-jahagirdar, Feb 27, 2023)
16 changes: 16 additions & 0 deletions core/src/main/java/org/apache/iceberg/util/SnapshotUtil.java
@@ -27,6 +27,7 @@
import org.apache.iceberg.HistoryEntry;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotRef;
import org.apache.iceberg.Table;
import org.apache.iceberg.exceptions.ValidationException;
import org.apache.iceberg.io.FileIO;
@@ -394,4 +395,19 @@ public static Schema schemaFor(Table table, Long snapshotId, Long timestampMilli

return table.schema();
}

/**
* Fetch the snapshot at the head of the given branch in the given table.
*
* @param table a {@link Table}
* @param branch a branch
* @return the latest snapshot for the given branch
*/
public static Snapshot latestSnapshot(Table table, String branch) {
if (SnapshotRef.MAIN_BRANCH.equals(branch)) {
return table.currentSnapshot();
} else {
return table.snapshot(branch);
}
}
}
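A quick usage sketch of the new helper (the table variable and branch name below are illustrative, not part of this diff):

    // Resolves the branch head: "main" goes through Table#currentSnapshot(),
    // any other ref goes through Table#snapshot(branchName).
    Snapshot head = SnapshotUtil.latestSnapshot(table, "audit-branch"); // hypothetical branch name
    // head is null if the branch does not exist yet (or the table has no current snapshot for main)
    long headSnapshotId = head != null ? head.snapshotId() : -1L;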
@@ -185,9 +185,13 @@ private Pair<Table, Long> load(Identifier ident) throws NoSuchTableException {
return Pair.of(table, SnapshotUtil.snapshotIdAsOfTime(table, asOfTimestamp));
} else if (branch != null) {
Snapshot branchSnapshot = table.snapshot(branch);
Preconditions.checkArgument(
branchSnapshot != null, "Cannot find snapshot associated with branch name: %s", branch);
return Pair.of(table, branchSnapshot.snapshotId());

// It's possible that the branch does not exist when performing writes to new branches.
// Load table should still succeed when spark is performing the write.
// Reads with invalid branches will fail at a later point
Long branchSnapshotId = branchSnapshot == null ? null : branchSnapshot.snapshotId();

return Pair.of(table, branchSnapshotId);
} else if (tag != null) {
Snapshot tagSnapshot = table.snapshot(tag);
Preconditions.checkArgument(
@@ -760,9 +760,12 @@ private Table loadFromPathIdentifier(PathIdentifier ident) {

} else if (branch != null) {
Snapshot branchSnapshot = table.snapshot(branch);
Preconditions.checkArgument(
branchSnapshot != null, "Cannot find snapshot associated with branch name: %s", branch);
return new SparkTable(table, branchSnapshot.snapshotId(), !cacheEnabled);

// It's possible that the branch does not exist when performing writes to new branches.
// Load table should still succeed when spark is performing the write.
// Reads performed on non-existing branches will fail at a later point
Long branchSnapshotId = branchSnapshot == null ? null : branchSnapshot.snapshotId();
return new SparkTable(table, branchSnapshotId, !cacheEnabled);
Comment on lines -763 to +765
amogh-jahagirdar (Contributor), Feb 23, 2023:
@namrathamyske @rdblue @aokolnychyi @jackye1995 I'm removing this check because this prevents writing to new branches. Catalog#loadTable gets called in spark when planning the write, and we fail the validation check that the branch snapshot exists. I added a test to validate that if a read on an invalid branch is performed we still fail (albeit later, when trying to build the scan).
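For context on the scenario this unblocks, a minimal sketch of a write to a not-yet-existing branch (the table identifier, input DataFrame, and branch name are illustrative; this mirrors the new tests further down):

    // Spark plans the write through Catalog#loadTable, so loading must tolerate a missing
    // branch snapshot; the branch ref is then created by the commit itself.
    Dataset<Row> df = inputData();                        // some input DataFrame (assumed)
    df.writeTo("catalog.db.tbl")                          // hypothetical table identifier
        .option(SparkWriteOptions.BRANCH, "new-branch")   // branch that does not exist yet
        .append();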


} else if (tag != null) {
Snapshot tagSnapshot = table.snapshot(tag);
@@ -27,6 +27,7 @@
import org.apache.iceberg.DistributionMode;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.IsolationLevel;
import org.apache.iceberg.SnapshotRef;
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
@@ -324,4 +325,12 @@ public boolean caseSensitive() {
.defaultValue(SQLConf.CASE_SENSITIVE().defaultValueString())
.parse();
}

public String branch() {
return confParser
.stringConf()
.option(SparkWriteOptions.BRANCH)
.defaultValue(SnapshotRef.MAIN_BRANCH)
amogh-jahagirdar (Contributor), Feb 22, 2023:
@rdblue @aokolnychyi @namrathamyske @jackye1995 As discussed in the community sync, we'll remove the SQL write option from this PR and take that separately since for row level operations there's more fundamental changes that need to happen. I have those changes ready along with updated merge tests, so I'll publish that so folks can take a look

Contributor:
Makes sense to me.

.parse();
}
}
@@ -77,4 +77,7 @@ private SparkWriteOptions() {}

// Isolation Level for DataFrame calls. Currently supported by overwritePartitions
public static final String ISOLATION_LEVEL = "isolation-level";

// Branch to write to
public static final String BRANCH = "branch";
}
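For reference, a sketch of how the new option is expected to be passed through the DataFrameWriter API (the location is illustrative; SparkWriteConf#branch() resolves the value and defaults to main):

    df.write()
        .format("iceberg")
        .option(SparkWriteOptions.BRANCH, "test-branch") // picked up by SparkWriteConf#branch()
        .mode("append")
        .save(tableLocation);                            // hypothetical table location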
@@ -22,6 +22,7 @@
import org.apache.iceberg.IsolationLevel;
import org.apache.iceberg.MetadataColumns;
import org.apache.iceberg.Schema;
import org.apache.iceberg.SnapshotRef;
import org.apache.iceberg.Table;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.spark.SparkDistributionAndOrderingUtil;
@@ -81,6 +82,11 @@ public DeltaWrite build() {
handleTimestampWithoutZone || !SparkUtil.hasTimestampWithoutZone(table.schema()),
SparkUtil.TIMESTAMP_WITHOUT_TIMEZONE_ERROR);

if (!writeConf.branch().equalsIgnoreCase(SnapshotRef.MAIN_BRANCH)) {
throw new UnsupportedOperationException(
"Row-level operations are currently supported only on the main branch");
}

Schema dataSchema = dataSchema();
if (dataSchema != null) {
TypeUtil.validateWriteSchema(table.schema(), dataSchema, checkNullability, checkOrdering);
@@ -52,6 +52,7 @@
import org.apache.iceberg.spark.SparkReadOptions;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.spark.SparkUtil;
import org.apache.iceberg.spark.SparkWriteOptions;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.SnapshotUtil;
import org.apache.spark.sql.SparkSession;
@@ -250,7 +251,9 @@ public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) {
@Override
public WriteBuilder newWriteBuilder(LogicalWriteInfo info) {
Preconditions.checkArgument(
snapshotId == null, "Cannot write to table at a specific snapshot: %s", snapshotId);
Contributor:
Why is this no longer valid? I think that we do not want to write to a specific snapshot. Is branch somehow passed as the snapshot ID?

Contributor:
After looking into this a bit more, I think this is incorrect. The snapshotId is set when the table is loaded using time travel syntax. I don't think that we want to allow that.

namrathamyske (Contributor, Author), Jan 24, 2023:
@rdblue Can we add more checks so that if the snapshot ID is the tip of the branch, then writing to the branch is supported? If it's the tip of the branch, then the Spark write should be supported.

I believe when we do spark...save(table); we are calling catalog.loadTable(ident) in DataFrameWriter. When passing spark..option("branch",".."), the snapshotId() is getting set.
Contributor:
Looks like this isn't an issue. I reverted this change and ran TestSparkDataWrite and everything passes. Let's revert this and run CI. If there are other issues outside of that test class, I'll take a look.

namrathamyske (Contributor, Author), Feb 1, 2023:
@rdblue @amogh-jahagirdar if the bug fix for reads by snapshot ref (#6717) gets merged, then writes to a branch snapshot will fail per the test TestDeleteFrom.java. That's because of the above condition. I feel we have to tweak the condition if this is going to stay.

amogh-jahagirdar (Contributor), Feb 7, 2023:
Actually it seems the issue is that catalog.loadTable(table) interprets the branch option as the branch read option (because both are called "branch", and we have to load the table before doing the write, it can't differentiate whether it's for a write or not). Couldn't we just have a different config name when doing writes?

amogh-jahagirdar (Contributor), Feb 7, 2023:
@namrathamyske Yeah, just updated to use the name write-branch and tests are passing. The issue is that the name 'branch' is used for both the read and write options, and when loadTable is performed while doing the write, it is treated as time travel. We should disambiguate the two. I think we should actually call it something else for the write case. write-branch sounds a bit odd to me, so maybe we go with toBranch, which would be consistent with the API and with what's being done in the Flink PR. But we don't necessarily need parity there; whatever is the Spark naming convention and makes sense for users. @aokolnychyi @rdblue any suggestions there?

namrathamyske (Contributor, Author), Feb 7, 2023:
But I think we can't disregard calling loadTable with respect to the ref passed. Later, when we implement session configs for INSERT and DELETE operations, there is a lot of overlap between read and write. Spark logical plans call the SparkScanBuilder, which should use the read time-travel config. SparkCopyOnWrite and SparkMergeOnRead have their own scan builders, which inherit from SparkScanBuilder. I will include the changes in this PR. It's still WIP.

Contributor:
Good point @namrathamyske, I was a bit short-sighted; we actually do want to leverage the statistics for the specific snapshot for writes. These statistics would be used during the scan itself (for example, MERGE INTO a branch). So either we 1) find a good way to differentiate between a time-travel query, where the write shouldn't be applied, and an intentional write on a branch, or 2) just relax the check that a snapshot is set, as you did earlier.

Contributor (Author):
@rdblue @amogh-jahagirdar @jackye1995 this is still an open item for this PR to get merged. I would prefer to go with the second option, but let me know otherwise!

snapshotId == null || info.options().get(SparkWriteOptions.BRANCH) != null,
amogh-jahagirdar (Contributor), Feb 23, 2023:
@rdblue @aokolnychyi @namrathamyske This goes back to the thread above. After merging #6717, the SparkDataWrite tests fail at this validation because we now set a snapshot ID to determine the schema. My solution here is to use the LogicalWriteInfo check info.options().get(SparkWriteOptions.BRANCH) != null, because this serves as an indication that we're not writing in a time-travel operation but rather doing a branch write, where we expect a snapshot ID to be set.

namrathamyske (Contributor, Author), Feb 23, 2023:
@amogh-jahagirdar I am good with this approach! Might need to think of a bigger refactoring in future. Would like to hear what others say.

Contributor:
In this case I think we need to also check that the snapshot ID is the branch head that is written to?

namrathamyske (Contributor, Author), Feb 23, 2023:
I think if branch is set, snapshot ID will be set to branch head in code. But we can add an additional check too.

Contributor:
@jackye1995 yeah, that's a safer guarantee. I was wondering whether it's really necessary based on how we build the scan (I don't see how the snapshot can be anything but the branch head if a branch is set), but this is probably a case where we want to be defensive. So I'm inclined to change it; will wait for @aokolnychyi @rdblue's thoughts.

"Cannot write to table at a specific snapshot: %s",
snapshotId);

return new SparkWriteBuilder(sparkSession(), icebergTable, info);
}
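Pieced together from the hunk above (the comment threads interleave with it), the updated guard reads roughly as follows:

    // Allow a snapshot ID only when it was set because the branch write option is present;
    // plain time-travel writes are still rejected.
    Preconditions.checkArgument(
        snapshotId == null || info.options().get(SparkWriteOptions.BRANCH) != null,
        "Cannot write to table at a specific snapshot: %s",
        snapshotId);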
@@ -69,6 +69,7 @@
import org.apache.iceberg.spark.FileRewriteCoordinator;
import org.apache.iceberg.spark.SparkWriteConf;
import org.apache.iceberg.util.PropertyUtil;
import org.apache.iceberg.util.SnapshotUtil;
import org.apache.iceberg.util.Tasks;
import org.apache.iceberg.util.ThreadPools;
import org.apache.spark.TaskContext;
@@ -105,6 +106,7 @@ abstract class SparkWrite implements Write, RequiresDistributionAndOrdering {
private final String applicationId;
private final boolean wapEnabled;
private final String wapId;
private final String branch;
private final long targetFileSize;
private final Schema writeSchema;
private final StructType dsSchema;
@@ -133,6 +135,7 @@ abstract class SparkWrite implements Write, RequiresDistributionAndOrdering {
this.applicationId = applicationId;
this.wapEnabled = writeConf.wapEnabled();
this.wapId = writeConf.wapId();
this.branch = writeConf.branch();
this.targetFileSize = writeConf.targetDataFileSize();
this.writeSchema = writeSchema;
this.dsSchema = dsSchema;
@@ -218,6 +221,7 @@ private void commitOperation(SnapshotUpdate<?> operation, String description) {

try {
long start = System.currentTimeMillis();
operation.toBranch(branch);
operation.commit(); // abort is automatically called if this fails
long duration = System.currentTimeMillis() - start;
LOG.info("Committed in {} ms", duration);
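The branch routing itself relies on the core SnapshotUpdate#toBranch API; a minimal sketch outside of Spark (table, data file, and branch name are illustrative):

    // Append a data file to a named branch; the ref is created on commit if it does not
    // exist yet, which is what the new branch-write tests exercise.
    table.newAppend()
        .appendFile(dataFile)        // a DataFile built elsewhere (assumed)
        .toBranch("audit-branch")    // hypothetical branch name
        .commit();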
@@ -536,7 +540,7 @@ protected <T> void commit(SnapshotUpdate<T> snapshotUpdate, long epochId, String
}

private Long findLastCommittedEpochId() {
Snapshot snapshot = table.currentSnapshot();
Snapshot snapshot = SnapshotUtil.latestSnapshot(table, branch);
Long lastCommittedEpochId = null;
while (snapshot != null) {
Map<String, String> summary = snapshot.summary();
@@ -24,6 +24,7 @@
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.spark.SparkTestBaseWithCatalog;
import org.apache.iceberg.spark.SparkWriteOptions;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.AnalysisException;
import org.apache.spark.sql.Dataset;
@@ -193,6 +194,46 @@ public void testMergeSchemaIcebergProperty() throws Exception {
sql("select * from %s order by id", tableName));
}

@Test
public void testMergeSchemaOnBranch() throws Exception {
String branch = "test-branch";

sql(
"ALTER TABLE %s SET TBLPROPERTIES ('%s'='true')",
tableName, TableProperties.SPARK_WRITE_ACCEPT_ANY_SCHEMA);

Dataset<Row> twoColDF =
jsonToDF(
"id bigint, data string",
"{ \"id\": 1, \"data\": \"a\" }",
"{ \"id\": 2, \"data\": \"b\" }");

twoColDF.writeTo(tableName).option(SparkWriteOptions.BRANCH, branch).append();

assertEquals(
"Should have initial 2-column rows",
ImmutableList.of(row(1L, "a"), row(2L, "b")),
sql("select * from %s version as of '%s' order by id", tableName, branch));

Dataset<Row> threeColDF =
jsonToDF(
"id bigint, data string, new_col float",
"{ \"id\": 3, \"data\": \"c\", \"new_col\": 12.06 }",
"{ \"id\": 4, \"data\": \"d\", \"new_col\": 14.41 }");

threeColDF
.writeTo(tableName)
.option(SparkWriteOptions.MERGE_SCHEMA, "true")
.option(SparkWriteOptions.BRANCH, branch)
.append();

assertEquals(
"Should have 3-column rows",
ImmutableList.of(
row(1L, "a", null), row(2L, "b", null), row(3L, "c", 12.06F), row(4L, "d", 14.41F)),
sql("select * from %s version as of '%s' order by id", tableName, branch));
}

@Test
public void testWriteWithCaseSensitiveOption() throws NoSuchTableException, ParseException {
SparkSession sparkSession = spark.cloneSession();
@@ -36,6 +36,7 @@
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Snapshot;
import org.apache.iceberg.SnapshotRef;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableProperties;
import org.apache.iceberg.avro.Avro;
@@ -44,13 +45,15 @@
import org.apache.iceberg.io.FileAppender;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.spark.SparkReadOptions;
import org.apache.iceberg.spark.SparkSQLProperties;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.spark.SparkWriteOptions;
import org.apache.iceberg.spark.data.AvroDataTest;
import org.apache.iceberg.spark.data.RandomData;
import org.apache.iceberg.spark.data.SparkAvroReader;
import org.apache.iceberg.types.Types;
import org.apache.iceberg.util.SnapshotUtil;
import org.apache.spark.SparkException;
import org.apache.spark.TaskContext;
import org.apache.spark.api.java.JavaRDD;
@@ -156,6 +159,18 @@ public void testWriteWithCustomDataLocation() throws IOException {
writeAndValidateWithLocations(table, location, tablePropertyDataLocation);
}

@Test
public void testBranchWriteWithCustomDataLocation() throws IOException {
File location = createTableFolder();
File tablePropertyDataLocation = temp.newFolder("test-table-property-data-dir");
Table table = createTable(new Schema(SUPPORTED_PRIMITIVES.fields()), location);
table
.updateProperties()
.set(TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath())
.commit();
writeAndValidateWithLocations(table, location, tablePropertyDataLocation, "test-branch");
}

private File createTableFolder() throws IOException {
File parent = temp.newFolder("parquet");
File location = new File(parent, "test");
@@ -170,16 +185,21 @@ private Table createTable(Schema schema, File location) {

private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir)
throws IOException {
writeAndValidateWithLocations(table, location, expectedDataDir, SnapshotRef.MAIN_BRANCH);
}

private void writeAndValidateWithLocations(
Table table, File location, File expectedDataDir, String branch) throws IOException {
Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

Iterable<Record> expected = RandomData.generate(tableSchema, 100, 0L);
writeData(expected, tableSchema, location.toString());
writeData(expected, tableSchema, location.toString(), branch);

table.refresh();

List<Row> actual = readTable(location.toString());
List<Row> actual = readTable(location.toString(), branch);

Iterator<Record> expectedIter = expected.iterator();
Iterator<Row> actualIter = actual.iterator();
@@ -189,8 +209,7 @@ private void writeAndValidateWithLocations(Table table, File location, File expe
Assert.assertEquals(
"Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext());

table
.currentSnapshot()
SnapshotUtil.latestSnapshot(table, branch)
.addedDataFiles(table.io())
.forEach(
dataFile ->
@@ -204,15 +223,26 @@ private void writeAndValidateWithLocations(Table table, File location, File expe
}

private List<Row> readTable(String location) {
Dataset<Row> result = spark.read().format("iceberg").load(location);
return readTable(location, SnapshotRef.MAIN_BRANCH);
}

private List<Row> readTable(String location, String branch) {
Dataset<Row> result =
spark.read().format("iceberg").option(SparkReadOptions.BRANCH, branch).load(location);
Contributor:
Minor: In tests, I'd generally prefer not setting the branch option when we intend to write to main. Otherwise we're not testing the default case. I know that it is currently equivalent, but it seems like a gap that could eventually introduce errors if all of our tests use a specific branch.

amogh-jahagirdar (Contributor), Feb 23, 2023:
Good point, is it alright if I get this in a follow-on PR before the release? That way that PR can focus on how we want to organize branch tests, which I think is the more fundamental thing.


return result.collectAsList();
}

private void writeData(Iterable<Record> records, Schema schema, String location)
throws IOException {
writeData(records, schema, location, SnapshotRef.MAIN_BRANCH);
}

private void writeData(Iterable<Record> records, Schema schema, String location, String branch)
throws IOException {
Dataset<Row> df = createDataset(records, schema);
DataFrameWriter<?> writer = df.write().format("iceberg").mode("append");
DataFrameWriter<?> writer =
df.write().format("iceberg").option(SparkWriteOptions.BRANCH, branch).mode("append");
writer.save(location);
}

@@ -177,6 +177,26 @@ public void testSnapshotSelectionByInvalidSnapshotId() throws IOException {
.hasMessage("Cannot find snapshot with ID -10");
}

@Test
public void testSnapshotSelectionByInvalidBranch() throws IOException {
String tableLocation = temp.newFolder("iceberg-table").toString();

HadoopTables tables = new HadoopTables(CONF);
PartitionSpec spec = PartitionSpec.unpartitioned();
tables.create(SCHEMA, spec, tableLocation);

Dataset<Row> df =
spark
.read()
.format("iceberg")
.option(SparkReadOptions.BRANCH, "non-existing-branch")
.load(tableLocation);

Assertions.assertThatThrownBy(df::collectAsList)
.isInstanceOf(IllegalArgumentException.class)
.hasMessage("Cannot find ref non-existing-branch");
}

@Test
public void testSnapshotSelectionByInvalidTimestamp() throws IOException {
long timestamp = System.currentTimeMillis();