Spark 3.3 write to branch snapshot #6651
Changes from 27 commits
```diff
@@ -27,6 +27,7 @@
 import org.apache.iceberg.DistributionMode;
 import org.apache.iceberg.FileFormat;
 import org.apache.iceberg.IsolationLevel;
+import org.apache.iceberg.SnapshotRef;
 import org.apache.iceberg.SnapshotSummary;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableProperties;
@@ -324,4 +325,12 @@ public boolean caseSensitive() {
         .defaultValue(SQLConf.CASE_SENSITIVE().defaultValueString())
         .parse();
   }
+
+  public String branch() {
+    return confParser
+        .stringConf()
+        .option(SparkWriteOptions.BRANCH)
+        .defaultValue(SnapshotRef.MAIN_BRANCH)
+        .parse();
+  }
 }
```
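As a usage sketch (hedged; the dataset df, table path, and branch name below are placeholders, not taken from this PR), the new option is the write-side counterpart of the read branch option: when SparkWriteOptions.BRANCH is not supplied, the branch() conf added above falls back to SnapshotRef.MAIN_BRANCH, so existing writes keep targeting main.

```java
// Assumes an active SparkSession and an existing Dataset<Row> named df whose schema
// matches the target Iceberg table; the path and branch name are illustrative only.

// Explicit branch write: the branch() conf resolves to "test-branch".
df.write()
    .format("iceberg")
    .option(SparkWriteOptions.BRANCH, "test-branch")
    .mode("append")
    .save("/tmp/warehouse/db/table");

// No branch option: branch() falls back to SnapshotRef.MAIN_BRANCH ("main"),
// so the write behaves exactly as it did before this change.
df.write()
    .format("iceberg")
    .mode("append")
    .save("/tmp/warehouse/db/table");
```

The test changes further down in this diff use the same pattern through DataFrameWriter#option.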
```diff
@@ -52,6 +52,7 @@
 import org.apache.iceberg.spark.SparkReadOptions;
 import org.apache.iceberg.spark.SparkSchemaUtil;
 import org.apache.iceberg.spark.SparkUtil;
+import org.apache.iceberg.spark.SparkWriteOptions;
 import org.apache.iceberg.util.PropertyUtil;
 import org.apache.iceberg.util.SnapshotUtil;
 import org.apache.spark.sql.SparkSession;
```
```diff
@@ -250,7 +251,9 @@ public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) {
   @Override
   public WriteBuilder newWriteBuilder(LogicalWriteInfo info) {
     Preconditions.checkArgument(
-        snapshotId == null, "Cannot write to table at a specific snapshot: %s", snapshotId);
```
Contributor: Why is this no longer valid? I think that we do not want to write to a specific snapshot. Is the branch somehow passed as the snapshot ID?

Contributor: After looking into this a bit more, I think this is incorrect. The …

Contributor (Author): @rdblue Can we add more checks so that if the snapshot ID is the tip of the branch, then writing to the branch is supported? I believe when we do …, we are calling …, and when passing …, the snapshotId() is getting set.

Contributor: Looks like this isn't an issue. I reverted this change and ran …

Contributor (Author): @rdblue @amogh-jahagirdar If the bug fix for reading by snapshot ref (#6717) gets merged, then writing to a branch snapshot will fail, as per the test in TestDeleteFrom.java. That's because of the above condition. I feel we have to tweak the condition if it is going to stay.

Contributor: Actually it seems the issue is that …

Contributor: @namrathamyske Yeah, just updated to use the name.

Contributor (Author): But I think we can't disregard calling loadTable with respect to the ref passed. Later in the future, when we implement session configs, for testing:
iceberg/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java Line 260 in 32a8ef5
iceberg/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java Line 424 in 32a8ef5
iceberg/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java Line 393 in 32a8ef5

Contributor: Good point @namrathamyske, I was a bit short-sighted: we actually do want to leverage the statistics for the specific snapshot for writes. These statistics would be used during the scan itself (for example, MERGE INTO a branch). So either we 1) find a good way to differentiate between a time-travel query, where the write shouldn't be applied, and an intentional write to a branch, or 2) just relax the check that a snapshot is set, as you did earlier.

Contributor (Author): @rdblue @amogh-jahagirdar @jackye1995 This is still an open item for this PR to get merged. I would prefer to go with the second option, but let me know otherwise!
```diff
+        snapshotId == null || info.options().get(SparkWriteOptions.BRANCH) != null,
+        "Cannot write to table at a specific snapshot: %s",
+        snapshotId);
     return new SparkWriteBuilder(sparkSession(), icebergTable, info);
   }
```
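To make the relaxed guard concrete, here is a minimal standalone sketch (illustrative only; the real check lives in newWriteBuilder above and reads the option from LogicalWriteInfo#options()): a pinned snapshot still rejects the write unless the caller explicitly asked for a branch, in which case the snapshot ID is just the resolved branch tip.

```java
import java.util.Map;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.spark.SparkWriteOptions;

// Standalone sketch of the relaxed write guard discussed in the thread above; the class
// and method names here are made up for illustration.
public class BranchWriteGuardSketch {

  static void checkWriteAllowed(Long snapshotId, Map<String, String> writeOptions) {
    // A pinned snapshot normally means time travel, which must not accept writes.
    // An explicit branch option is the exception: the snapshot is only the branch tip.
    Preconditions.checkArgument(
        snapshotId == null || writeOptions.get(SparkWriteOptions.BRANCH) != null,
        "Cannot write to table at a specific snapshot: %s",
        snapshotId);
  }

  public static void main(String[] args) {
    checkWriteAllowed(null, Map.of()); // plain write, no snapshot pinned: allowed
    checkWriteAllowed(123L, Map.of(SparkWriteOptions.BRANCH, "audit-branch")); // branch write: allowed
    checkWriteAllowed(123L, Map.of()); // time travel without a branch option: throws
  }
}
```

Option 2 from the thread (just relaxing the check) is exactly what the added condition does; distinguishing a genuine time-travel write attempt would require more state than this sketch models.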
```diff
@@ -36,6 +36,7 @@
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.Snapshot;
+import org.apache.iceberg.SnapshotRef;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableProperties;
 import org.apache.iceberg.avro.Avro;
```
```diff
@@ -44,13 +45,15 @@
 import org.apache.iceberg.io.FileAppender;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.spark.SparkReadOptions;
 import org.apache.iceberg.spark.SparkSQLProperties;
 import org.apache.iceberg.spark.SparkSchemaUtil;
+import org.apache.iceberg.spark.SparkWriteOptions;
 import org.apache.iceberg.spark.data.AvroDataTest;
 import org.apache.iceberg.spark.data.RandomData;
 import org.apache.iceberg.spark.data.SparkAvroReader;
 import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.SnapshotUtil;
 import org.apache.spark.SparkException;
 import org.apache.spark.TaskContext;
 import org.apache.spark.api.java.JavaRDD;
```
```diff
@@ -156,6 +159,18 @@ public void testWriteWithCustomDataLocation() throws IOException {
     writeAndValidateWithLocations(table, location, tablePropertyDataLocation);
   }

+  @Test
+  public void testBranchWriteWithCustomDataLocation() throws IOException {
+    File location = createTableFolder();
+    File tablePropertyDataLocation = temp.newFolder("test-table-property-data-dir");
+    Table table = createTable(new Schema(SUPPORTED_PRIMITIVES.fields()), location);
+    table
+        .updateProperties()
+        .set(TableProperties.WRITE_DATA_LOCATION, tablePropertyDataLocation.getAbsolutePath())
+        .commit();
+    writeAndValidateWithLocations(table, location, tablePropertyDataLocation, "test-branch");
+  }
+
   private File createTableFolder() throws IOException {
     File parent = temp.newFolder("parquet");
     File location = new File(parent, "test");
```
```diff
@@ -170,16 +185,21 @@ private Table createTable(Schema schema, File location) {
   private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir)
       throws IOException {
+    writeAndValidateWithLocations(table, location, expectedDataDir, SnapshotRef.MAIN_BRANCH);
+  }
+
+  private void writeAndValidateWithLocations(
+      Table table, File location, File expectedDataDir, String branch) throws IOException {
     Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

     table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

     Iterable<Record> expected = RandomData.generate(tableSchema, 100, 0L);
-    writeData(expected, tableSchema, location.toString());
+    writeData(expected, tableSchema, location.toString(), branch);

     table.refresh();

-    List<Row> actual = readTable(location.toString());
+    List<Row> actual = readTable(location.toString(), branch);

     Iterator<Record> expectedIter = expected.iterator();
     Iterator<Row> actualIter = actual.iterator();
```
```diff
@@ -189,8 +209,7 @@ private void writeAndValidateWithLocations(Table table, File location, File expe
     Assert.assertEquals(
         "Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext());

-    table
-        .currentSnapshot()
+    SnapshotUtil.latestSnapshot(table, branch)
         .addedDataFiles(table.io())
         .forEach(
             dataFile ->
```
```diff
@@ -204,15 +223,26 @@ private void writeAndValidateWithLocations(Table table, File location, File expe
   }

   private List<Row> readTable(String location) {
-    Dataset<Row> result = spark.read().format("iceberg").load(location);
+    return readTable(location, SnapshotRef.MAIN_BRANCH);
+  }
+
+  private List<Row> readTable(String location, String branch) {
+    Dataset<Row> result =
+        spark.read().format("iceberg").option(SparkReadOptions.BRANCH, branch).load(location);
```
Contributor: Minor: in tests, I'd generally prefer not setting the branch option when we intend to write to main; otherwise we're not testing the default case. I know that it is currently equivalent, but it seems like a gap that could eventually introduce errors if all of our tests use a specific branch.

Contributor: Good point. Is it alright if I get this in a follow-on PR before the release? That way that PR can focus on how we want to organize branch tests, which I think is a more fundamental question.
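A hedged sketch of what that follow-up could look like (the helper below is an assumption, not part of this PR; it is written to slot into the test class above, which already imports Dataset, Row, DataFrameWriter, SnapshotRef, and SparkWriteOptions): only set the branch option when a test really targets a non-main branch, so the option-less default path stays covered.

```java
// Assumed helper (not in this PR): skip SparkWriteOptions.BRANCH when the target is main,
// so writes to main keep exercising the default code path.
private DataFrameWriter<Row> newBranchAwareWriter(Dataset<Row> df, String branch) {
  DataFrameWriter<Row> writer = df.write().format("iceberg").mode("append");
  if (!SnapshotRef.MAIN_BRANCH.equals(branch)) {
    writer = writer.option(SparkWriteOptions.BRANCH, branch);
  }
  return writer;
}
```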
```diff

     return result.collectAsList();
   }

   private void writeData(Iterable<Record> records, Schema schema, String location)
       throws IOException {
+    writeData(records, schema, location, SnapshotRef.MAIN_BRANCH);
+  }
+
+  private void writeData(Iterable<Record> records, Schema schema, String location, String branch)
+      throws IOException {
     Dataset<Row> df = createDataset(records, schema);
-    DataFrameWriter<?> writer = df.write().format("iceberg").mode("append");
+    DataFrameWriter<?> writer =
+        df.write().format("iceberg").option(SparkWriteOptions.BRANCH, branch).mode("append");
     writer.save(location);
   }
```
@namrathamyske @rdblue @aokolnychyi @jackye1995 I'm removing this check because it prevents writing to new branches. Catalog#loadTable gets called in Spark when planning the write, and we fail the validation check that the branch snapshot exists. I added a test to validate that if a read on an invalid branch is performed, we still fail (albeit later, when trying to build the scan).
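A hedged sketch of the kind of test described here (the test name, branch name, table location, and asserted exception are assumptions, not the PR's actual code): reading a branch that does not exist should still fail once the scan is built, even with the write-path check removed.

```java
// Assumed test shape (illustrative only): the concrete exception type and message that
// Iceberg raises for an unknown ref may differ from what is asserted below.
@Test
public void testReadFromNonExistingBranchFails() {
  org.assertj.core.api.Assertions.assertThatThrownBy(
          () ->
              spark
                  .read()
                  .format("iceberg")
                  .option(SparkReadOptions.BRANCH, "branch-that-does-not-exist")
                  .load(tableLocation) // tableLocation: path to an existing test table
                  .collectAsList())
      .isInstanceOf(Exception.class)
      .hasMessageContaining("branch-that-does-not-exist");
}
```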