apache · rdblue · Nov 9, 2022 · Mar 30, 2022 · Jun 28, 2022 · Jun 29, 2022
diff --git a/api/src/main/java/org/apache/iceberg/TableScan.java b/api/src/main/java/org/apache/iceberg/TableScan.java
@@ -49,6 +49,16 @@ public interface TableScan extends Scan<TableScan, FileScanTask, CombinedScanTas
    */
   TableScan asOfTime(long timestampMillis);
 
+  /**
+   * Create a new {@link TableScan} from this scan's configuration that will read the snapshot
+   * that the given snapshot ref (for example a branch or tag name) points to.
+   *
+   * @param snapshotRef the name of a snapshot ref
+   * @return a new scan based on this that reads the snapshot the given ref points to
+   * @throws IllegalArgumentException if a snapshot cannot be found for the given ref name
+   */
+  TableScan useSnapshotRef(String snapshotRef);
+
   /**
    * Create a new {@link TableScan} from this that will read the given data columns. This produces
    * an expected schema that includes all fields that are either selected or used by this scan's

diff --git a/core/src/main/java/org/apache/iceberg/BaseTableScan.java b/core/src/main/java/org/apache/iceberg/BaseTableScan.java
@@ -94,6 +94,13 @@ public TableScan useSnapshot(long scanSnapshotId) {
         tableOps(), table(), tableSchema(), context().useSnapshotId(scanSnapshotId));
   }
 
+  @Override
+  public TableScan useSnapshotRef(String snapshotRef) {
+    Preconditions.checkArgument(
+        table().snapshot(snapshotRef) != null, "Cannot find ref with name %s", snapshotRef);
+    return useSnapshot(table().snapshot(snapshotRef).snapshotId());
+  }
+
   @Override
   public TableScan asOfTime(long timestampMillis) {
     Preconditions.checkArgument(

diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadConf.java
@@ -94,6 +94,14 @@ public Long endSnapshotId() {
     return confParser.longConf().option(SparkReadOptions.END_SNAPSHOT_ID).parseOptional();
   }
 
+  public String branch() {
+    return confParser.stringConf().option(SparkReadOptions.BRANCH).parseOptional();
+  }
+
+  public String tag() {
+    return confParser.stringConf().option(SparkReadOptions.TAG).parseOptional();
+  }
+
   public String fileScanTaskSetId() {
     return confParser.stringConf().option(SparkReadOptions.FILE_SCAN_TASK_SET_ID).parseOptional();
   }

diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java
@@ -35,6 +35,12 @@ private SparkReadOptions() {}
   // A timestamp in milliseconds; the snapshot used will be the snapshot current at this time.
   public static final String AS_OF_TIMESTAMP = "as-of-timestamp";
 
+  // Name of the branch ref to read from; the scan reads the snapshot that this ref points to
+  public static final String BRANCH = "branch";
+
+  // Name of the tag ref to read from; the scan reads the snapshot that this ref points to
+  public static final String TAG = "tag";
+
   // Overrides the table's read.split.target-size and read.split.metadata-target-size
   public static final String SPLIT_SIZE = "split-size";
 

diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkBatchQueryScan.java
@@ -67,6 +67,8 @@ class SparkBatchQueryScan extends SparkScan implements SupportsRuntimeFiltering
   private final Long startSnapshotId;
   private final Long endSnapshotId;
   private final Long asOfTimestamp;
+  private final String branch;
+  private final String tag;
   private final List<Expression> runtimeFilterExpressions;
 
   private Set<Integer> specIds = null; // lazy cache of scanned spec IDs
@@ -88,6 +90,8 @@ class SparkBatchQueryScan extends SparkScan implements SupportsRuntimeFiltering
     this.startSnapshotId = readConf.startSnapshotId();
     this.endSnapshotId = readConf.endSnapshotId();
     this.asOfTimestamp = readConf.asOfTimestamp();
+    this.branch = readConf.branch();
+    this.tag = readConf.tag();
     this.runtimeFilterExpressions = Lists.newArrayList();
 
     if (scan == null) {
@@ -244,6 +248,13 @@ public Statistics estimateStatistics() {
       Snapshot snapshot = table().snapshot(snapshotIdAsOfTime);
       return estimateStatistics(snapshot);
 
+    } else if (branch != null) {
+      Snapshot snapshot = table().snapshot(branch);
+      return estimateStatistics(snapshot);
+
+    } else if (tag != null) {
+      Snapshot snapshot = table().snapshot(tag);
+      return estimateStatistics(snapshot);
     } else {
       Snapshot snapshot = table().currentSnapshot();
       return estimateStatistics(snapshot);
@@ -269,7 +280,9 @@ && readSchema().equals(that.readSchema())
         && Objects.equals(snapshotId, that.snapshotId)
         && Objects.equals(startSnapshotId, that.startSnapshotId)
         && Objects.equals(endSnapshotId, that.endSnapshotId)
-        && Objects.equals(asOfTimestamp, that.asOfTimestamp);
+        && Objects.equals(asOfTimestamp, that.asOfTimestamp)
+        && Objects.equals(branch, that.branch)
+        && Objects.equals(tag, that.tag);
   }
 
   @Override
@@ -282,7 +295,9 @@ public int hashCode() {
         snapshotId,
         startSnapshotId,
         endSnapshotId,
-        asOfTimestamp);
+        asOfTimestamp,
+        branch,
+        tag);
   }
 
   @Override

diff --git a/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java b/spark/v3.3/spark/src/main/java/org/apache/iceberg/spark/source/SparkScanBuilder.java
@@ -21,6 +21,7 @@
 import java.util.List;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
+import org.apache.arrow.util.Preconditions;
 import org.apache.iceberg.MetadataColumns;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.Snapshot;
@@ -182,13 +183,28 @@ private Schema schemaWithMetadataColumns() {
   public Scan build() {
     Long snapshotId = readConf.snapshotId();
     Long asOfTimestamp = readConf.asOfTimestamp();
+    String branch = readConf.branch();
+    String tag = readConf.tag();
+
+    Preconditions.checkArgument(
+        branch == null || tag == null,
+        "Cannot set both %s and %s to select which table snapshot to scan",
+        SparkReadOptions.BRANCH,
+        SparkReadOptions.TAG);
 
     Preconditions.checkArgument(
         snapshotId == null || asOfTimestamp == null,
         "Cannot set both %s and %s to select which table snapshot to scan",
         SparkReadOptions.SNAPSHOT_ID,
         SparkReadOptions.AS_OF_TIMESTAMP);
 
+    String snapshotRef = branch != null ? branch : tag;
+    Preconditions.checkArgument(
+        snapshotId == null || snapshotRef == null,
+        "Cannot set both %s and %s to select which table snapshot to scan",
+        SparkReadOptions.SNAPSHOT_ID,
+        "branch/tag");
+
     Long startSnapshotId = readConf.startSnapshotId();
     Long endSnapshotId = readConf.endSnapshotId();
 
@@ -225,6 +241,12 @@ public Scan build() {
       scan = scan.asOfTime(asOfTimestamp);
     }
 
+    // useSnapshotRef returns a new scan instead of modifying this one, so keep the result
+    if (branch != null) {
+      scan = scan.useSnapshotRef(branch);
+    } else if (tag != null) {
+      scan = scan.useSnapshotRef(tag);
+    }
+
     if (startSnapshotId != null) {
       if (endSnapshotId != null) {
         scan = scan.appendsBetween(startSnapshotId, endSnapshotId);
@@ -240,7 +262,10 @@ public Scan build() {
 
   public Scan buildMergeOnReadScan() {
     Preconditions.checkArgument(
-        readConf.snapshotId() == null && readConf.asOfTimestamp() == null,
+        readConf.snapshotId() == null
+            && readConf.asOfTimestamp() == null
+            && readConf.branch() == null
+            && readConf.tag() == null,
         "Cannot set time travel options %s and %s for row-level command scans",
         SparkReadOptions.SNAPSHOT_ID,
         SparkReadOptions.AS_OF_TIMESTAMP);

diff --git a/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java b/spark/v3.3/spark/src/test/java/org/apache/iceberg/spark/source/TestSnapshotSelection.java
@@ -226,4 +226,88 @@ public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException {
         .hasMessageContaining("Cannot specify both snapshot-id")
         .hasMessageContaining("and as-of-timestamp");
   }
+
+  @Test
+  public void testSnapshotSelectionByTag() throws IOException {
+    String tableLocation = temp.newFolder("iceberg-table").toString();
+
+    HadoopTables tables = new HadoopTables(CONF);
+    PartitionSpec spec = PartitionSpec.unpartitioned();
+    Table table = tables.create(SCHEMA, spec, tableLocation);
+
+    // produce the first snapshot
+    List<SimpleRecord> firstBatchRecords = Lists.newArrayList(
+            new SimpleRecord(1, "a"),
+            new SimpleRecord(2, "b"),
+            new SimpleRecord(3, "c")
+    );
+    Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
+    firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
+
+    // tag the first snapshot before any further data is appended
+    table.manageSnapshots().createTag("tag", table.currentSnapshot().snapshotId()).commit();
+
+    // produce the second snapshot
+    List<SimpleRecord> secondBatchRecords = Lists.newArrayList(
+            new SimpleRecord(4, "d"),
+            new SimpleRecord(5, "e"),
+            new SimpleRecord(6, "f")
+    );
+    Dataset<Row> secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class);
+    secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
+
+    // reading by tag must return only the rows of the tagged (first) snapshot
+    Dataset<Row> taggedResult = spark.read()
+            .format("iceberg")
+            .option("tag", "tag")
+            .load(tableLocation);
+    List<SimpleRecord> taggedRecords = taggedResult.orderBy("id")
+            .as(Encoders.bean(SimpleRecord.class))
+            .collectAsList();
+    Assert.assertEquals("Tagged snapshot rows should match", firstBatchRecords, taggedRecords);
+  }
+
+  @Test
+  public void testSnapshotSelectionByBranch() throws IOException {
+    String tableLocation = temp.newFolder("iceberg-table").toString();
+
+    HadoopTables tables = new HadoopTables(CONF);
+    PartitionSpec spec = PartitionSpec.unpartitioned();
+    Table table = tables.create(SCHEMA, spec, tableLocation);
+
+    // produce the first snapshot
+    List<SimpleRecord> firstBatchRecords = Lists.newArrayList(
+            new SimpleRecord(1, "a"),
+            new SimpleRecord(2, "b"),
+            new SimpleRecord(3, "c")
+    );
+    Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
+    firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
+
+    // create a branch at the first snapshot; the test expects the next append not to advance it
+    table.manageSnapshots().createBranch("branch", table.currentSnapshot().snapshotId()).commit();
+
+    // produce the second snapshot
+    List<SimpleRecord> secondBatchRecords = Lists.newArrayList(
+            new SimpleRecord(4, "d"),
+            new SimpleRecord(5, "e"),
+            new SimpleRecord(6, "f")
+    );
+    Dataset<Row> secondDf = spark.createDataFrame(secondBatchRecords, SimpleRecord.class);
+    secondDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);
+
+    // reading by branch must return only the rows of the snapshot the branch points to
+    Dataset<Row> branchResult = spark.read()
+            .format("iceberg")
+            .option("branch", "branch")
+            .load(tableLocation);
+    List<SimpleRecord> branchRecords = branchResult.orderBy("id")
+            .as(Encoders.bean(SimpleRecord.class))
+            .collectAsList();
+    Assert.assertEquals("Branch snapshot rows should match", firstBatchRecords, branchRecords);
+  }
 }