apache · rdblue · Jun 25, 2021 · Jun 2, 2021 · Jun 4, 2021 · Jun 7, 2021
diff --git a/bundled-guava/src/main/java/org/apache/iceberg/GuavaClasses.java b/bundled-guava/src/main/java/org/apache/iceberg/GuavaClasses.java
@@ -46,6 +46,7 @@
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hasher;
 import com.google.common.hash.Hashing;
+import com.google.common.io.CharStreams;
 import com.google.common.io.CountingOutputStream;
 import com.google.common.io.Files;
 import com.google.common.primitives.Bytes;
@@ -92,6 +93,7 @@ public class GuavaClasses {
     ThreadFactoryBuilder.class.getName();
     Iterables.class.getName();
     CountingOutputStream.class.getName();
+    CharStreams.class.getName();
   }
 
 }

diff --git a/core/src/main/java/org/apache/iceberg/util/SnapshotUtil.java b/core/src/main/java/org/apache/iceberg/util/SnapshotUtil.java
@@ -63,6 +63,20 @@ public static List<Long> currentAncestors(Table table) {
     return ancestorIds(table.currentSnapshot(), table::snapshot);
   }
 
+  /**
+   * Find the oldest Snapshot that is still reachable from the current snapshot of a table.
+   * <p>
+   * The walk starts at the current snapshot and follows parent ids. It stops at the first snapshot that
+   * either has no parent id or whose parent can no longer be looked up in the table.
+   *
+   * @param table the table to find the oldest snapshot on.
+   * @return null if the table has no current snapshot, else the oldest reachable Snapshot.
+   */
+  public static Snapshot oldestSnapshot(Table table) {
+    Snapshot oldest = null;
+    Snapshot next = table.currentSnapshot();
+    while (next != null) {
+      oldest = next;
+      // a null lookup result (parent not present in the table) ends the walk instead of being dereferenced
+      next = oldest.parentId() != null ? table.snapshot(oldest.parentId()) : null;
+    }
+
+    return oldest;
+  }
+
   /**
    * Returns list of snapshot ids in the range - (fromSnapshotId, toSnapshotId]
    * <p>

diff --git a/spark3/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java b/spark3/src/main/java/org/apache/iceberg/spark/source/SparkBatchScan.java
@@ -52,6 +52,7 @@
 import org.apache.spark.sql.connector.read.Scan;
 import org.apache.spark.sql.connector.read.Statistics;
 import org.apache.spark.sql.connector.read.SupportsReportStatistics;
+import org.apache.spark.sql.connector.read.streaming.MicroBatchStream;
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.sql.util.CaseInsensitiveStringMap;
 import org.apache.spark.sql.vectorized.ColumnarBatch;
@@ -62,6 +63,7 @@ abstract class SparkBatchScan implements Scan, Batch, SupportsReportStatistics {
   private static final Logger LOG = LoggerFactory.getLogger(SparkBatchScan.class);
 
   private final JavaSparkContext sparkContext;
+  private final SparkSession spark;
   private final Table table;
   private final boolean caseSensitive;
   private final boolean localityPreferred;
@@ -76,6 +78,7 @@ abstract class SparkBatchScan implements Scan, Batch, SupportsReportStatistics {
   SparkBatchScan(SparkSession spark, Table table, boolean caseSensitive, Schema expectedSchema,
                  List<Expression> filters, CaseInsensitiveStringMap options) {
     this.sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
+    this.spark = spark;
     this.table = table;
     this.caseSensitive = caseSensitive;
     this.expectedSchema = expectedSchema;
@@ -108,6 +111,12 @@ public Batch toBatch() {
     return this;
   }
 
+  /**
+   * Builds the micro-batch streaming counterpart of this scan.
+   *
+   * @param checkpointLocation checkpoint location, passed through unchanged to the new stream
+   * @return a new {@code SparkMicroBatchStream} constructed from this scan's session, context, table,
+   *         case sensitivity, expected schema and options
+   */
+  @Override
+  public MicroBatchStream toMicroBatchStream(String checkpointLocation) {
+    MicroBatchStream stream = new SparkMicroBatchStream(
+        spark, sparkContext, table, caseSensitive, expectedSchema, options, checkpointLocation);
+    return stream;
+  }
+
   @Override
   public StructType readSchema() {
     if (readSchema == null) {
@@ -213,10 +222,10 @@ public String description() {
     return String.format("%s [filters=%s]", table, filters);
   }
 
-  private static class ReaderFactory implements PartitionReaderFactory {
+  public static class ReaderFactory implements PartitionReaderFactory {
     private final int batchSize;
 
-    private ReaderFactory(int batchSize) {
+    ReaderFactory(int batchSize) {
       this.batchSize = batchSize;
     }
 
@@ -256,7 +265,7 @@ private static class BatchReader extends BatchDataReader implements PartitionRea
     }
   }
 
-  private static class ReadTask implements InputPartition, Serializable {
+  public static class ReadTask implements InputPartition, Serializable {
     private final CombinedScanTask task;
     private final Broadcast<Table> tableBroadcast;
     private final String expectedSchemaString;