
Commit c6d4ff5

Create StreamingDataWriterFactory for epoch ID.
1 parent b0f422c commit c6d4ff5

10 files changed: +93 -28 lines


external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamWriter.scala

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.kafka010.KafkaWriter.validateQuery
 import org.apache.spark.sql.sources.v2.writer._
-import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
+import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamWriter}
 import org.apache.spark.sql.types.StructType
 
 /**
@@ -63,7 +63,7 @@ class KafkaStreamWriter(
  */
 case class KafkaStreamWriterFactory(
     topic: Option[String], producerParams: Map[String, String], schema: StructType)
-  extends DataWriterFactory[InternalRow] {
+  extends StreamingDataWriterFactory[InternalRow] {
 
   override def createDataWriter(
       partitionId: Int,
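
With KafkaStreamWriterFactory now extending StreamingDataWriterFactory, the epoch id reaches the Kafka sink through the streaming factory rather than the batch-oriented one. A minimal sketch of the call the streaming write path now makes on executors; the topic, producer config, and epoch value here are hypothetical, only the constructor parameter types come from the diff above:

import org.apache.spark.sql.kafka010.KafkaStreamWriterFactory
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Hypothetical factory instance for illustration.
val factory = KafkaStreamWriterFactory(
  topic = Some("events"),
  producerParams = Map("bootstrap.servers" -> "host:9092"),
  schema = StructType(StructField("value", StringType) :: Nil))

// partitionId = 0, attemptNumber = 0, epochId = 42: the epoch is now an explicit argument.
val writer = factory.createDataWriter(0, 0, 42L)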

sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/DataWriterFactory.java

Lines changed: 1 addition & 4 deletions
@@ -48,9 +48,6 @@ public interface DataWriterFactory<T> extends Serializable {
    *                      same task id but different attempt number, which means there are multiple
    *                      tasks with the same task id running at the same time. Implementations can
    *                      use this attempt number to distinguish writers of different task attempts.
-   * @param epochId A monotonically increasing id for streaming queries that are split in to
-   *                discrete periods of execution. For non-streaming queries,
-   *                this ID will always be 0.
    */
-  DataWriter<T> createDataWriter(int partitionId, int attemptNumber, long epochId);
+  DataWriter<T> createDataWriter(int partitionId, int attemptNumber);
 }
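
After this change the batch-side factory no longer sees an epoch at all, so a plain (non-streaming) implementation overrides only the two-argument method. A minimal sketch under that assumption; NoopWriterFactory is a hypothetical name, not part of this commit:

import org.apache.spark.sql.Row
import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage}

case class NoopWriterFactory() extends DataWriterFactory[Row] {
  // Batch factories now implement only (partitionId, attemptNumber); the epoch parameter
  // moved to StreamingDataWriterFactory.
  override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] =
    new DataWriter[Row] {
      override def write(record: Row): Unit = ()  // discard every row
      override def commit(): WriterCommitMessage = new WriterCommitMessage {}
      override def abort(): Unit = ()
    }
}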

sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/streaming/StreamWriter.java

Lines changed: 12 additions & 0 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.sql.sources.v2.writer.streaming;
 
 import org.apache.spark.annotation.InterfaceStability;
+import org.apache.spark.sql.Row;
 import org.apache.spark.sql.sources.v2.writer.DataSourceWriter;
 import org.apache.spark.sql.sources.v2.writer.DataWriter;
 import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage;
@@ -27,6 +28,9 @@
  *
  * Streaming queries are divided into intervals of data called epochs, with a monotonically
  * increasing numeric ID. This writer handles commits and aborts for each successive epoch.
+ *
+ * Note that StreamWriter implementations should provide instances of
+ * {@link StreamingDataWriterFactory}.
  */
 @InterfaceStability.Evolving
 public interface StreamWriter extends DataSourceWriter {
@@ -59,6 +63,14 @@ public interface StreamWriter extends DataSourceWriter {
    */
   void abort(long epochId, WriterCommitMessage[] messages);
 
+  /**
+   * Creates a writer factory which will be serialized and sent to executors.
+   *
+   * If this method fails (by throwing an exception), the query will fail and no Spark job will be
+   * submitted.
+   */
+  StreamingDataWriterFactory<Row> createWriterFactory();
+
   default void commit(WriterCommitMessage[] messages) {
     throw new UnsupportedOperationException(
         "Commit without epoch should not be called with StreamWriter");
sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/streaming/StreamingDataWriterFactory.java

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+package org.apache.spark.sql.sources.v2.writer.streaming;
+
+import org.apache.spark.sql.sources.v2.writer.DataWriter;
+import org.apache.spark.sql.sources.v2.writer.DataWriterFactory;
+
+public interface StreamingDataWriterFactory<T> extends DataWriterFactory<T> {
+  /**
+   * Returns a data writer to do the actual writing work.
+   *
+   * If this method fails (by throwing an exception), the action would fail and no Spark job was
+   * submitted.
+   *
+   * @param partitionId A unique id of the RDD partition that the returned writer will process.
+   *                    Usually Spark processes many RDD partitions at the same time,
+   *                    implementations should use the partition id to distinguish writers for
+   *                    different partitions.
+   * @param attemptNumber Spark may launch multiple tasks with the same task id. For example, a task
+   *                      failed, Spark launches a new task with the same task id but different
+   *                      attempt number. Or a task is too slow, Spark launches new tasks with the
+   *                      same task id but different attempt number, which means there are multiple
+   *                      tasks with the same task id running at the same time. Implementations can
+   *                      use this attempt number to distinguish writers of different task attempts.
+   * @param epochId A monotonically increasing id for streaming queries that are split into
+   *                discrete periods of execution. For non-streaming queries,
+   *                this ID will always be 0.
+   */
+  DataWriter<T> createDataWriter(int partitionId, int attemptNumber, long epochId);
+
+  @Override default DataWriter<T> createDataWriter(int partitionId, int attemptNumber) {
+    throw new IllegalStateException("Streaming data writer factory cannot create data writers without epoch.");
+  }
+}
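
The default two-argument method above makes the epoch mandatory for streaming factories: an epoch-unaware call site fails fast instead of silently writing as epoch 0. A small sketch using PackedRowWriterFactory, which is converted to this interface later in the commit; the argument values are arbitrary:

import org.apache.spark.sql.execution.streaming.sources.PackedRowWriterFactory

// Epoch-aware entry point used by the streaming write path (partition 0, attempt 0, epoch 7).
val writer = PackedRowWriterFactory.createDataWriter(0, 0, 7L)

// The inherited batch signature falls through to the default above and throws
// IllegalStateException("Streaming data writer factory cannot create data writers without epoch."):
// PackedRowWriterFactory.createDataWriter(0, 0)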

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2.scala

Lines changed: 29 additions & 6 deletions
@@ -31,8 +31,9 @@ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.streaming.{MicroBatchExecution, StreamExecution}
 import org.apache.spark.sql.execution.streaming.continuous.{CommitPartitionEpoch, ContinuousExecution, EpochCoordinatorRef, SetWriterPartitions}
+import org.apache.spark.sql.execution.streaming.sources.MicroBatchWriter
 import org.apache.spark.sql.sources.v2.writer._
-import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
+import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamWriter}
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.Utils
 
@@ -54,7 +55,12 @@ case class WriteToDataSourceV2Exec(writer: DataSourceWriter, query: SparkPlan) e
   override protected def doExecute(): RDD[InternalRow] = {
     val writeTask = writer match {
       case w: SupportsWriteInternalRow => w.createInternalRowWriterFactory()
-      case _ => new InternalRowDataWriterFactory(writer.createWriterFactory(), query.schema)
+      case w: MicroBatchWriter =>
+        new StreamingInternalRowDataWriterFactory(w.createWriterFactory(), query.schema)
+      case w: StreamWriter =>
+        new StreamingInternalRowDataWriterFactory(w.createWriterFactory(), query.schema)
+      case _ =>
+        new InternalRowDataWriterFactory(writer.createWriterFactory(), query.schema)
     }
 
     val useCommitCoordinator = writer.useCommitCoordinator
@@ -75,7 +81,8 @@ case class WriteToDataSourceV2Exec(writer: DataSourceWriter, query: SparkPlan) e
           .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions))
 
         (context: TaskContext, iter: Iterator[InternalRow]) =>
-          DataWritingSparkTask.runContinuous(writeTask, context, iter)
+          DataWritingSparkTask.runContinuous(
+            writeTask.asInstanceOf[StreamingDataWriterFactory[InternalRow]], context, iter)
       case _ =>
         (context: TaskContext, iter: Iterator[InternalRow]) =>
           DataWritingSparkTask.run(writeTask, context, iter, useCommitCoordinator)
@@ -132,8 +139,13 @@ object DataWritingSparkTask extends Logging {
     val stageId = context.stageId()
     val partId = context.partitionId()
     val attemptId = context.attemptNumber()
-    val epochId = Option(context.getLocalProperty(MicroBatchExecution.BATCH_ID_KEY)).getOrElse("0")
-    val dataWriter = writeTask.createDataWriter(partId, attemptId, epochId.toLong)
+    val dataWriter = writeTask match {
+      case w: StreamingDataWriterFactory[InternalRow] =>
+        val epochId = Option(context.getLocalProperty(MicroBatchExecution.BATCH_ID_KEY)).get
+        w.createDataWriter(partId, attemptId, epochId.toLong)
+
+      case w => w.createDataWriter(partId, attemptId)
+    }
 
     // write the data and commit this writer.
     Utils.tryWithSafeFinallyAndFailureCallbacks(block = {
@@ -170,7 +182,7 @@
   }
 
   def runContinuous(
-      writeTask: DataWriterFactory[InternalRow],
+      writeTask: StreamingDataWriterFactory[InternalRow],
       context: TaskContext,
       iter: Iterator[InternalRow]): WriterCommitMessage = {
     val epochCoordinator = EpochCoordinatorRef.get(
@@ -217,6 +229,17 @@ class InternalRowDataWriterFactory(
     rowWriterFactory: DataWriterFactory[Row],
     schema: StructType) extends DataWriterFactory[InternalRow] {
 
+  override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = {
+    new InternalRowDataWriter(
+      rowWriterFactory.createDataWriter(partitionId, attemptNumber),
+      RowEncoder.apply(schema).resolveAndBind())
+  }
+}
+
+class StreamingInternalRowDataWriterFactory(
+    rowWriterFactory: StreamingDataWriterFactory[Row],
+    schema: StructType) extends StreamingDataWriterFactory[InternalRow] {
+
   override def createDataWriter(
       partitionId: Int,
       attemptNumber: Int,
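
The match in DataWritingSparkTask.run above is why the old getOrElse("0") fallback could be dropped: only streaming factories read the epoch, and for them the batch id property is expected to have been set on the driver, so .get should succeed. A sketch of that contract; the helper below is illustrative, only MicroBatchExecution.BATCH_ID_KEY and the executor-side lookup come from this diff:

import org.apache.spark.SparkContext
import org.apache.spark.sql.execution.streaming.MicroBatchExecution

// Driver side (illustrative helper): tag the jobs of the current micro-batch with its id.
def tagBatch(sc: SparkContext, currentBatchId: Long): Unit =
  sc.setLocalProperty(MicroBatchExecution.BATCH_ID_KEY, currentBatchId.toString)

// Executor side (mirrors the hunk above): streaming factories require the property to be
// present, which is why the code uses .get rather than a "0" fallback.
// val epochId = Option(context.getLocalProperty(MicroBatchExecution.BATCH_ID_KEY)).get.toLong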

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/ConsoleWriter.scala

Lines changed: 3 additions & 3 deletions
@@ -22,8 +22,8 @@ import scala.collection.JavaConverters._
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{Row, SparkSession}
 import org.apache.spark.sql.sources.v2.DataSourceOptions
-import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage}
-import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
+import org.apache.spark.sql.sources.v2.writer.WriterCommitMessage
+import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamWriter}
 import org.apache.spark.sql.types.StructType
 
 /** Common methods used to create writes for the the console sink */
@@ -39,7 +39,7 @@ class ConsoleWriter(schema: StructType, options: DataSourceOptions)
   assert(SparkSession.getActiveSession.isDefined)
   protected val spark = SparkSession.getActiveSession.get
 
-  def createWriterFactory(): DataWriterFactory[Row] = PackedRowWriterFactory
+  def createWriterFactory(): StreamingDataWriterFactory[Row] = PackedRowWriterFactory
 
   override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {
     // We have to print a "Batch" label for the epoch for compatibility with the pre-data source V2

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/MicroBatchWriter.scala

Lines changed: 8 additions & 2 deletions
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.streaming.sources
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriterFactory, SupportsWriteInternalRow, WriterCommitMessage}
-import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
+import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamWriter}
 
 /**
  * A [[DataSourceWriter]] used to hook V2 stream writers into a microbatch plan. It implements
@@ -34,7 +34,13 @@ class MicroBatchWriter(batchId: Long, writer: StreamWriter) extends DataSourceWr
 
   override def abort(messages: Array[WriterCommitMessage]): Unit = writer.abort(batchId, messages)
 
-  override def createWriterFactory(): DataWriterFactory[Row] = writer.createWriterFactory()
+  override def createWriterFactory(): StreamingDataWriterFactory[Row] = {
+    writer.createWriterFactory() match {
+      case s: StreamingDataWriterFactory[Row] => s
+      case _ =>
+        throw new IllegalStateException("StreamWriter did not give a StreamingDataWriterFactory")
+    }
+  }
 }
 
 class InternalRowMicroBatchWriter(batchId: Long, writer: StreamWriter)
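
A sketch of the adapter in use, reusing the hypothetical NoopStreamWriter from the StreamWriter example earlier: MicroBatchWriter pins a StreamWriter to a single batch id, and its createWriterFactory now guarantees the factory it returns is epoch-aware (the defensive match above throws IllegalStateException otherwise):

import org.apache.spark.sql.execution.streaming.sources.MicroBatchWriter

// Hypothetical usage; the batch id value is arbitrary.
val streamWriter = new NoopStreamWriter
val microBatchWriter = new MicroBatchWriter(batchId = 5L, writer = streamWriter)

// Returns the sink's StreamingDataWriterFactory[Row]; commits and aborts issued through
// microBatchWriter are forwarded to streamWriter for batch 5.
val factory = microBatchWriter.createWriterFactory()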

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/PackedRowWriterFactory.scala

Lines changed: 2 additions & 1 deletion
@@ -22,6 +22,7 @@ import scala.collection.mutable
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, DataWriterFactory, WriterCommitMessage}
+import org.apache.spark.sql.sources.v2.writer.streaming.StreamingDataWriterFactory
 
 /**
  * A simple [[DataWriterFactory]] whose tasks just pack rows into the commit message for delivery
@@ -30,7 +31,7 @@ import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriter, Dat
 * Note that, because it sends all rows to the driver, this factory will generally be unsuitable
 * for production-quality sinks. It's intended for use in tests.
 */
-case object PackedRowWriterFactory extends DataWriterFactory[Row] {
+case object PackedRowWriterFactory extends StreamingDataWriterFactory[Row] {
   override def createDataWriter(
       partitionId: Int,
       attemptNumber: Int,

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/sources/memoryV2.scala

Lines changed: 2 additions & 2 deletions
@@ -31,7 +31,7 @@ import org.apache.spark.sql.catalyst.streaming.InternalOutputModes.{Append, Comp
 import org.apache.spark.sql.execution.streaming.Sink
 import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport}
 import org.apache.spark.sql.sources.v2.writer._
-import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
+import org.apache.spark.sql.sources.v2.writer.streaming.{StreamingDataWriterFactory, StreamWriter}
 import org.apache.spark.sql.streaming.OutputMode
 import org.apache.spark.sql.types.StructType
 
@@ -146,7 +146,7 @@ class MemoryStreamWriter(val sink: MemorySinkV2, outputMode: OutputMode)
   }
 }
 
-case class MemoryWriterFactory(outputMode: OutputMode) extends DataWriterFactory[Row] {
+case class MemoryWriterFactory(outputMode: OutputMode) extends StreamingDataWriterFactory[Row] {
   override def createDataWriter(
       partitionId: Int,
       attemptNumber: Int,

sql/core/src/test/scala/org/apache/spark/sql/sources/v2/SimpleWritableDataSource.scala

Lines changed: 2 additions & 8 deletions
@@ -207,10 +207,7 @@ private[v2] object SimpleCounter {
 class SimpleCSVDataWriterFactory(path: String, jobId: String, conf: SerializableConfiguration)
   extends DataWriterFactory[Row] {
 
-  override def createDataWriter(
-      partitionId: Int,
-      attemptNumber: Int,
-      epochId: Long): DataWriter[Row] = {
+  override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] = {
     val jobPath = new Path(new Path(path, "_temporary"), jobId)
     val filePath = new Path(jobPath, s"$jobId-$partitionId-$attemptNumber")
     val fs = filePath.getFileSystem(conf.value)
@@ -243,10 +240,7 @@ class SimpleCSVDataWriter(fs: FileSystem, file: Path) extends DataWriter[Row] {
 class InternalRowCSVDataWriterFactory(path: String, jobId: String, conf: SerializableConfiguration)
   extends DataWriterFactory[InternalRow] {
 
-  override def createDataWriter(
-      partitionId: Int,
-      attemptNumber: Int,
-      epochId: Long): DataWriter[InternalRow] = {
+  override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[InternalRow] = {
     val jobPath = new Path(new Path(path, "_temporary"), jobId)
     val filePath = new Path(jobPath, s"$jobId-$partitionId-$attemptNumber")
     val fs = filePath.getFileSystem(conf.value)
