Commits (50)
- d363260 foreachbatch spark connect (pengzhon-db, Apr 21, 2023)
- 508928a add streaming_worker.py (pengzhon-db, May 5, 2023)
- 6cb7d01 python proto (pengzhon-db, May 5, 2023)
- 74ad159 use same python process for one streaming query (pengzhon-db, May 6, 2023)
- 1f73c6f wip (WweiL, Jun 21, 2023)
- 2dec59c working (WweiL, Jun 22, 2023)
- 40bf120 latest change (WweiL, Jul 11, 2023)
- 57b80ff resolve conflicts (WweiL, Jul 23, 2023)
- c5c8e85 streaming function declaration pyi & rdd (WweiL, Jul 23, 2023)
- 8b9dfe5 this won't work, still throws None obj doesn't have craeteDataFrame (WweiL, Jul 24, 2023)
- 341d588 same error, this also doesn't work (WweiL, Jul 24, 2023)
- 4d8eec3 this worked (WweiL, Jul 24, 2023)
- 3a43d6c first revision (WweiL, Jul 24, 2023)
- 38d76c0 file cleanup (WweiL, Jul 24, 2023)
- 1923840 doc update (WweiL, Jul 24, 2023)
- 8bcf605 is this breaking change also? (WweiL, Jul 24, 2023)
- 56665c3 remove doc test for now (WweiL, Jul 24, 2023)
- cb0caea add remove listener (WweiL, Jul 24, 2023)
- 9c4f6e6 gen proto (WweiL, Jul 24, 2023)
- 3724992 ticket update (WweiL, Jul 24, 2023)
- 2e3b8d2 before resolving merge conflict (WweiL, Jul 24, 2023)
- 4a6a15f resolve conflict (WweiL, Jul 24, 2023)
- 0040b70 documentation to PythonStreamingQueryListener (WweiL, Jul 24, 2023)
- d404f9f this works on manual test but in unit test it shows No module named '… (WweiL, Jul 25, 2023)
- a2041ec works now (WweiL, Jul 25, 2023)
- 4a2d184 minor (WweiL, Jul 25, 2023)
- 5d04245 minor (WweiL, Jul 25, 2023)
- 8c97ecc doesn't need to make q stateful (WweiL, Jul 25, 2023)
- 494c243 fmt (WweiL, Jul 25, 2023)
- 28d1495 why is there a unused import (WweiL, Jul 25, 2023)
- b7cc36f minor (WweiL, Jul 25, 2023)
- 20b87e6 try to resolve breaking change, will address other comments tmr (WweiL, Jul 26, 2023)
- 72552ed scala client send ids (WweiL, Jul 26, 2023)
- 0640fdf remove return NOne (WweiL, Jul 26, 2023)
- b99301b merge master, remove println log (WweiL, Jul 26, 2023)
- 004e181 add streamingPythonEval (WweiL, Jul 26, 2023)
- 640ab23 minor: (WweiL, Jul 27, 2023)
- 270363b remove eval type, create two worker files (WweiL, Jul 27, 2023)
- 487bfa1 lint (WweiL, Jul 27, 2023)
- ed5f101 retrigger tests (WweiL, Jul 27, 2023)
- 8c5569f Merge remote-tracking branch 'spark/master' into listener-poc-newest (WweiL, Jul 27, 2023)
- 13f50bf minor (WweiL, Jul 27, 2023)
- 3f83d07 lint (WweiL, Jul 27, 2023)
- eb4d2b5 lint (WweiL, Jul 28, 2023)
- 95b0111 Merge remote-tracking branch 'spark/master' into listener-poc-newest (WweiL, Jul 28, 2023)
- fb4415b lint (WweiL, Jul 28, 2023)
- baf791c address comments, move worker files to sql/connect/streaming/worker (WweiL, Jul 29, 2023)
- 8c520ba minor, remove redundant log (WweiL, Jul 29, 2023)
- aa22c3b add init (WweiL, Jul 30, 2023)
- c9ffa52 add new pkg to setup.py (WweiL, Jul 31, 2023)
Files changed
@@ -364,7 +364,8 @@ message StreamingQueryManagerCommand {
   }

   message StreamingQueryListenerCommand {
-    bytes listener_payload = 1;
+    optional bytes listener_payload = 1;
+    optional PythonUDF python_listener_payload = 2;
   }
 }
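Making both fields optional lets the server tell a JVM-serialized Scala listener (`listener_payload`) apart from a Python one (`python_listener_payload`, carried as a `PythonUDF`). A rough sketch of how the Python client side might populate the new field, assuming the generated classes in `pyspark.sql.connect.proto`; the function name and the pre-pickled `pickled_listener` bytes are placeholders for illustration, not the PR's actual client code:

```python
import pyspark.sql.connect.proto as pb2

def build_add_listener_command(pickled_listener: bytes) -> pb2.StreamingQueryManagerCommand:
    cmd = pb2.StreamingQueryManagerCommand()
    # Python path: ship the listener as a PythonUDF payload instead of
    # JVM-serialized bytes in listener_payload.
    udf = cmd.add_listener.python_listener_payload
    udf.command = pickled_listener
    udf.python_ver = "3.10"  # client's Python version (illustrative)
    udf.eval_type = 401      # PythonEvalType.SQL_STREAMING_LISTENER
    return cmd
```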
@@ -2804,7 +2804,7 @@ class SparkConnectPlanner(val sessionHolder: SessionHolder) extends Logging {
         StreamingForeachBatchHelper.scalaForeachBatchWrapper(scalaFn, sessionHolder)

       case StreamingForeachFunction.FunctionCase.FUNCTION_NOT_SET =>
-        throw InvalidPlanInput("Unexpected") // Unreachable
+        throw InvalidPlanInput("Unexpected foreachBatch function") // Unreachable
     }

     writer.foreachBatch(foreachBatchFn)
@@ -3047,18 +3047,29 @@
         respBuilder.setResetTerminated(true)

       case StreamingQueryManagerCommand.CommandCase.ADD_LISTENER =>
-        val listenerPacket = Utils
-          .deserialize[StreamingListenerPacket](
-            command.getAddListener.getListenerPayload.toByteArray,
-            Utils.getContextOrSparkClassLoader)
-        val listener: StreamingQueryListener = listenerPacket.listener
-          .asInstanceOf[StreamingQueryListener]
-        val id: String = listenerPacket.id
-        sessionHolder.cacheListenerById(id, listener)
+        val listener = if (command.getAddListener.hasListenerPayload) {
+          val listenerPacket = Utils
+            .deserialize[StreamingListenerPacket](
+              command.getAddListener.getListenerPayload.toByteArray,
+              Utils.getContextOrSparkClassLoader)
+          val listener: StreamingQueryListener = listenerPacket.listener
+            .asInstanceOf[StreamingQueryListener]
+          val id: String = listenerPacket.id
+          sessionHolder.cacheListenerById(id, listener)
+          listener
+        } else {
+          val listener = new PythonStreamingQueryListener(
+            transformPythonFunction(command.getAddListener.getPythonListenerPayload),
+            sessionHolder,
+            pythonExec)
+          listener
+        }

         session.streams.addListener(listener)
         respBuilder.setAddListener(true)

       case StreamingQueryManagerCommand.CommandCase.REMOVE_LISTENER =>
+        // TODO (SPARK-44516): remove listener for python client
         val listenerId = Utils
           .deserialize[StreamingListenerPacket](
             command.getRemoveListener.getListenerPayload.toByteArray,
@@ -18,9 +18,7 @@ package org.apache.spark.sql.connect.planner

 import java.util.UUID

-import org.apache.spark.api.python.PythonRDD
-import org.apache.spark.api.python.SimplePythonFunction
-import org.apache.spark.api.python.StreamingPythonRunner
+import org.apache.spark.api.python.{PythonEvalType, PythonRDD, SimplePythonFunction, StreamingPythonRunner}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.connect.service.SessionHolder
@@ -89,7 +87,8 @@ object StreamingForeachBatchHelper extends Logging {
     val port = SparkConnectService.localPort
     val connectUrl = s"sc://localhost:$port/;user_id=${sessionHolder.userId}"
     val runner = StreamingPythonRunner(pythonFn, connectUrl)
-    val (dataOut, dataIn) = runner.init(sessionHolder.sessionId)
+    val (dataOut, dataIn) = runner.init(
+      sessionHolder.sessionId, PythonEvalType.SQL_STREAMING_FOREACH_BATCH)

     val foreachBatchRunnerFn: FnArgsWithId => Unit = (args: FnArgsWithId) => {
@@ -0,0 +1,60 @@ (new file)
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.connect.planner

import org.apache.spark.api.python.{PythonEvalType, PythonRDD, SimplePythonFunction, StreamingPythonRunner}
import org.apache.spark.sql.connect.service.{SessionHolder, SparkConnectService}
import org.apache.spark.sql.streaming.StreamingQueryListener

class PythonStreamingQueryListener(
    listener: SimplePythonFunction,
    sessionHolder: SessionHolder,
    pythonExec: String)
  extends StreamingQueryListener {

  val port = SparkConnectService.localPort
  val connectUrl = s"sc://localhost:$port/;user_id=${sessionHolder.userId}"
  val runner = StreamingPythonRunner(listener, connectUrl)

  val (dataOut, _) = runner.init(
    sessionHolder.sessionId, PythonEvalType.SQL_STREAMING_LISTENER)

  override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = {
    PythonRDD.writeUTF(event.json, dataOut)
    dataOut.writeInt(0)
    dataOut.flush()
  }

  override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = {
    PythonRDD.writeUTF(event.json, dataOut)
    dataOut.writeInt(1)
    dataOut.flush()
  }

  override def onQueryIdle(event: StreamingQueryListener.QueryIdleEvent): Unit = {
    PythonRDD.writeUTF(event.json, dataOut)
    dataOut.writeInt(2)
    dataOut.flush()
  }

  override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = {
    PythonRDD.writeUTF(event.json, dataOut)
    dataOut.writeInt(3)
    dataOut.flush()
  }
}
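Each callback ships the event to the long-running Python worker as length-prefixed UTF-8 JSON followed by a 4-byte tag (0 = started, 1 = progress, 2 = idle, 3 = terminated). A minimal sketch of the receiving loop on the Python side, assuming pyspark's serializer helpers; the `listener` argument and dict-based dispatch are illustrative assumptions, not the PR's actual worker file (the real worker would rebuild typed event objects from the JSON):

```python
import json

from pyspark.serializers import read_int, UTF8Deserializer

utf8_deserializer = UTF8Deserializer()

def listener_event_loop(infile, listener):
    # Tags mirror PythonStreamingQueryListener: each event arrives as
    # length-prefixed UTF-8 JSON followed by a 4-byte big-endian int tag.
    handlers = {
        0: listener.onQueryStarted,
        1: listener.onQueryProgress,
        2: listener.onQueryIdle,
        3: listener.onQueryTerminated,
    }
    while True:
        event_json = utf8_deserializer.loads(infile)  # reads int length + bytes
        tag = read_int(infile)
        handlers[tag](json.loads(event_json))  # real worker rebuilds typed events
```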
@@ -59,6 +59,9 @@ private[spark] object PythonEvalType {
   val SQL_TABLE_UDF = 300
   val SQL_ARROW_TABLE_UDF = 301

+  val SQL_STREAMING_FOREACH_BATCH = 400
+  val SQL_STREAMING_LISTENER = 401
+
Review comment (Member): Can we have a separate worker file instead of using an eval type? Event handling is async, so it might conflict with other existing running Python workers; at the least, I suspect they would affect each other's execution time.
   def toString(pythonEvalType: Int): String = pythonEvalType match {
     case NON_UDF => "NON_UDF"
     case SQL_BATCHED_UDF => "SQL_BATCHED_UDF"
@@ -74,6 +77,8 @@
     case SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE => "SQL_GROUPED_MAP_PANDAS_UDF_WITH_STATE"
     case SQL_TABLE_UDF => "SQL_TABLE_UDF"
     case SQL_ARROW_TABLE_UDF => "SQL_ARROW_TABLE_UDF"
+    case SQL_STREAMING_FOREACH_BATCH => "SQL_STREAMING_FOREACH_BATCH"
+    case SQL_STREAMING_LISTENER => "SQL_STREAMING_LISTENER"
   }
 }
@@ -48,7 +48,7 @@ private[spark] class StreamingPythonRunner(func: PythonFunction, connectUrl: Str
    * Initializes the Python worker for streaming functions. Sets up Spark Connect session
    * to be used with the functions.
    */
-  def init(sessionId: String): (DataOutputStream, DataInputStream) = {
+  def init(sessionId: String, evalType: Int): (DataOutputStream, DataInputStream) = {
     log.info(s"Initializing Python runner (session: $sessionId ,pythonExec: $pythonExec")

     val env = SparkEnv.get
@@ -73,6 +73,9 @@
     // Send sessionId
     PythonRDD.writeUTF(sessionId, dataOut)

+    // Send evalType
+    dataOut.writeInt(evalType)
+
     // send the user function to python process
     val command = func.command
     dataOut.writeInt(command.length)
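Since `init` now writes the eval type between the session id and the pickled function, the worker-side handshake has to read the same values in the same order. A rough sketch of that read path, assuming pyspark's serializer helpers; `read_streaming_handshake` is a hypothetical name for illustration, not the PR's actual worker code:

```python
from pyspark.rdd import PythonEvalType
from pyspark.serializers import read_int, UTF8Deserializer

utf8_deserializer = UTF8Deserializer()

def read_streaming_handshake(infile):
    # Order mirrors StreamingPythonRunner.init: session id (length-prefixed
    # UTF-8), eval type (4-byte int), then the pickled user function bytes.
    session_id = utf8_deserializer.loads(infile)
    eval_type = read_int(infile)
    command = infile.read(read_int(infile))
    assert eval_type in (
        PythonEvalType.SQL_STREAMING_FOREACH_BATCH,
        PythonEvalType.SQL_STREAMING_LISTENER,
    )
    return session_id, eval_type, command
```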
python/pyspark/rdd.py (5 changes: 5 additions & 0 deletions)
@@ -117,6 +117,8 @@
     SQLArrowTableUDFType,
     SQLBatchedUDFType,
     SQLTableUDFType,
+    SQLStreamingForeachBatchType,
+    SQLStreamingListenerType,
 )

 from py4j.java_gateway import JavaObject
@@ -162,6 +164,9 @@ class PythonEvalType:
     SQL_TABLE_UDF: "SQLTableUDFType" = 300
     SQL_ARROW_TABLE_UDF: "SQLArrowTableUDFType" = 301

+    SQL_STREAMING_FOREACH_BATCH: "SQLStreamingForeachBatchType" = 400
+    SQL_STREAMING_LISTENER: "SQLStreamingListenerType" = 401
+

 def portable_hash(x: Hashable) -> int:
     """
python/pyspark/sql/_typing.pyi (3 changes: 3 additions & 0 deletions)
@@ -61,6 +61,9 @@ SQLArrowBatchedUDFType = Literal[101]
 SQLTableUDFType = Literal[300]
 SQLArrowTableUDFType = Literal[301]

+SQLStreamingForeachBatchType = Literal[400]
+SQLStreamingListenerType = Literal[401]
+
 class SupportsOpen(Protocol):
     def open(self, partition_id: int, epoch_id: int) -> bool: ...