
Commit a660152

tests compile and pass
1 parent 1cdc3f9 commit a660152

File tree

8 files changed: +76 additions, −15 deletions


connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -36,13 +36,13 @@ import org.scalatest.matchers.should._
 import org.scalatest.time.SpanSugar._

 import org.apache.spark.{SparkException, TestUtils}
-import org.apache.spark.sql.{AnalysisException, Dataset, ForeachWriter, Row, SparkSession}
+import org.apache.spark.sql.{Dataset, ForeachWriter, Row, SparkSession}
 import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
 import org.apache.spark.sql.connector.read.streaming.SparkDataStream
 import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2ScanRelation
 import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
 import org.apache.spark.sql.execution.streaming._
-import org.apache.spark.sql.execution.streaming.checkpointing.{OffsetSeqBase, TombstoneOffset}
+import org.apache.spark.sql.execution.streaming.checkpointing.OffsetSeqBase
 import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
 import org.apache.spark.sql.execution.streaming.runtime.{MicroBatchExecution, StreamExecution, StreamingExecutionRelation}
 import org.apache.spark.sql.execution.streaming.runtime.AsyncProgressTrackingMicroBatchExecution.{ASYNC_PROGRESS_TRACKING_CHECKPOINTING_INTERVAL_MS, ASYNC_PROGRESS_TRACKING_ENABLED}

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 0 additions & 1 deletion
@@ -3026,7 +3026,6 @@ object SQLConf {
       "source evolution capability where sources can be added, removed, or reordered " +
       "without losing checkpoint state.")
     .version("4.2.0")
-    .owner("streaming-engine")
     .booleanConf
     .createWithDefault(false)
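For orientation, this entry follows the standard SQLConf builder chain; a rough sketch of the full definition is below. The val name and config key are assumptions for illustration only (the hunk shows just the tail of the entry), and the sketch simply omits the removed .owner(...) call, which appears not to be part of the upstream ConfigBuilder API.

  // Sketch only: the val name and config key are hypothetical; doc text, version,
  // type, and default are taken from the hunk above.
  val STREAMING_SOURCE_EVOLUTION_ENABLED =
    buildConf("spark.sql.streaming.sourceEvolution.enabled")
      .doc("Enables the streaming source evolution capability where sources can be " +
        "added, removed, or reordered without losing checkpoint state.")
      .version("4.2.0")
      .booleanConf
      .createWithDefault(false)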

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/DataSourceStrategy.scala

Lines changed: 4 additions & 2 deletions
@@ -305,7 +305,8 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan]
       userSpecifiedSchema = Some(table.schema),
       options = dsOptions,
       catalogTable = Some(table))
-    StreamingRelation(dataSource)
+    // TODO: [SC-209298] Add API for naming source in ST
+    StreamingRelation(dataSource, None)
   }


@@ -335,7 +336,8 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan]
       result

     case s @ StreamingRelationV2(
-        _, _, table, extraOptions, _, _, _, Some(UnresolvedCatalogRelation(tableMeta, _, true))) =>
+        _, _, table, extraOptions, _, _, _, Some(
+          UnresolvedCatalogRelation(tableMeta, _, true)), _) =>
       import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
       val v1Relation = getStreamingRelation(tableMeta, extraOptions)
       if (table.isInstanceOf[SupportsRead]

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/continuous/ContinuousExecution.scala

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ class ContinuousExecution(
     import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
     val _logicalPlan = analyzedPlan.transform {
       case s @ StreamingRelationV2(ds, sourceName, table: SupportsRead, options, output,
-          catalog, identifier, _) =>
+          catalog, identifier, _, userProvidedName) =>
         val dsStr = if (ds.nonEmpty) s"[${ds.get}]" else ""
         if (!table.supports(TableCapability.CONTINUOUS_READ)) {
           throw QueryExecutionErrors.continuousProcessingUnsupportedByDataSourceError(sourceName)

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/MicroBatchExecution.scala

Lines changed: 9 additions & 6 deletions
@@ -184,7 +184,8 @@ class MicroBatchExecution(

     import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
     val _logicalPlan = analyzedPlan.transform {
-      case streamingRelation @ StreamingRelation(dataSourceV1, sourceName, output) =>
+      case streamingRelation @ StreamingRelation(
+          dataSourceV1, sourceName, output, userProvidedSourceName) =>
         toExecutionRelationMap.getOrElseUpdate(streamingRelation, {
           // Materialize source to avoid creating it in every batch
           val metadataPath = s"$resolvedCheckpointRoot/sources/$nextSourceId"
@@ -193,11 +194,12 @@
           logInfo(log"Using Source [${MDC(LogKeys.STREAMING_SOURCE, source)}] " +
             log"from DataSourceV1 named '${MDC(LogKeys.STREAMING_DATA_SOURCE_NAME, sourceName)}' " +
             log"[${MDC(LogKeys.STREAMING_DATA_SOURCE_DESCRIPTION, dataSourceV1)}]")
-          StreamingExecutionRelation(source, output, dataSourceV1.catalogTable)(sparkSession)
+          StreamingExecutionRelation(
+            source, output, dataSourceV1.catalogTable, userProvidedSourceName)(sparkSession)
         })

       case s @ StreamingRelationV2(src, srcName, table: SupportsRead, options, output,
-          catalog, identifier, v1) =>
+          catalog, identifier, v1, userProvidedSourceName) =>
         val dsStr = if (src.nonEmpty) s"[${src.get}]" else ""
         val v2Disabled = disabledSources.contains(src.getOrElse(None).getClass.getCanonicalName)
         if (!v2Disabled && table.supports(TableCapability.MICRO_BATCH_READ)) {
@@ -221,7 +223,8 @@
             trigger match {
               case RealTimeTrigger(duration) => Some(duration)
               case _ => None
-            }
+            },
+            userProvidedSourceName
           )
           StreamingDataSourceV2ScanRelation(relation, scan, output, stream)
         })
@@ -240,7 +243,7 @@
             log"${MDC(LogKeys.STREAMING_DATA_SOURCE_DESCRIPTION, dsStr)}")
           // We don't have a catalog table but may have a table identifier. Given this is about
           // v1 fallback path, we just give up and set the catalog table as None.
-          StreamingExecutionRelation(source, output, None)(sparkSession)
+          StreamingExecutionRelation(source, output, None, userProvidedSourceName)(sparkSession)
         })
       }
     }
@@ -932,7 +935,7 @@
     // Replace sources in the logical plan with data that has arrived since the last batch.
     val newBatchesPlan = logicalPlan transform {
       // For v1 sources.
-      case StreamingExecutionRelation(source, output, catalogTable) =>
+      case StreamingExecutionRelation(source, output, catalogTable, _) =>
         mutableNewData.get(source).map { dataPlan =>
           val hasFileMetadata = output.exists {
             case FileSourceMetadataAttribute(_) => true

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/ResolveWriteToStream.scala

Lines changed: 51 additions & 0 deletions
@@ -48,6 +48,14 @@ object ResolveWriteToStream extends Rule[LogicalPlan] {
         log"is not supported in streaming DataFrames/Datasets and will be disabled.")
     }

+    // Always check for duplicate source names
+    checkDuplicateSourceNames(s.inputQuery)
+
+    // Check for unnamed sources when enforcement is enabled
+    if (conf.enableStreamingSourceEvolution) {
+      validateNamedSources(s.inputQuery)
+    }
+
     if (conf.isUnsupportedOperationCheckEnabled) {
       if (s.trigger.isInstanceOf[RealTimeTrigger]) {
         UnsupportedOperationChecker.
@@ -143,5 +151,48 @@ object ResolveWriteToStream extends Rule[LogicalPlan] {
       log"resolved to ${MDC(CHECKPOINT_ROOT, resolvedCheckpointRoot)}.")
     (resolvedCheckpointRoot, deleteCheckpointOnStop)
   }
+
+  /**
+   * Checks for duplicate source names across all streaming sources.
+   * This validation always runs regardless of source evolution enforcement.
+   */
+  private def checkDuplicateSourceNames(plan: LogicalPlan): Unit = {
+    import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2
+
+    val sourcesWithNames = plan.collect {
+      case StreamingRelation(_, _, _, userProvidedSourceName) => userProvidedSourceName
+      case StreamingRelationV2(
+        _, _, _, _, _, _, _, _, userProvidedSourceName) => userProvidedSourceName
+    }
+
+    // Check for duplicate source names among named sources
+    val namedSources = sourcesWithNames.flatten
+    val duplicates = namedSources.groupBy(identity).filter(_._2.size > 1).keys.toSeq.sorted
+
+    if (duplicates.nonEmpty) {
+      throw QueryCompilationErrors.duplicateStreamingSourceNamesError(
+        duplicates.map(name => s"'$name'").mkString(", "))
+    }
+  }
+
+  /**
+   * Validates that all streaming sources have names when named source enforcement is enabled.
+   */
+  private def validateNamedSources(plan: LogicalPlan): Unit = {
+    import org.apache.spark.sql.catalyst.streaming.StreamingRelationV2
+
+    val unnamedSources = plan.collect {
+      case StreamingRelation(ds, sourceName, _, None) =>
+        s"[index=${sourceName}, provider=${ds.providingClass.getSimpleName}]"
+      case StreamingRelationV2(src, srcName, table, _, _, _, _, _, None) =>
+        val provider = src.map(_.getClass.getSimpleName).getOrElse(table.getClass.getSimpleName)
+        s"[index=${srcName}, provider=${provider}]"
+    }
+
+    if (unnamedSources.nonEmpty) {
+      throw QueryCompilationErrors.unnamedStreamingSourcesWithEnforcementError(
+        unnamedSources.mkString(", "))
+    }
+  }
 }
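As a standalone illustration of the groupBy-based duplicate detection used in checkDuplicateSourceNames above, the same idiom can be exercised on plain strings; the object name and sample values below are made up for the sketch.

  // Minimal, self-contained sketch of the duplicate-detection idiom; inputs are illustrative.
  object DuplicateNameCheckSketch {
    def duplicates(names: Seq[String]): Seq[String] =
      names.groupBy(identity).filter(_._2.size > 1).keys.toSeq.sorted

    def main(args: Array[String]): Unit = {
      // "events" appears twice, so it is the only name reported.
      println(duplicates(Seq("events", "metrics", "events")))  // prints: List(events)
    }
  }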

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/runtime/StreamingRelation.scala

Lines changed: 6 additions & 0 deletions
@@ -37,6 +37,12 @@ object StreamingRelation {
     StreamingRelation(
       dataSource, dataSource.sourceInfo.name, toAttributes(dataSource.sourceInfo.schema))
   }
+
+  def apply(dataSource: DataSource, userProvidedSourceName: Option[String]): StreamingRelation = {
+    StreamingRelation(
+      dataSource, dataSource.sourceInfo.name, toAttributes(dataSource.sourceInfo.schema),
+      userProvidedSourceName)
+  }
 }

 /**

sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSourceSuite.scala

Lines changed: 3 additions & 3 deletions
@@ -190,15 +190,15 @@ abstract class FileStreamSourceTest
   protected def getSourceFromFileStream(df: DataFrame): FileStreamSource = {
     val checkpointLocation = Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath
     df.queryExecution.analyzed
-      .collect { case StreamingRelation(dataSource, _, _) =>
+      .collect { case StreamingRelation(dataSource, _, _, _) =>
         // There is only one source in our tests so just set sourceId to 0
         dataSource.createSource(s"$checkpointLocation/sources/0").asInstanceOf[FileStreamSource]
       }.head
   }

   protected def getSourcesFromStreamingQuery(query: StreamExecution): Seq[FileStreamSource] = {
     query.logicalPlan.collect {
-      case StreamingExecutionRelation(source, _, _) if source.isInstanceOf[FileStreamSource] =>
+      case StreamingExecutionRelation(source, _, _, _) if source.isInstanceOf[FileStreamSource] =>
         source.asInstanceOf[FileStreamSource]
     }
   }
@@ -251,7 +251,7 @@ class FileStreamSourceSuite extends FileStreamSourceTest {
       reader.load()
     }
     df.queryExecution.analyzed
-      .collect { case s @ StreamingRelation(dataSource, _, _) => s.schema }.head
+      .collect { case s @ StreamingRelation(dataSource, _, _, _) => s.schema }.head
   }

   override def beforeAll(): Unit = {
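The test-side churn above is mechanical: once a case class gains a field, every extractor pattern must list it (or ignore it with _). A toy, self-contained sketch of that effect, using made-up types rather than Spark's:

  // Adding a fourth field changes the arity of the generated unapply, so old
  // three-field patterns stop compiling and each gains a trailing `_`.
  case class Relation(source: String, output: Seq[String], catalogTable: Option[String],
      userProvidedSourceName: Option[String] = None)

  object ArityChangeSketch {
    def main(args: Array[String]): Unit = {
      val r = Relation("file", Seq("value"), None)
      r match {
        case Relation(source, _, _, _) => println(s"matched source = $source")
      }
    }
  }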
