apache · cloud-fan · Apr 27, 2020 · Apr 27, 2020 · dongjoon-hyun · Apr 27, 2020
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala
@@ -17,9 +17,11 @@
 
 package org.apache.spark.sql.catalyst.analysis
 
+import scala.collection.mutable
+
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, With}
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias, With}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.{LEGACY_CTE_PRECEDENCE_POLICY, LegacyBehaviorPolicy}
@@ -41,34 +43,44 @@ object CTESubstitution extends Rule[LogicalPlan] {
   }
 
   /**
-   * Check the plan to be traversed has naming conflicts in nested CTE or not, traverse through
-   * child, innerChildren and subquery expressions for the current plan.
+   * Spark 3.0 changes the CTE relations resolution, and inner relations take precedence. This is
+   * correct but we need to warn users about this behavior change under EXCEPTION mode, when we see
+   * CTE relations with conflicting names.
+   *
+   * Note that, before Spark 3.0 the parser didn't support CTE in the FROM clause. For example,
+   * `WITH ... SELECT * FROM (WITH ... SELECT ...)` was not supported. We should not fail for this
+   * case, as Spark versions before 3.0 can't run it anyway. The parameter `startOfQuery` is used
+   * to indicate where we can define CTE relations before Spark 3.0, and we should only check
+   * name conflicts when `startOfQuery` is true.
    */
   private def assertNoNameConflictsInCTE(
       plan: LogicalPlan,
-      outerCTERelationNames: Set[String] = Set.empty,
-      namesInSubqueries: Set[String] = Set.empty): Unit = {
+      outerCTERelationNames: Seq[String] = Nil,
+      startOfQuery: Boolean = true): Unit = {
+    val resolver = SQLConf.get.resolver
     plan match {
-      case w @ With(child, relations) =>
-        val newNames = relations.map {
-          case (cteName, _) =>
-            if (outerCTERelationNames.contains(cteName)) {
-              throw new AnalysisException(s"Name $cteName is ambiguous in nested CTE. " +
+      case With(child, relations) =>
+        val newNames = mutable.ArrayBuffer.empty[String]
+        newNames ++= outerCTERelationNames
+        relations.foreach {
+          case (name, relation) =>
+            if (startOfQuery && outerCTERelationNames.exists(resolver(_, name))) {
+              throw new AnalysisException(s"Name $name is ambiguous in nested CTE. " +
                 s"Please set ${LEGACY_CTE_PRECEDENCE_POLICY.key} to CORRECTED so that name " +
                 "defined in inner CTE takes precedence. If set it to LEGACY, outer CTE " +
                 "definitions will take precedence. See more details in SPARK-28228.")
-            } else {
-              cteName
             }
-        }.toSet ++ namesInSubqueries
-        assertNoNameConflictsInCTE(child, outerCTERelationNames, newNames)
-        w.innerChildren.foreach(assertNoNameConflictsInCTE(_, newNames, newNames))
+            // CTE relation is defined as `SubqueryAlias`. Here we skip it and check the child
+            // directly, so that `startOfQuery` is set correctly.
+            assertNoNameConflictsInCTE(relation.child, newNames)
+            newNames += name
+        }
+        assertNoNameConflictsInCTE(child, newNames, startOfQuery = false)
 
       case other =>
-        other.subqueries.foreach(
-          assertNoNameConflictsInCTE(_, namesInSubqueries, namesInSubqueries))
+        other.subqueries.foreach(assertNoNameConflictsInCTE(_, outerCTERelationNames))
         other.children.foreach(
-          assertNoNameConflictsInCTE(_, outerCTERelationNames, namesInSubqueries))
+          assertNoNameConflictsInCTE(_, outerCTERelationNames, startOfQuery = false))
     }
   }
 

diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
@@ -111,3 +111,28 @@ WHERE c IN (
   WITH t(c) AS (SELECT 2)
   SELECT * FROM t
 );
+
+-- forward name conflict is not a real conflict
+WITH
+  t AS (
+    WITH t2 AS (SELECT 1)
+    SELECT * FROM t2
+  ),
+  t2 AS (SELECT 2)
+SELECT * FROM t;
+
+-- case insensitive name conflicts: in other CTE relations
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t;
+
+-- case insensitive name conflicts: in subquery expressions
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+);
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 16
 
 
 -- !query
@@ -179,3 +179,43 @@ WHERE c IN (
 struct<c:int>
 -- !query output
 1
+
+
+-- !query
+WITH
+  t AS (
+    WITH t2 AS (SELECT 1)
+    SELECT * FROM t2
+  ),
+  t2 AS (SELECT 2)
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+)
+-- !query schema
+struct<scalarsubquery():int>
+-- !query output
+1
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 16
 
 
 -- !query
@@ -64,10 +64,9 @@ WITH
   )
 SELECT * FROM t2
 -- !query schema
-struct<>
+struct<scalarsubquery():int>
 -- !query output
-org.apache.spark.sql.AnalysisException
-Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
+2
 
 
 -- !query
@@ -186,3 +185,45 @@ struct<>
 -- !query output
 org.apache.spark.sql.AnalysisException
 Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
+
+
+-- !query
+WITH
+  t AS (
+    WITH t2 AS (SELECT 1)
+    SELECT * FROM t2
+  ),
+  t2 AS (SELECT 2)
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+)
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out
@@ -1,5 +1,5 @@
 -- Automatically generated by SQLQueryTestSuite
--- Number of queries: 13
+-- Number of queries: 16
 
 
 -- !query
@@ -179,3 +179,43 @@ WHERE c IN (
 struct<c:int>
 -- !query output
 
+
+
+-- !query
+WITH
+  t AS (
+    WITH t2 AS (SELECT 1)
+    SELECT * FROM t2
+  ),
+  t2 AS (SELECT 2)
+SELECT * FROM t
+-- !query schema
+struct<1:int>
+-- !query output
+1
+
+
+-- !query
+WITH
+  abc AS (SELECT 1),
+  t AS (
+    WITH aBc AS (SELECT 2)
+    SELECT * FROM aBC
+  )
+SELECT * FROM t
+-- !query schema
+struct<2:int>
+-- !query output
+2
+
+
+-- !query
+WITH abc AS (SELECT 1)
+SELECT (
+  WITH aBc AS (SELECT 2)
+  SELECT * FROM aBC
+)
+-- !query schema
+struct<scalarsubquery():int>
+-- !query output
+2