diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala index d9fdb56739e0..41e4c1fc3930 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala @@ -17,9 +17,11 @@ package org.apache.spark.sql.catalyst.analysis +import scala.collection.mutable + import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.SubqueryExpression -import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, With} +import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias, With} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.internal.SQLConf.{LEGACY_CTE_PRECEDENCE_POLICY, LegacyBehaviorPolicy} @@ -41,34 +43,44 @@ object CTESubstitution extends Rule[LogicalPlan] { } /** - * Check the plan to be traversed has naming conflicts in nested CTE or not, traverse through - * child, innerChildren and subquery expressions for the current plan. + * Spark 3.0 changes the CTE relations resolution, and inner relations take precedence. This is + * correct but we need to warn users about this behavior change under EXCEPTION mode, when we see + * CTE relations with conflicting names. + * + * Note that, before Spark 3.0 the parser didn't support CTE in the FROM clause. For example, + * `WITH ... SELECT * FROM (WITH ... SELECT ...)` was not supported. We should not fail for this + * case, as Spark versions before 3.0 can't run it anyway. The parameter `startOfQuery` is used + * to indicate where we can define CTE relations before Spark 3.0, and we should only check + * name conflicts when `startOfQuery` is true. */ private def assertNoNameConflictsInCTE( plan: LogicalPlan, - outerCTERelationNames: Set[String] = Set.empty, - namesInSubqueries: Set[String] = Set.empty): Unit = { + outerCTERelationNames: Seq[String] = Nil, + startOfQuery: Boolean = true): Unit = { + val resolver = SQLConf.get.resolver plan match { - case w @ With(child, relations) => - val newNames = relations.map { - case (cteName, _) => - if (outerCTERelationNames.contains(cteName)) { - throw new AnalysisException(s"Name $cteName is ambiguous in nested CTE. " + + case With(child, relations) => + val newNames = mutable.ArrayBuffer.empty[String] + newNames ++= outerCTERelationNames + relations.foreach { + case (name, relation) => + if (startOfQuery && outerCTERelationNames.exists(resolver(_, name))) { + throw new AnalysisException(s"Name $name is ambiguous in nested CTE. " + s"Please set ${LEGACY_CTE_PRECEDENCE_POLICY.key} to CORRECTED so that name " + "defined in inner CTE takes precedence. If set it to LEGACY, outer CTE " + "definitions will take precedence. See more details in SPARK-28228.") - } else { - cteName } - }.toSet ++ namesInSubqueries - assertNoNameConflictsInCTE(child, outerCTERelationNames, newNames) - w.innerChildren.foreach(assertNoNameConflictsInCTE(_, newNames, newNames)) + // CTE relation is defined as `SubqueryAlias`. Here we skip it and check the child + // directly, so that `startOfQuery` is set correctly. + assertNoNameConflictsInCTE(relation.child, newNames) + newNames += name + } + assertNoNameConflictsInCTE(child, newNames, startOfQuery = false) case other => - other.subqueries.foreach( - assertNoNameConflictsInCTE(_, namesInSubqueries, namesInSubqueries)) + other.subqueries.foreach(assertNoNameConflictsInCTE(_, outerCTERelationNames)) other.children.foreach( - assertNoNameConflictsInCTE(_, outerCTERelationNames, namesInSubqueries)) + assertNoNameConflictsInCTE(_, outerCTERelationNames, startOfQuery = false)) } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql index 134f88bf3302..3b64b5daa82d 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql @@ -111,3 +111,28 @@ WHERE c IN ( WITH t(c) AS (SELECT 2) SELECT * FROM t ); + +-- forward name conflict is not a real conflict +WITH + t AS ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 + ), + t2 AS (SELECT 2) +SELECT * FROM t; + +-- case insensitive name conflicts: in other CTE relations +WITH + abc AS (SELECT 1), + t AS ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC + ) +SELECT * FROM t; + +-- case insensitive name conflicts: in subquery expressions +WITH abc AS (SELECT 1) +SELECT ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC +); \ No newline at end of file diff --git a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out index 629daf7410e5..4d0e5ea829d3 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 13 +-- Number of queries: 16 -- !query @@ -179,3 +179,43 @@ WHERE c IN ( struct -- !query output 1 + + +-- !query +WITH + t AS ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 + ), + t2 AS (SELECT 2) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +WITH + abc AS (SELECT 1), + t AS ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC + ) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +WITH abc AS (SELECT 1) +SELECT ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC +) +-- !query schema +struct +-- !query output +1 diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out index 34e9be0ca000..2f736c7b4978 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 13 +-- Number of queries: 16 -- !query @@ -64,10 +64,9 @@ WITH ) SELECT * FROM t2 -- !query schema -struct<> +struct -- !query output -org.apache.spark.sql.AnalysisException -Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.; +2 -- !query @@ -186,3 +185,45 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.; + + +-- !query +WITH + t AS ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 + ), + t2 AS (SELECT 2) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +WITH + abc AS (SELECT 1), + t AS ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC + ) +SELECT * FROM t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.; + + +-- !query +WITH abc AS (SELECT 1) +SELECT ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC +) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.; diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out index 6eba1ad72523..74394ee3ffc8 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 13 +-- Number of queries: 16 -- !query @@ -179,3 +179,43 @@ WHERE c IN ( struct -- !query output + + +-- !query +WITH + t AS ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 + ), + t2 AS (SELECT 2) +SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +WITH + abc AS (SELECT 1), + t AS ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC + ) +SELECT * FROM t +-- !query schema +struct<2:int> +-- !query output +2 + + +-- !query +WITH abc AS (SELECT 1) +SELECT ( + WITH aBc AS (SELECT 2) + SELECT * FROM aBC +) +-- !query schema +struct +-- !query output +2