Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@

package org.apache.spark.sql.catalyst.analysis

import scala.collection.mutable

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.SubqueryExpression
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, With}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias, With}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.{LEGACY_CTE_PRECEDENCE_POLICY, LegacyBehaviorPolicy}
Expand All @@ -41,34 +43,44 @@ object CTESubstitution extends Rule[LogicalPlan] {
}

/**
* Check the plan to be traversed has naming conflicts in nested CTE or not, traverse through
* child, innerChildren and subquery expressions for the current plan.
* Spark 3.0 changes the CTE relations resolution, and inner relations take precedence. This is
* correct but we need to warn users about this behavior change under EXCEPTION mode, when we see
* CTE relations with conflicting names.
*
* Note that, before Spark 3.0 the parser didn't support CTE in the FROM clause. For example,
* `WITH ... SELECT * FROM (WITH ... SELECT ...)` was not supported. We should not fail for this
* case, as Spark versions before 3.0 can't run it anyway. The parameter `startOfQuery` is used
* to indicate where we can define CTE relations before Spark 3.0, and we should only check
* name conflicts when `startOfQuery` is true.
*/
private def assertNoNameConflictsInCTE(
plan: LogicalPlan,
outerCTERelationNames: Set[String] = Set.empty,
namesInSubqueries: Set[String] = Set.empty): Unit = {
outerCTERelationNames: Seq[String] = Nil,
startOfQuery: Boolean = true): Unit = {
val resolver = SQLConf.get.resolver
plan match {
case w @ With(child, relations) =>
val newNames = relations.map {
case (cteName, _) =>
if (outerCTERelationNames.contains(cteName)) {
throw new AnalysisException(s"Name $cteName is ambiguous in nested CTE. " +
case With(child, relations) =>
val newNames = mutable.ArrayBuffer.empty[String]
newNames ++= outerCTERelationNames
relations.foreach {
case (name, relation) =>
if (startOfQuery && outerCTERelationNames.exists(resolver(_, name))) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bad. I missed to check case-sensitivity.

throw new AnalysisException(s"Name $name is ambiguous in nested CTE. " +
s"Please set ${LEGACY_CTE_PRECEDENCE_POLICY.key} to CORRECTED so that name " +
"defined in inner CTE takes precedence. If set it to LEGACY, outer CTE " +
"definitions will take precedence. See more details in SPARK-28228.")
} else {
cteName
}
}.toSet ++ namesInSubqueries
assertNoNameConflictsInCTE(child, outerCTERelationNames, newNames)
w.innerChildren.foreach(assertNoNameConflictsInCTE(_, newNames, newNames))
// CTE relation is defined as `SubqueryAlias`. Here we skip it and check the child
// directly, so that `startOfQuery` is set correctly.
assertNoNameConflictsInCTE(relation.child, newNames)
newNames += name
}
assertNoNameConflictsInCTE(child, newNames, startOfQuery = false)

case other =>
other.subqueries.foreach(
assertNoNameConflictsInCTE(_, namesInSubqueries, namesInSubqueries))
other.subqueries.foreach(assertNoNameConflictsInCTE(_, outerCTERelationNames))
other.children.foreach(
assertNoNameConflictsInCTE(_, outerCTERelationNames, namesInSubqueries))
assertNoNameConflictsInCTE(_, outerCTERelationNames, startOfQuery = false))
}
}

Expand Down
25 changes: 25 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/cte-nested.sql
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,28 @@ WHERE c IN (
WITH t(c) AS (SELECT 2)
SELECT * FROM t
);

-- forward name conflict is not a real conflict
WITH
t AS (
WITH t2 AS (SELECT 1)
SELECT * FROM t2
),
t2 AS (SELECT 2)
SELECT * FROM t;

-- case insensitive name conflicts: in other CTE relations
WITH
abc AS (SELECT 1),
t AS (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
SELECT * FROM t;

-- case insensitive name conflicts: in subquery expressions
WITH abc AS (SELECT 1)
SELECT (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
);
42 changes: 41 additions & 1 deletion sql/core/src/test/resources/sql-tests/results/cte-legacy.sql.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 13
-- Number of queries: 16


-- !query
Expand Down Expand Up @@ -179,3 +179,43 @@ WHERE c IN (
struct<c:int>
-- !query output
1


-- !query
WITH
t AS (
WITH t2 AS (SELECT 1)
SELECT * FROM t2
),
t2 AS (SELECT 2)
SELECT * FROM t
-- !query schema
struct<1:int>
-- !query output
1


-- !query
WITH
abc AS (SELECT 1),
t AS (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
SELECT * FROM t
-- !query schema
struct<1:int>
-- !query output
1


-- !query
WITH abc AS (SELECT 1)
SELECT (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
-- !query schema
struct<scalarsubquery():int>
-- !query output
1
49 changes: 45 additions & 4 deletions sql/core/src/test/resources/sql-tests/results/cte-nested.sql.out
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 13
-- Number of queries: 16


-- !query
Expand Down Expand Up @@ -64,10 +64,9 @@ WITH
)
SELECT * FROM t2
-- !query schema
struct<>
struct<scalarsubquery():int>
-- !query output
org.apache.spark.sql.AnalysisException
Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
Copy link
Contributor Author

@cloud-fan cloud-fan Apr 27, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried this query with Spark 2.4.5 (need to replace t(c) AS (SELECT 1) with t AS (SELECT 1 c) as Spark 2.4 doesn't support t(c) syntax), it fails the parser. So we don't need to fail here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it.

2


-- !query
Expand Down Expand Up @@ -186,3 +185,45 @@ struct<>
-- !query output
org.apache.spark.sql.AnalysisException
Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;


-- !query
WITH
t AS (
WITH t2 AS (SELECT 1)
SELECT * FROM t2
),
t2 AS (SELECT 2)
SELECT * FROM t
-- !query schema
struct<1:int>
-- !query output
1


-- !query
WITH
abc AS (SELECT 1),
t AS (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
SELECT * FROM t
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;


-- !query
WITH abc AS (SELECT 1)
SELECT (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
Name aBc is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedencePolicy to CORRECTED so that name defined in inner CTE takes precedence. If set it to LEGACY, outer CTE definitions will take precedence. See more details in SPARK-28228.;
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
-- Automatically generated by SQLQueryTestSuite
-- Number of queries: 13
-- Number of queries: 16


-- !query
Expand Down Expand Up @@ -179,3 +179,43 @@ WHERE c IN (
struct<c:int>
-- !query output



-- !query
WITH
t AS (
WITH t2 AS (SELECT 1)
SELECT * FROM t2
),
t2 AS (SELECT 2)
SELECT * FROM t
-- !query schema
struct<1:int>
-- !query output
1


-- !query
WITH
abc AS (SELECT 1),
t AS (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
SELECT * FROM t
-- !query schema
struct<2:int>
-- !query output
2


-- !query
WITH abc AS (SELECT 1)
SELECT (
WITH aBc AS (SELECT 2)
SELECT * FROM aBC
)
-- !query schema
struct<scalarsubquery():int>
-- !query output
2