diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index e4d2358b5de0..d69b3d68583f 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -103,7 +103,7 @@ license: | - Since Spark 3.0, if files or subdirectories disappear during recursive directory listing (i.e. they appear in an intermediate listing but then cannot be read or listed during later phases of the recursive directory listing, due to either concurrent file deletions or object store consistency issues) then the listing will fail with an exception unless `spark.sql.files.ignoreMissingFiles` is `true` (default `false`). In previous versions, these missing files or subdirectories would be ignored. Note that this change of behavior only applies during initial table file listing (or during `REFRESH TABLE`), not during query execution: the net change is that `spark.sql.files.ignoreMissingFiles` is now obeyed during table file listing / query planning, not only at query execution time. - - Since Spark 3.0, substitution order of nested WITH clauses is changed and an inner CTE definition takes precedence over an outer. In version 2.4 and earlier, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `1` while in version 3.0 it returns `2`. The previous behaviour can be restored by setting `spark.sql.legacy.ctePrecedence.enabled` to `true`. + - Since Spark 3.0, Spark throws an AnalysisException by default if a name conflict is detected in nested WITH clauses. This forces users to choose the specific substitution order they want, which is controlled by `spark.sql.legacy.ctePrecedence.enabled`. If set to `false` (which is recommended), inner CTE definitions take precedence over outer definitions. For example, with the config set to `false`, `WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2` returns `2`, while with it set to `true` the result is `1`, which is the behavior in version 2.4 and earlier. - Since Spark 3.0, the `add_months` function does not adjust the resulting date to a last day of month if the original date is a last day of months. For example, `select add_months(DATE'2019-02-28', 1)` results `2019-03-28`. In Spark version 2.4 and earlier, the resulting date is adjusted when the original date is a last day of months. For example, adding a month to `2019-02-28` results in `2019-03-31`.
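The behavior change described in the nested-CTE migration note above can be reproduced with a small standalone driver. The following sketch is illustrative only (the object name, local-mode master and app name are assumptions, not part of this change); it runs the example query from the note under each of the three states of `spark.sql.legacy.ctePrecedence.enabled`.

```scala
import org.apache.spark.sql.{AnalysisException, SparkSession}

object NestedCteExample {
  def main(args: Array[String]): Unit = {
    // Local-mode session purely for demonstration.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("nested-cte-precedence")
      .getOrCreate()

    val query =
      "WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2"

    // Default (config unset): the conflicting name `t` is rejected during analysis.
    try {
      spark.sql(query).show()
    } catch {
      case e: AnalysisException => println(s"default: ${e.getMessage}")
    }

    // Inner CTE definitions take precedence: the query returns 2.
    spark.conf.set("spark.sql.legacy.ctePrecedence.enabled", "false")
    spark.sql(query).show()

    // Legacy Spark 2.4 behavior, outer definitions take precedence: the query returns 1.
    spark.conf.set("spark.sql.legacy.ctePrecedence.enabled", "true")
    spark.sql(query).show()

    spark.stop()
  }
}
```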
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala index 60e6bf8db06d..d2be15d87d02 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CTESubstitution.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.catalyst.analysis +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.SubqueryExpression import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, With} import org.apache.spark.sql.catalyst.rules.Rule @@ -28,10 +29,54 @@ import org.apache.spark.sql.internal.SQLConf.LEGACY_CTE_PRECEDENCE_ENABLED */ object CTESubstitution extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = { - if (SQLConf.get.getConf(LEGACY_CTE_PRECEDENCE_ENABLED)) { + val isLegacy = SQLConf.get.getConf(LEGACY_CTE_PRECEDENCE_ENABLED) + if (isLegacy.isEmpty) { + assertNoNameConflictsInCTE(plan, inTraverse = false) + traverseAndSubstituteCTE(plan, inTraverse = false) + } else if (isLegacy.get) { legacyTraverseAndSubstituteCTE(plan) } else { - traverseAndSubstituteCTE(plan, false) + traverseAndSubstituteCTE(plan, inTraverse = false) + } + } + + /** + * Check the plan to be traversed has naming conflicts in nested CTE or not, traverse through + * child, innerChildren and subquery for the current plan. + */ + private def assertNoNameConflictsInCTE( + plan: LogicalPlan, + inTraverse: Boolean, + cteNames: Set[String] = Set.empty): Unit = { + plan.foreach { + case w @ With(child, relations) => + val newNames = relations.map { + case (cteName, _) => + if (cteNames.contains(cteName)) { + throw new AnalysisException(s"Name $cteName is ambiguous in nested CTE. " + + s"Please set ${LEGACY_CTE_PRECEDENCE_ENABLED.key} to false so that name defined " + + "in inner CTE takes precedence. See more details in SPARK-28228.") + } else { + cteName + } + }.toSet + child.transformExpressions { + case e: SubqueryExpression => + assertNoNameConflictsInCTE(e.plan, inTraverse = true, cteNames ++ newNames) + e + } + w.innerChildren.foreach { p => + assertNoNameConflictsInCTE(p, inTraverse = true, cteNames ++ newNames) + } + + case other if inTraverse => + other.transformExpressions { + case e: SubqueryExpression => + assertNoNameConflictsInCTE(e.plan, inTraverse = true, cteNames) + e + } + + case _ => } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index 3ad3416256c7..67457f823eb6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -2077,9 +2077,11 @@ object SQLConf { val LEGACY_CTE_PRECEDENCE_ENABLED = buildConf("spark.sql.legacy.ctePrecedence.enabled") .internal() - .doc("When true, outer CTE definitions takes precedence over inner definitions.") + .doc("When true, outer CTE definitions takes precedence over inner definitions. If set to " + + "false, inner CTE definitions take precedence. 
By default it is not set, and an " + "AnalysisException is thrown when a name conflict is detected in a nested CTE.") .booleanConf - .createWithDefault(false) + .createOptional val LEGACY_ARRAY_EXISTS_FOLLOWS_THREE_VALUED_LOGIC = buildConf("spark.sql.legacy.arrayExistsFollowsThreeValuedLogic") diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql new file mode 100644 index 000000000000..b711bf338ab0 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-nonlegacy.sql @@ -0,0 +1,2 @@ +--SET spark.sql.legacy.ctePrecedence.enabled = false +--IMPORT cte.sql diff --git a/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out new file mode 100644 index 000000000000..2d87781193c2 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/cte-nonlegacy.sql.out @@ -0,0 +1,343 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 27 + + +-- !query +create temporary view t as select * from values 0, 1, 2 as t(id) +-- !query schema +struct<> +-- !query output + + + +-- !query +create temporary view t2 as select * from values 0, 1 as t(id) +-- !query schema +struct<> +-- !query output + + + +-- !query +WITH s AS (SELECT 1 FROM s) SELECT * FROM s +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: s; line 1 pos 25 + + +-- !query +WITH r AS (SELECT (SELECT * FROM r)) +SELECT * FROM r +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: r; line 1 pos 33 + + +-- !query +WITH t AS (SELECT 1 FROM t) SELECT * FROM t +-- !query schema +struct<1:int> +-- !query output +1 +1 +1 + + +-- !query +WITH s1 AS (SELECT 1 FROM s2), s2 AS (SELECT 1 FROM s1) SELECT * FROM s1, s2 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Table or view not found: s2; line 1 pos 26 + + +-- !query +WITH t1 AS (SELECT * FROM t2), t2 AS (SELECT 2 FROM t1) SELECT * FROM t1 cross join t2 +-- !query schema +struct +-- !query output +0 2 +0 2 +1 2 +1 2 + + +-- !query +WITH CTE1 AS ( + SELECT b.id AS id + FROM T2 a + CROSS JOIN (SELECT id AS id FROM T2) b +) +SELECT t1.id AS c1, + t2.id AS c2 +FROM CTE1 t1 + CROSS JOIN CTE1 t2 +-- !query schema +struct +-- !query output +0 0 +0 0 +0 0 +0 0 +0 1 +0 1 +0 1 +0 1 +1 0 +1 0 +1 0 +1 0 +1 1 +1 1 +1 1 +1 1 + + +-- !query +WITH t(x) AS (SELECT 1) +SELECT * FROM t WHERE x = 1 +-- !query schema +struct +-- !query output +1 + + +-- !query +WITH t(x, y) AS (SELECT 1, 2) +SELECT * FROM t WHERE x = 1 AND y = 2 +-- !query schema +struct +-- !query output +1 2 + + +-- !query +WITH t(x, x) AS (SELECT 1, 2) +SELECT * FROM t +-- !query schema +struct +-- !query output +1 2 + + +-- !query +WITH t() AS (SELECT 1) +SELECT * FROM t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +no viable alternative at input 'WITH t()'(line 1, pos 7) + +== SQL == +WITH t() AS (SELECT 1) +-------^^^ +SELECT * FROM t + + +-- !query +WITH + t(x) AS (SELECT 1), + t(x) AS (SELECT 2) +SELECT * FROM t +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.catalyst.parser.ParseException + +CTE definition can't have duplicate names: 't'.(line 1, pos 0) + +== SQL == +WITH +^^^ + t(x) AS (SELECT 1), + t(x) AS (SELECT 2) +SELECT * FROM t + + +-- !query +WITH t as ( + WITH t2 AS (SELECT 1) + SELECT * FROM t2 +) +SELECT * 
FROM t +-- !query schema +struct<1:int> +-- !query output +1 + + +-- !query +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 1) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +1 + + +-- !query +SELECT ( + WITH t AS (SELECT 1) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +1 + + +-- !query +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +SELECT * FROM t2 +-- !query schema +struct<2:int> +-- !query output +2 + + +-- !query +WITH + t(c) AS (SELECT 1), + t2 AS ( + SELECT ( + SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) + ) + ) +SELECT * FROM t2 +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH + t AS (SELECT 1), + t2 AS ( + WITH t AS (SELECT 2), + t2 AS ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) + SELECT * FROM t2 + ) +SELECT * FROM t2 +-- !query schema +struct<3:int> +-- !query output +3 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT max(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 2) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t(c) AS (SELECT 1) +SELECT sum(c) FROM ( + WITH t(c) AS (SELECT 2) + SELECT max(c) AS c FROM ( + WITH t(c) AS (SELECT 3) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +3 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + SELECT ( + WITH t AS (SELECT 2) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +2 + + +-- !query +WITH t AS (SELECT 1) +SELECT ( + WITH t AS (SELECT 2) + SELECT ( + WITH t AS (SELECT 3) + SELECT * FROM t + ) +) +-- !query schema +struct +-- !query output +3 + + +-- !query +DROP VIEW IF EXISTS t +-- !query schema +struct<> +-- !query output + + + +-- !query +DROP VIEW IF EXISTS t2 +-- !query schema +struct<> +-- !query output + diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out b/sql/core/src/test/resources/sql-tests/results/cte.sql.out index 2d87781193c2..1d50aa8f5750 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out @@ -204,9 +204,10 @@ WITH ) SELECT * FROM t2 -- !query schema -struct<2:int> +struct<> -- !query output -2 +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -222,9 +223,10 @@ WITH ) SELECT * FROM t2 -- !query schema -struct +struct<> -- !query output -2 +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -240,9 +242,10 @@ WITH ) SELECT * FROM t2 -- !query schema -struct<3:int> +struct<> -- !query output -3 +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. 
See more details in SPARK-28228.; -- !query @@ -293,9 +296,10 @@ SELECT ( SELECT * FROM t ) -- !query schema -struct +struct<> -- !query output -2 +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -307,9 +311,10 @@ SELECT ( ) ) -- !query schema -struct +struct<> -- !query output -2 +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query @@ -322,9 +327,10 @@ SELECT ( ) ) -- !query schema -struct +struct<> -- !query output -3 +org.apache.spark.sql.AnalysisException +Name t is ambiguous in nested CTE. Please set spark.sql.legacy.ctePrecedence.enabled to false so that name defined in inner CTE takes precedence. See more details in SPARK-28228.; -- !query
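Switching the conf from `createWithDefault(false)` to `createOptional` is what makes the new default possible: a boolean with a default cannot distinguish "the user explicitly asked for inner precedence" from "the user never set the flag", whereas an optional conf is read back as `Option[Boolean]`. A simplified sketch of the resulting three-way dispatch is below; the object and method names are illustrative, and the real branch lives in `CTESubstitution.apply`.

```scala
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.LEGACY_CTE_PRECEDENCE_ENABLED

object CtePrecedenceDispatch {
  // Simplified mirror of the branch in CTESubstitution.apply: an optional conf
  // is read as Option[Boolean], so "unset" stays distinguishable from true/false.
  def describe(): String =
    SQLConf.get.getConf(LEGACY_CTE_PRECEDENCE_ENABLED) match {
      case None =>
        "unset (default): assert there are no name conflicts, then substitute with inner precedence"
      case Some(true) =>
        "legacy: outer CTE definitions take precedence"
      case Some(false) =>
        "inner CTE definitions take precedence"
    }
}
```

The two golden files pin down two of these states: `cte.sql.out` now records the AnalysisException raised under the default, and `cte-nonlegacy.sql.out` records the inner-precedence results under `spark.sql.legacy.ctePrecedence.enabled = false`.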