From 23699cefa321833d8949fb2d673efe0dab431179 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Sat, 5 Feb 2022 23:10:36 -0800 Subject: [PATCH 1/3] [SPARK-38118] func(wrong type) in the HAVING clause should throw data mismatch error. --- .../sql/catalyst/analysis/Analyzer.scala | 28 +++++++++++++++++-- .../sql/catalyst/analysis/CheckAnalysis.scala | 2 +- .../sql/catalyst/analysis/unresolved.scala | 5 ++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 25 +++++++++++++++++ 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index 18684bdad63c..195c25177da8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -28,6 +28,7 @@ import scala.util.{Failure, Random, Success, Try} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst._ +import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer.{extraHintForAnsiTypeCoercionExpression, DATA_TYPE_MISMATCH_ERROR} import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.encoders.OuterScopes import org.apache.spark.sql.catalyst.expressions.{Expression, FrameLessOffsetWindowFunction, _} @@ -4247,7 +4248,30 @@ object ApplyCharTypePadding extends Rule[LogicalPlan] { * rule right after the main resolution batch. */ object RemoveTempResolvedColumn extends Rule[LogicalPlan] { - override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveExpressions { - case t: TempResolvedColumn => UnresolvedAttribute(t.nameParts) + override def apply(plan: LogicalPlan): LogicalPlan = { + plan.foreachUp { + // HAVING clause will be resolved as a Filter. When having func(column with wrong data type), + // the column could be wrapped by a TempResolvedColumn, e.g. mean(tempresolvedcolumn(t.c)). 
+ // Because TempResolvedColumn can still preserve column data type, here is a chance to check + // if the data type matches with the required data type of the function. We can throw an error + // when data types mismatch. + case operator: Filter => + operator.expressions.foreach(_.foreachUp { + case e: Expression if e.childrenResolved && e.checkInputDataTypes().isFailure => + e.checkInputDataTypes() match { + case TypeCheckResult.TypeCheckFailure(message) => + e.setTagValue(DATA_TYPE_MISMATCH_ERROR, true) + e.failAnalysis( + s"cannot resolve '${e.sql}' due to data type mismatch: $message" + + extraHintForAnsiTypeCoercionExpression(plan)) + } + case _ => + }) + case _ => + } + + plan.resolveExpressions { + case t: TempResolvedColumn => UnresolvedAttribute(t.nameParts) + } } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index d06996a09df0..2da2686bdc84 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -623,7 +623,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog { } } - private def extraHintForAnsiTypeCoercionExpression(plan: LogicalPlan): String = { + private[analysis] def extraHintForAnsiTypeCoercionExpression(plan: LogicalPlan): String = { if (!SQLConf.get.ansiEnabled) { "" } else { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index b861e5df72c3..bc6917c8d05a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -645,4 +645,9 @@ case class TempResolvedColumn(child: Expression, nameParts: 
Seq[String]) extends override def dataType: DataType = child.dataType override protected def withNewChildInternal(newChild: Expression): Expression = copy(child = newChild) + + override def sql: String = { + val childrenSQL = children.map(_.sql).mkString(", ") + s"$childrenSQL" + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 974e489dcc0a..0159a7987225 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -4294,6 +4294,31 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark Row(3, 2, 6) :: Nil) } } + + test("SPARK-38118: Func(wrong_type) in the HAVING clause should throw data mismatch error") { + Seq("mean", "abs").foreach { func => + val e1 = intercept[AnalysisException]( + sql( + s""" + |WITH t as (SELECT true c) + |SELECT t.c + |FROM t + |GROUP BY t.c + |HAVING ${func}(t.c) > 0d""".stripMargin)) + + assert(e1.message.contains(s"cannot resolve '$func(t.c)' due to data type mismatch")) + + val e2 = intercept[AnalysisException]( + sql( + s""" + |WITH t as (SELECT true c, false d) + |SELECT (t.c AND t.d) c + |FROM t + |GROUP BY t.c + |HAVING ${func}(c) > 0d""".stripMargin)) + assert(e2.message.contains(s"cannot resolve '$func(t.c)' due to data type mismatch")) + } + } } case class Foo(bar: Option[String]) From faa12dd01b5439b4aeeafacf6ce3ede04710a27c Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 16 Feb 2022 13:45:49 -0800 Subject: [PATCH 2/3] address comments --- .../org/apache/spark/sql/catalyst/analysis/unresolved.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index bc6917c8d05a..3d70a05055a7 100644 --- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -647,7 +647,6 @@ case class TempResolvedColumn(child: Expression, nameParts: Seq[String]) extends copy(child = newChild) override def sql: String = { - val childrenSQL = children.map(_.sql).mkString(", ") - s"$childrenSQL" + s"${child.sql}" } } From 850e500fc7f748ee897aee8b1541ed3beb07c573 Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Thu, 17 Feb 2022 13:35:07 -0800 Subject: [PATCH 3/3] update --- .../sql/catalyst/analysis/unresolved.scala | 5 +--- .../sql/catalyst/analysis/AnalysisSuite.scala | 24 ++++++++++++++++++ .../org/apache/spark/sql/SQLQuerySuite.scala | 25 ------------------- 3 files changed, 25 insertions(+), 29 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala index 3d70a05055a7..c8ef71eb8b89 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/unresolved.scala @@ -645,8 +645,5 @@ case class TempResolvedColumn(child: Expression, nameParts: Seq[String]) extends override def dataType: DataType = child.dataType override protected def withNewChildInternal(newChild: Expression): Expression = copy(child = newChild) - - override def sql: String = { - s"${child.sql}" - } + override def sql: String = child.sql } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala index 63f90a8d6b88..ff05b797e7cd 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala @@ 
-1150,4 +1150,28 @@ class AnalysisSuite extends AnalysisTest with Matchers { "MISSING_COLUMN", Array("c.y", "x")) } + + test("SPARK-38118: Func(wrong_type) in the HAVING clause should throw data mismatch error") { + Seq("mean", "abs").foreach { func => + assertAnalysisError(parsePlan( + s""" + |WITH t as (SELECT true c) + |SELECT t.c + |FROM t + |GROUP BY t.c + |HAVING ${func}(t.c) > 0d""".stripMargin), + Seq(s"cannot resolve '$func(t.c)' due to data type mismatch"), + false) + + assertAnalysisError(parsePlan( + s""" + |WITH t as (SELECT true c, false d) + |SELECT (t.c AND t.d) c + |FROM t + |GROUP BY t.c + |HAVING ${func}(c) > 0d""".stripMargin), + Seq(s"cannot resolve '$func(t.c)' due to data type mismatch"), + false) + } + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 0159a7987225..974e489dcc0a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -4294,31 +4294,6 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark Row(3, 2, 6) :: Nil) } } - - test("SPARK-38118: Func(wrong_type) in the HAVING clause should throw data mismatch error") { - Seq("mean", "abs").foreach { func => - val e1 = intercept[AnalysisException]( - sql( - s""" - |WITH t as (SELECT true c) - |SELECT t.c - |FROM t - |GROUP BY t.c - |HAVING ${func}(t.c) > 0d""".stripMargin)) - - assert(e1.message.contains(s"cannot resolve '$func(t.c)' due to data type mismatch")) - - val e2 = intercept[AnalysisException]( - sql( - s""" - |WITH t as (SELECT true c, false d) - |SELECT (t.c AND t.d) c - |FROM t - |GROUP BY t.c - |HAVING ${func}(c) > 0d""".stripMargin)) - assert(e2.message.contains(s"cannot resolve '$func(t.c)' due to data type mismatch")) - } - } } case class Foo(bar: Option[String])