From a186dfd39e47fa7d1f4e6ced88190ecb4450eaf3 Mon Sep 17 00:00:00 2001 From: Aman Omer Date: Mon, 9 Dec 2019 17:08:17 +0530 Subject: [PATCH 1/6] Initial commit --- .../spark/sql/catalyst/expressions/collectionOperations.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index d5d42510842e..ff2121da21d0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -1081,7 +1081,7 @@ case class ArrayContains(left: Expression, right: Expression) (left.dataType, right.dataType) match { case (_, NullType) => Seq.empty case (ArrayType(e1, hasNull), e2) => - TypeCoercion.findTightestCommonType(e1, e2) match { + TypeCoercion.findWiderTypeForTwo(e1, e2) match { case Some(dt) => Seq(ArrayType(dt, hasNull), dt) case _ => Seq.empty } From 71b7ad35f33d92d8d73161198260b5d3e36848e0 Mon Sep 17 00:00:00 2001 From: Aman Omer Date: Mon, 9 Dec 2019 18:35:29 +0530 Subject: [PATCH 2/6] Test case --- .../spark/sql/DataFrameFunctionsSuite.scala | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index a346377cd1bc..22ae235c5b0f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -850,19 +850,19 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { val errorMsg1 = s""" |Input to function array_contains should have been array followed by a - |value with same element type, but it's [array, decimal(29,29)]. + |value with same element type, but it's [array, decimal(38,29)]. """.stripMargin.replace("\n", " ").trim() assert(e1.message.contains(errorMsg1)) - val e2 = intercept[AnalysisException] { - OneRowRelation().selectExpr("array_contains(array(1), 'foo')") - } - val errorMsg2 = - s""" - |Input to function array_contains should have been array followed by a - |value with same element type, but it's [array, string]. - """.stripMargin.replace("\n", " ").trim() - assert(e2.message.contains(errorMsg2)) + checkAnswer( + OneRowRelation().selectExpr("array_contains(array(1), 'foo')"), + Seq(Row(false)) + ) + + checkAnswer( + sql("select array_contains(array(1.0, 2.02), 1.0)"), + Seq(Row(true)) + ) } test("arrays_overlap function") { From d2ce3aedd3431fc6b56f711f5d88decb60fcbe35 Mon Sep 17 00:00:00 2001 From: Aman Omer Date: Mon, 9 Dec 2019 20:13:01 +0530 Subject: [PATCH 3/6] Updated migration guide --- docs/sql-migration-guide.md | 46 +++++++++++++++++++ .../spark/sql/DataFrameFunctionsSuite.scala | 12 ++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 9bcd36ce4127..736a7fdebf3c 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -256,6 +256,52 @@ license: | - Since Spark 3.0, the unary arithmetic operator plus(`+`) only accepts string, numeric and interval type values as inputs. Besides, `+` with a integral string representation will be coerced to double value, e.g. `+'1'` results `1.0`. In Spark version 2.4 and earlier, this operator is ignored. There is no type checking for it, thus, all type values with a `+` prefix are valid, e.g. `+ array(1, 2)` is valid and results `[1, 2]`. Besides, there is no type coercion for it at all, e.g. in Spark 2.4, the result of `+'1'` is string `1`. + - Since Spark 3.0, the parameter(first or second) to array_contains function is implicitly promoted to the wider type parameter. + + + + + + + + + + + + + + + + + + + +
+ Query + + Spark 2.4 + + Spark 3.0 + + Remarks +
+ select array_contains(array(1.10), 1.1); + + AnalysisException is thrown. + + True + + In spark 2.4, left parameter is of array(decimal(3,2)) where as right parameter is of decimal(2,1) +
+ select array_contains(array(1.1), 1.10); + + AnalysisException is thrown. + + True + + In spark 2.4, left parameter is of array(decimal(2,1)) where as right parameter is of decimal(3,2) +
+ ## Upgrading from Spark SQL 2.4 to 2.4.1 - The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 22ae235c5b0f..39876adb0596 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -860,9 +860,19 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { ) checkAnswer( - sql("select array_contains(array(1.0, 2.02), 1.0)"), + sql("select array_contains(array(1.10), 1.1)"), Seq(Row(true)) ) + + checkAnswer( + sql("SELECT array_contains(array(1.1), 1.10)"), + Seq(Row(true)) + ) + + checkAnswer( + sql("SELECT array_contains(array(1.11), 1.1)"), + Seq(Row(false)) + ) } test("arrays_overlap function") { From 74123680aa53a7f8d734998490276b539094a6a4 Mon Sep 17 00:00:00 2001 From: Aman Omer Date: Thu, 12 Dec 2019 15:36:56 +0530 Subject: [PATCH 4/6] findWiderTypeForTwo -> findWiderTypeWithoutStringPromotionForTwo --- .../spark/sql/catalyst/analysis/TypeCoercion.scala | 2 +- .../catalyst/expressions/collectionOperations.scala | 2 +- .../apache/spark/sql/DataFrameFunctionsSuite.scala | 13 +++++++++---- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala index e76193fd9422..f94e733ad813 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/TypeCoercion.scala @@ -243,7 +243,7 @@ object TypeCoercion { * string. If the wider decimal type exceeds system limitation, this rule will truncate * the decimal type before return it. */ - private[analysis] def findWiderTypeWithoutStringPromotionForTwo( + private[catalyst] def findWiderTypeWithoutStringPromotionForTwo( t1: DataType, t2: DataType): Option[DataType] = { findTightestCommonType(t1, t2) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index ff2121da21d0..6ed68e47ce7a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -1081,7 +1081,7 @@ case class ArrayContains(left: Expression, right: Expression) (left.dataType, right.dataType) match { case (_, NullType) => Seq.empty case (ArrayType(e1, hasNull), e2) => - TypeCoercion.findWiderTypeForTwo(e1, e2) match { + TypeCoercion.findWiderTypeWithoutStringPromotionForTwo(e1, e2) match { case Some(dt) => Seq(ArrayType(dt, hasNull), dt) case _ => Seq.empty } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 39876adb0596..6b04a4ea3578 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -854,10 +854,15 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { """.stripMargin.replace("\n", " ").trim() assert(e1.message.contains(errorMsg1)) - checkAnswer( - OneRowRelation().selectExpr("array_contains(array(1), 'foo')"), - Seq(Row(false)) - ) + val e2 = intercept[AnalysisException] { + OneRowRelation().selectExpr("array_contains(array(1), 'foo')") + } + val errorMsg2 = + s""" + |Input to function array_contains should have been array followed by a + |value with same element type, but it's [array, string]. + """.stripMargin.replace("\n", " ").trim() + assert(e2.message.contains(errorMsg2)) checkAnswer( sql("select array_contains(array(1.10), 1.1)"), From 6c3545de883ad27dab9814b1576c8495cd476048 Mon Sep 17 00:00:00 2001 From: Aman Omer Date: Thu, 12 Dec 2019 19:38:36 +0530 Subject: [PATCH 5/6] revert migration guide changes --- docs/sql-migration-guide.md | 46 ------------------------------------- 1 file changed, 46 deletions(-) diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md index 736a7fdebf3c..9bcd36ce4127 100644 --- a/docs/sql-migration-guide.md +++ b/docs/sql-migration-guide.md @@ -256,52 +256,6 @@ license: | - Since Spark 3.0, the unary arithmetic operator plus(`+`) only accepts string, numeric and interval type values as inputs. Besides, `+` with a integral string representation will be coerced to double value, e.g. `+'1'` results `1.0`. In Spark version 2.4 and earlier, this operator is ignored. There is no type checking for it, thus, all type values with a `+` prefix are valid, e.g. `+ array(1, 2)` is valid and results `[1, 2]`. Besides, there is no type coercion for it at all, e.g. in Spark 2.4, the result of `+'1'` is string `1`. - - Since Spark 3.0, the parameter(first or second) to array_contains function is implicitly promoted to the wider type parameter. - - - - - - - - - - - - - - - - - - - -
- Query - - Spark 2.4 - - Spark 3.0 - - Remarks -
- select array_contains(array(1.10), 1.1); - - AnalysisException is thrown. - - True - - In spark 2.4, left parameter is of array(decimal(3,2)) where as right parameter is of decimal(2,1) -
- select array_contains(array(1.1), 1.10); - - AnalysisException is thrown. - - True - - In spark 2.4, left parameter is of array(decimal(2,1)) where as right parameter is of decimal(3,2) -
- ## Upgrading from Spark SQL 2.4 to 2.4.1 - The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was From 7b12d6cf78d2228d36192e78cca5a12648c1db80 Mon Sep 17 00:00:00 2001 From: Aman Omer Date: Fri, 13 Dec 2019 18:02:49 +0530 Subject: [PATCH 6/6] Split UT --- .../scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala index 6b04a4ea3578..584768eff700 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala @@ -863,7 +863,9 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSparkSession { |value with same element type, but it's [array, string]. """.stripMargin.replace("\n", " ").trim() assert(e2.message.contains(errorMsg2)) + } + test("SPARK-29600: ArrayContains function may return incorrect result for DecimalType") { checkAnswer( sql("select array_contains(array(1.10), 1.1)"), Seq(Row(true))