From d61fb6e862e238ec4a5bdd7133c9e07c9e16a379 Mon Sep 17 00:00:00 2001
From: Dilip Biswal
Date: Wed, 12 Sep 2018 12:06:14 -0700
Subject: [PATCH 1/8] [SPARK-25417] ArrayContains function may return
 incorrect result when right expression is implicitly downcast

---
 .../expressions/collectionOperations.scala    | 29 ++++++++------
 .../spark/sql/DataFrameFunctionsSuite.scala   | 38 +++++++++++++++++++
 2 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index cc9edcfd41d0..8575068ba851 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -1331,26 +1331,31 @@ case class ArrayContains(left: Expression, right: Expression)
   @transient private lazy val ordering: Ordering[Any] =
     TypeUtils.getInterpretedOrdering(right.dataType)
 
-  override def inputTypes: Seq[AbstractDataType] = right.dataType match {
-    case NullType => Seq.empty
-    case _ => left.dataType match {
-      case n @ ArrayType(element, _) => Seq(n, element)
+  override def inputTypes: Seq[AbstractDataType] = {
+    (left.dataType, right.dataType) match {
+      case (_, NullType) => Seq.empty
+      case (ArrayType(e1, hasNull), e2) =>
+        TypeCoercion.findTightestCommonType(e1, e2) match {
+          case Some(dt) => Seq(ArrayType(dt, hasNull), dt)
+          case _ => Seq.empty
+        }
       case _ => Seq.empty
     }
   }
 
   override def checkInputDataTypes(): TypeCheckResult = {
-    if (right.dataType == NullType) {
-      TypeCheckResult.TypeCheckFailure("Null typed values cannot be used as arguments")
-    } else if (!left.dataType.isInstanceOf[ArrayType]
-      || !left.dataType.asInstanceOf[ArrayType].elementType.sameType(right.dataType)) {
-      TypeCheckResult.TypeCheckFailure(
-        "Arguments must be an array followed by a value of same type as the array members")
-    } else {
-      TypeUtils.checkForOrderingExpr(right.dataType, s"function $prettyName")
+    (left.dataType, right.dataType) match {
+      case (_, NullType) =>
+        TypeCheckResult.TypeCheckFailure("Null typed values cannot be used as arguments")
+      case (ArrayType(e1, _), e2) if e1.sameType(e2) =>
+        TypeUtils.checkForOrderingExpr(right.dataType, s"function $prettyName")
+      case _ => TypeCheckResult.TypeCheckFailure(s"Input to function $prettyName should have " +
+        s"been ${ArrayType.simpleString} followed by a value with same element type, but it's " +
+        s"[${left.dataType.catalogString}, ${right.dataType.catalogString}].")
     }
   }
 
+
   override def nullable: Boolean = {
     left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull
   }

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 121db442c77f..990edcdc669b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -735,6 +735,44 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
       df.selectExpr("array_contains(array(1, null), array(1, null)[0])"),
       Seq(Row(true), Row(true))
     )
+
+    checkAnswer(
+      df.selectExpr("array_contains(array(1), 1.23D)"),
+      Seq(Row(false), Row(false))
+    )
+
+    checkAnswer(
+      df.selectExpr("array_contains(array(1), 1.0D)"),
+      Seq(Row(true), Row(true))
+    )
+
+    checkAnswer(
+      df.selectExpr("array_contains(array(1.0D), 1)"),
+      Seq(Row(true), Row(true))
+    )
+
+    checkAnswer(
+      df.selectExpr("array_contains(array(1.23D), 1)"),
+      Seq(Row(false), Row(false))
+    )
+
+    checkAnswer(
+      df.selectExpr("array_contains(array(array(1)), array(1.0D))"),
+      Seq(Row(true), Row(true))
+    )
+
+    checkAnswer(
+      df.selectExpr("array_contains(array(array(1)), array(1.23D))"),
+      Seq(Row(false), Row(false))
+    )
+
+    intercept[AnalysisException] {
+      df.selectExpr("array_contains(array(1), 1.23)")
+    }
+
+    intercept[AnalysisException] {
+      df.selectExpr("array_contains(array(1), 'foo')")
+    }
   }
 
   test("arrays_overlap function") {

From ce6d47a8fc2f8255ec2b1d91d63833de79b6c14d Mon Sep 17 00:00:00 2001
From: Dilip Biswal
Date: Wed, 12 Sep 2018 19:42:12 -0700
Subject: [PATCH 2/8] python test failure

---
 python/pyspark/sql/tests.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 603f994dc959..8d849fa82289 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -1498,8 +1498,7 @@ def test_array_contains_function(self):
         from pyspark.sql.functions import array_contains
 
         df = self.spark.createDataFrame([(["1", "2", "3"],), ([],)], ['data'])
-        actual = df.select(array_contains(df.data, 1).alias('b')).collect()
-        # The value argument can be implicitly castable to the element's type of the array.
+        actual = df.select(array_contains(df.data, "1").alias('b')).collect()
         self.assertEqual([Row(b=True), Row(b=False)], actual)
 
     def test_between_function(self):

From bb3dbd54be807c0d1f5f643ca975b199881b2960 Mon Sep 17 00:00:00 2001
From: Dilip Biswal
Date: Thu, 13 Sep 2018 00:36:40 -0700
Subject: [PATCH 3/8] Code review

---
 .../expressions/collectionOperations.scala   |  1 -
 .../spark/sql/DataFrameFunctionsSuite.scala  | 16 ++++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index 8575068ba851..fb2c663d4c89 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -1355,7 +1355,6 @@ case class ArrayContains(left: Expression, right: Expression)
     }
   }
 
-
   override def nullable: Boolean = {
     left.nullable || right.nullable || left.dataType.asInstanceOf[ArrayType].containsNull
   }

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 990edcdc669b..37522e5eb144 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -766,13 +766,25 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
       Seq(Row(false), Row(false))
     )
 
-    intercept[AnalysisException] {
+    val e1 = intercept[AnalysisException] {
       df.selectExpr("array_contains(array(1), 1.23)")
     }
+    val errorMsg1 =
+      s"""
+         |Input to function array_contains should have been array followed by a
+         |value with same element type, but it's [array<int>, decimal(3,2)].
+ """.stripMargin.replace("\n", " ").trim() + assert(e1.message.contains(errorMsg1)) - intercept[AnalysisException] { + val e2 = intercept[AnalysisException] { df.selectExpr("array_contains(array(1), 'foo')") } + val errorMsg2 = + s""" + |Input to function array_contains should have been array followed by a + |value with same element type, but it's [array, string]. + """.stripMargin.replace("\n", " ").trim() + assert(e2.message.contains(errorMsg2)) } test("arrays_overlap function") { From 9a5cc2cfd4618b72fd7d7a3110c7a1667fac71ae Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Thu, 13 Sep 2018 01:12:45 -0700 Subject: [PATCH 4/8] minor --- .../spark/sql/catalyst/expressions/collectionOperations.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index fb2c663d4c89..e23ebef9643f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -1348,7 +1348,7 @@ case class ArrayContains(left: Expression, right: Expression) case (_, NullType) => TypeCheckResult.TypeCheckFailure("Null typed values cannot be used as arguments") case (ArrayType(e1, _), e2) if e1.sameType(e2) => - TypeUtils.checkForOrderingExpr(right.dataType, s"function $prettyName") + TypeUtils.checkForOrderingExpr(e2, s"function $prettyName") case _ => TypeCheckResult.TypeCheckFailure(s"Input to function $prettyName should have " + s"been ${ArrayType.simpleString} followed by a value with same element type, but it's " + s"[${left.dataType.catalogString}, ${right.dataType.catalogString}].") From 56d131b62fadfa2d625b865a3b5d0acad09869bc Mon Sep 17 00:00:00 2001 From: Dilip Biswal Date: Mon, 17 Sep 2018 00:51:09 -0700 Subject: [PATCH 5/8] Migration guide --- docs/sql-programming-guide.md | 75 ++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index c76f2e30e677..a4164397446d 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -1879,6 +1879,80 @@ working with timestamps in `pandas_udf`s to get the best performance, see ## Upgrading From Spark SQL 2.3 to 2.4 + - In Spark version 2.3 and earlier, the second parameter to array_contains function is implicitly promoted to the element type of first array type parameter. This type promotion can be lossy and may cause `array_contains` function to return wrong result. This problem has been addressed in 2.4 by employing a safer type promotion mechanism. This can cause some change in behavior and are illustrated in the table below. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+    <table class="table">
+      <tr>
+        <th>
+          <b>Query</b>
+        </th>
+        <th>
+          <b>Result Spark 2.3 or Prior</b>
+        </th>
+        <th>
+          <b>Result Spark 2.4</b>
+        </th>
+        <th>
+          <b>Remarks</b>
+        </th>
+      </tr>
+      <tr>
+        <td>
+          <code>SELECT <br> array_contains(array(1), 1.34D);</code>
+        </td>
+        <td>
+          <code>true</code>
+        </td>
+        <td>
+          <code>false</code>
+        </td>
+        <td>
+          In Spark 2.4, both left and right parameters are promoted to array(double) and double type respectively.
+        </td>
+      </tr>
+      <tr>
+        <td>
+          <code>SELECT <br> array_contains(array(1), 1.34);</code>
+        </td>
+        <td>
+          <code>true</code>
+        </td>
+        <td>
+          An AnalysisException is thrown since integer type cannot be promoted to decimal type in a lossless manner.
+        </td>
+        <td>
+          Users can use an explicit cast.
+        </td>
+      </tr>
+      <tr>
+        <td>
+          <code>SELECT <br> array_contains(array(1), '1');</code>
+        </td>
+        <td>
+          <code>true</code>
+        </td>
+        <td>
+          An AnalysisException is thrown since integer type cannot be promoted to string type in a lossless manner.
+        </td>
+        <td>
+          Users can use an explicit cast.
+        </td>
+      </tr>
+      <tr>
+        <td>
+          <code>SELECT <br> array_contains(array(1), 'anystring');</code>
+        </td>
+        <td>
+          <code>null</code>
+        </td>
+        <td>
+          An AnalysisException is thrown since integer type cannot be promoted to string type in a lossless manner.
+        </td>
+        <td>
+          Users can use an explicit cast.
+        </td>
+      </tr>
+    </table>
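The rows above can be reproduced interactively. The snippet below is an illustrative sketch of the intended 2.4 behavior, assuming a `SparkSession` named `spark` built from this branch (the variable name is an assumption, not part of the patch):

```scala
// Sketch only: `spark` is assumed to be an active SparkSession on this branch.
spark.sql("SELECT array_contains(array(1), 1.0D)").show()
// true: array<int> and double share the tightest common type, double

spark.sql("SELECT array_contains(array(1), 1.34D)").show()
// false: the comparison is done as doubles, and no element equals 1.34

// This now fails analysis instead of silently casting '1' down to int:
// spark.sql("SELECT array_contains(array(1), '1')")  // AnalysisException

// An explicit cast restores the pre-2.4 answer:
spark.sql("SELECT array_contains(array(1), CAST('1' AS INT))").show()  // true
```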
   - Since Spark 2.4, when there is a struct field in front of the IN operator before a subquery, the inner query must contain a struct field as well. In previous versions, instead, the fields of the struct were compared to the output of the inner query. E.g. if `a` is a `struct(a string, b int)`, in Spark 2.4 `a in (select (1 as a, 'a' as b) from range(1))` is a valid query, while `a in (select 1, 'a' from range(1))` is not. In previous versions it was the opposite.
   - In versions 2.2.1+ and 2.3, if `spark.sql.caseSensitive` is set to true, then the `CURRENT_DATE` and `CURRENT_TIMESTAMP` functions incorrectly became case-sensitive and would resolve to columns (unless typed in lower case). In Spark 2.4 this has been fixed and the functions are no longer case-sensitive.
   - Since Spark 2.4, Spark will evaluate the set operations referenced in a query by following a precedence rule as per the SQL standard. If the order is not specified by parentheses, set operations are performed from left to right, with the exception that all INTERSECT operations are performed before any UNION, EXCEPT or MINUS operations. The old behaviour of giving equal precedence to all the set operations is preserved under a newly added configuration `spark.sql.legacy.setopsPrecedence.enabled` with a default value of `false`. When this property is set to `true`, Spark will evaluate the set operators from left to right as they appear in the query, given that no explicit ordering is enforced by usage of parentheses.
   - The `percentile_approx` function previously accepted numeric type input and output double type results. Now it supports date type, timestamp type and numeric types as input types. The result type is also changed to be the same as the input type, which is more reasonable for percentiles.
   - Since Spark 2.3, the Join/Filter's deterministic predicates that are after the first non-deterministic predicates are also pushed down/through the child operators, if possible. In prior Spark versions, these filters were not eligible for predicate pushdown.
   - Partition column inference previously found incorrect common type for different inferred types, for example, previously it ended up with double type as the common type for double type and date type. Now it finds the correct common type for such conflicts.
 The conflict resolution follows the table below:
From 966e1ea9cc918bb51cd11f4d15b97d588537885f Mon Sep 17 00:00:00 2001
From: Dilip Biswal
Date: Mon, 17 Sep 2018 14:01:46 -0700
Subject: [PATCH 6/8] fix1

---
 .../org/apache/spark/sql/DataFrameFunctionsSuite.scala | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 37522e5eb144..58bc9859d91d 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -766,13 +766,17 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
       Seq(Row(false), Row(false))
     )
 
+    checkAnswer(
+      df.selectExpr("array_contains(array(array(1)), array(1.23))"),
+      Seq(Row(false), Row(false))
+    )
     val e1 = intercept[AnalysisException] {
-      df.selectExpr("array_contains(array(1), 1.23)")
+      df.selectExpr("array_contains(array(1), .01234567890123456790123456780)")
     }
     val errorMsg1 =
       s"""
         |Input to function array_contains should have been array followed by a
-        |value with same element type, but it's [array<int>, decimal(3,2)].
+        |value with same element type, but it's [array<int>, decimal(29,29)].
       """.stripMargin.replace("\n", " ").trim()
     assert(e1.message.contains(errorMsg1))
 

From df5ea4768781ac82927128b8dfeefb5ab421ee14 Mon Sep 17 00:00:00 2001
From: Dilip Biswal
Date: Wed, 19 Sep 2018 21:56:45 -0700
Subject: [PATCH 7/8] Code review

---
 docs/sql-programming-guide.md | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index a4164397446d..d0b78e133da7 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1909,20 +1909,6 @@ working with timestamps in `pandas_udf`s to get the best performance, see
           In Spark 2.4, both left and right parameters are promoted to array(double) and double type respectively.
         </td>
       </tr>
-      <tr>
-        <td>
-          <code>SELECT <br> array_contains(array(1), 1.34);</code>
-        </td>
-        <td>
-          <code>true</code>
-        </td>
-        <td>
-          An AnalysisException is thrown since integer type cannot be promoted to decimal type in a lossless manner.
-        </td>
-        <td>
-          Users can use an explicit cast.
-        </td>
-      </tr>
       <tr>
         <td>
           <code>SELECT <br> array_contains(array(1), '1');</code>

From d79e9d46bca28c721887625b89814e91e923e7ca Mon Sep 17 00:00:00 2001
From: Dilip Biswal
Date: Wed, 19 Sep 2018 23:00:37 -0700
Subject: [PATCH 8/8] Code review

---
 docs/sql-programming-guide.md                |  2 +-
 .../spark/sql/DataFrameFunctionsSuite.scala  | 33 +++++++++----------
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index d0b78e133da7..d2e3ee3e7781 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1906,7 +1906,7 @@ working with timestamps in `pandas_udf`s to get the best performance, see
           <code>false</code>
         </td>
         <td>
-          In Spark 2.4, both left and right parameters are promoted to array(double) and double type respectively.
+          In Spark 2.4, left and right parameters are promoted to array(double) and double type respectively.
         </td>
       </tr>

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 58bc9859d91d..ad52fd01248e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -26,6 +26,7 @@ import scala.util.Random
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Expression
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation
 import org.apache.spark.sql.catalyst.util.DateTimeTestUtils
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
@@ -737,41 +738,37 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
     )
 
     checkAnswer(
-      df.selectExpr("array_contains(array(1), 1.23D)"),
-      Seq(Row(false), Row(false))
+      OneRowRelation().selectExpr("array_contains(array(1), 1.23D)"),
+      Seq(Row(false))
     )
 
     checkAnswer(
-      df.selectExpr("array_contains(array(1), 1.0D)"),
-      Seq(Row(true), Row(true))
+      OneRowRelation().selectExpr("array_contains(array(1), 1.0D)"),
+      Seq(Row(true))
     )
 
     checkAnswer(
-      df.selectExpr("array_contains(array(1.0D), 1)"),
-      Seq(Row(true), Row(true))
+      OneRowRelation().selectExpr("array_contains(array(1.0D), 1)"),
+      Seq(Row(true))
    )
 
     checkAnswer(
-      df.selectExpr("array_contains(array(1.23D), 1)"),
-      Seq(Row(false), Row(false))
+      OneRowRelation().selectExpr("array_contains(array(1.23D), 1)"),
+      Seq(Row(false))
    )
 
     checkAnswer(
-      df.selectExpr("array_contains(array(array(1)), array(1.0D))"),
-      Seq(Row(true), Row(true))
+      OneRowRelation().selectExpr("array_contains(array(array(1)), array(1.0D))"),
+      Seq(Row(true))
    )
 
     checkAnswer(
-      df.selectExpr("array_contains(array(array(1)), array(1.23D))"),
-      Seq(Row(false), Row(false))
+      OneRowRelation().selectExpr("array_contains(array(array(1)), array(1.23D))"),
+      Seq(Row(false))
    )
 
-    checkAnswer(
-      df.selectExpr("array_contains(array(array(1)), array(1.23))"),
-      Seq(Row(false), Row(false))
-    )
     val e1 = intercept[AnalysisException] {
-      df.selectExpr("array_contains(array(1), .01234567890123456790123456780)")
+      OneRowRelation().selectExpr("array_contains(array(1), .01234567890123456790123456780)")
     }
     val errorMsg1 =
       s"""
@@ -781,7 +778,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
     assert(e1.message.contains(errorMsg1))
 
     val e2 = intercept[AnalysisException] {
-      df.selectExpr("array_contains(array(1), 'foo')")
+      OneRowRelation().selectExpr("array_contains(array(1), 'foo')")
     }
     val errorMsg2 =
       s"""
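Taken together, the series replaces the old lossy cast of the right operand with tightest-common-type resolution. The sketch below models that rule outside of Spark; the type names and objects are simplified stand-ins, not Spark's actual `TypeCoercion` API:

```scala
// Self-contained sketch (not Spark's real classes) of the coercion rule this
// patch applies to array_contains: widen the array element type and the search
// value to their tightest common type, and fail analysis when none exists.
sealed trait DataType
case object IntType extends DataType
case object LongType extends DataType
case object DoubleType extends DataType
case object StringType extends DataType
final case class ArrayType(elementType: DataType) extends DataType

object ArrayContainsCoercion {
  // Numeric widening order; anything else never widens implicitly here,
  // mirroring how the patch refuses lossy promotions.
  private val numericPrecedence: List[DataType] = List(IntType, LongType, DoubleType)

  def findTightestCommonType(a: DataType, b: DataType): Option[DataType] = (a, b) match {
    case _ if a == b => Some(a)
    case _ if numericPrecedence.contains(a) && numericPrecedence.contains(b) =>
      // Widen to whichever type appears later in the precedence list.
      Some(if (numericPrecedence.indexOf(a) >= numericPrecedence.indexOf(b)) a else b)
    case (ArrayType(e1), ArrayType(e2)) =>
      findTightestCommonType(e1, e2).map(ArrayType)
    case _ => None // e.g. int vs. string: no lossless promotion exists
  }

  // Returns the coerced (array, value) input types, analogous to the new inputTypes,
  // or an error message analogous to the new TypeCheckFailure.
  def coerceInputs(left: ArrayType, right: DataType): Either[String, (ArrayType, DataType)] =
    findTightestCommonType(left.elementType, right) match {
      case Some(dt) => Right((ArrayType(dt), dt))
      case None => Left(
        s"Input to function array_contains should have been array followed by a " +
          s"value with same element type, but it's [$left, $right].")
    }
}

object CoercionDemo extends App {
  println(ArrayContainsCoercion.coerceInputs(ArrayType(IntType), DoubleType))
  // Right((ArrayType(DoubleType),DoubleType)): both sides compared as double
  println(ArrayContainsCoercion.coerceInputs(ArrayType(IntType), StringType))
  // Left(Input to function array_contains ...): analysis fails, as in the new tests
}
```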