diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md
index c76f2e30e677..d2e3ee3e7781 100644
--- a/docs/sql-programming-guide.md
+++ b/docs/sql-programming-guide.md
@@ -1879,6 +1879,66 @@ working with timestamps in `pandas_udf`s to get the best performance, see

 ## Upgrading From Spark SQL 2.3 to 2.4

+  - In Spark version 2.3 and earlier, the second parameter of the `array_contains` function is implicitly promoted to the element type of the first, array-type parameter. This type promotion can be lossy and may cause `array_contains` to return a wrong result. This problem has been addressed in 2.4 by employing a safer type promotion mechanism. The resulting changes in behavior are illustrated in the table below.
+    <table class="table">
+      <tr>
+        <th><b>Query</b></th>
+        <th><b>Result Spark 2.3 or Prior</b></th>
+        <th><b>Result Spark 2.4</b></th>
+        <th><b>Remarks</b></th>
+      </tr>
+      <tr>
+        <td><code>SELECT array_contains(array(1), 1.34D);</code></td>
+        <td><code>true</code></td>
+        <td><code>false</code></td>
+        <td>In Spark 2.4, the left and right parameters are promoted to array(double) and double type respectively.</td>
+      </tr>
+      <tr>
+        <td><code>SELECT array_contains(array(1), '1');</code></td>
+        <td><code>true</code></td>
+        <td><code>AnalysisException</code> is thrown since integer type cannot be promoted to string type in a loss-less manner.</td>
+        <td>Users can use an explicit cast.</td>
+      </tr>
+      <tr>
+        <td><code>SELECT array_contains(array(1), 'anystring');</code></td>
+        <td><code>null</code></td>
+        <td><code>AnalysisException</code> is thrown since integer type cannot be promoted to string type in a loss-less manner.</td>
+        <td>Users can use an explicit cast.</td>
+      </tr>
+    </table>
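To make the new semantics concrete, here is a minimal, hedged sketch that runs the queries from the table above through a local `SparkSession`, including the explicit-cast workaround the Remarks column suggests. The object name is illustrative and nothing here is part of the patch itself:

```scala
// Illustrative only: a local session exercising the migration-guide queries
// under the Spark 2.4 promotion rules.
import org.apache.spark.sql.SparkSession

object ArrayContainsMigrationDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("demo").getOrCreate()

    // 2.4 promotes array(1) to array<double>, so the comparison is exact:
    spark.sql("SELECT array_contains(array(1), 1.34D)").show() // false (true in 2.3)
    spark.sql("SELECT array_contains(array(1), 1.0D)").show()  // true

    // int vs string has no loss-less common type, so the string form now fails
    // analysis; an explicit cast restores a well-defined comparison:
    spark.sql("SELECT array_contains(array(1), CAST('1' AS INT))").show() // true

    spark.stop()
  }
}
```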
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 603f994dc959..8d849fa82289 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -1498,8 +1498,7 @@ def test_array_contains_function(self):
from pyspark.sql.functions import array_contains
df = self.spark.createDataFrame([(["1", "2", "3"],), ([],)], ['data'])
- actual = df.select(array_contains(df.data, 1).alias('b')).collect()
- # The value argument can be implicitly castable to the element's type of the array.
+ actual = df.select(array_contains(df.data, "1").alias('b')).collect()
self.assertEqual([Row(b=True), Row(b=False)], actual)
def test_between_function(self):
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
index cc9edcfd41d0..e23ebef9643f 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala
@@ -1331,23 +1331,27 @@ case class ArrayContains(left: Expression, right: Expression)
@transient private lazy val ordering: Ordering[Any] =
TypeUtils.getInterpretedOrdering(right.dataType)
- override def inputTypes: Seq[AbstractDataType] = right.dataType match {
- case NullType => Seq.empty
- case _ => left.dataType match {
- case n @ ArrayType(element, _) => Seq(n, element)
+ override def inputTypes: Seq[AbstractDataType] = {
+ (left.dataType, right.dataType) match {
+ case (_, NullType) => Seq.empty
+ case (ArrayType(e1, hasNull), e2) =>
+ TypeCoercion.findTightestCommonType(e1, e2) match {
+ case Some(dt) => Seq(ArrayType(dt, hasNull), dt)
+ case _ => Seq.empty
+ }
case _ => Seq.empty
}
}
override def checkInputDataTypes(): TypeCheckResult = {
- if (right.dataType == NullType) {
- TypeCheckResult.TypeCheckFailure("Null typed values cannot be used as arguments")
- } else if (!left.dataType.isInstanceOf[ArrayType]
- || !left.dataType.asInstanceOf[ArrayType].elementType.sameType(right.dataType)) {
- TypeCheckResult.TypeCheckFailure(
- "Arguments must be an array followed by a value of same type as the array members")
- } else {
- TypeUtils.checkForOrderingExpr(right.dataType, s"function $prettyName")
+ (left.dataType, right.dataType) match {
+ case (_, NullType) =>
+ TypeCheckResult.TypeCheckFailure("Null typed values cannot be used as arguments")
+ case (ArrayType(e1, _), e2) if e1.sameType(e2) =>
+ TypeUtils.checkForOrderingExpr(e2, s"function $prettyName")
+ case _ => TypeCheckResult.TypeCheckFailure(s"Input to function $prettyName should have " +
+ s"been ${ArrayType.simpleString} followed by a value with same element type, but it's " +
+ s"[${left.dataType.catalogString}, ${right.dataType.catalogString}].")
}
}
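The new `inputTypes` hinges on what `TypeCoercion.findTightestCommonType` returns for the two element types. Below is a minimal sketch of that rule, assuming spark-catalyst is on the classpath (e.g. in a spark-shell); the object name is illustrative, not part of this patch:

```scala
// Probes the loss-less promotion rule the new inputTypes relies on.
import org.apache.spark.sql.catalyst.analysis.TypeCoercion
import org.apache.spark.sql.types._

object TightestCommonTypeSketch {
  def main(args: Array[String]): Unit = {
    // int widens to double without loss, so inputTypes becomes
    // Seq(ArrayType(DoubleType), DoubleType):
    println(TypeCoercion.findTightestCommonType(IntegerType, DoubleType)) // Some(DoubleType)

    // int vs string has no loss-less common type, so inputTypes stays empty and
    // checkInputDataTypes reports the failure instead of silently casting:
    println(TypeCoercion.findTightestCommonType(IntegerType, StringType)) // None
  }
}
```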
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 121db442c77f..ad52fd01248e 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -26,6 +26,7 @@ import scala.util.Random
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
+import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation
import org.apache.spark.sql.catalyst.util.DateTimeTestUtils
import org.apache.spark.sql.functions._
import org.apache.spark.sql.internal.SQLConf
@@ -735,6 +736,56 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext {
df.selectExpr("array_contains(array(1, null), array(1, null)[0])"),
Seq(Row(true), Row(true))
)
+
+ checkAnswer(
+ OneRowRelation().selectExpr("array_contains(array(1), 1.23D)"),
+ Seq(Row(false))
+ )
+
+ checkAnswer(
+ OneRowRelation().selectExpr("array_contains(array(1), 1.0D)"),
+ Seq(Row(true))
+ )
+
+ checkAnswer(
+ OneRowRelation().selectExpr("array_contains(array(1.0D), 1)"),
+ Seq(Row(true))
+ )
+
+ checkAnswer(
+ OneRowRelation().selectExpr("array_contains(array(1.23D), 1)"),
+ Seq(Row(false))
+ )
+
+ checkAnswer(
+ OneRowRelation().selectExpr("array_contains(array(array(1)), array(1.0D))"),
+ Seq(Row(true))
+ )
+
+ checkAnswer(
+ OneRowRelation().selectExpr("array_contains(array(array(1)), array(1.23D))"),
+ Seq(Row(false))
+ )
+
+ val e1 = intercept[AnalysisException] {
+ OneRowRelation().selectExpr("array_contains(array(1), .01234567890123456790123456780)")
+ }
+ val errorMsg1 =
+ s"""
+ |Input to function array_contains should have been array followed by a
+ |value with same element type, but it's [array<int>, decimal(29,29)].
+ """.stripMargin.replace("\n", " ").trim()
+ assert(e1.message.contains(errorMsg1))
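For completeness, here is a hedged sketch of hitting the new `checkInputDataTypes` failure path directly on a hand-built expression. This uses catalyst-internal APIs, assumes spark-catalyst on the classpath, and the object name is illustrative:

```scala
// Builds array_contains(array(1), 'anystring') by hand and inspects the type check.
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.{ArrayContains, CreateArray, Literal}

object CheckInputDataTypesSketch {
  def main(args: Array[String]): Unit = {
    val expr = ArrayContains(CreateArray(Seq(Literal(1))), Literal("anystring"))
    expr.checkInputDataTypes() match {
      // With this patch the message names both catalog strings, e.g.
      // "... but it's [array<int>, string]."
      case TypeCheckResult.TypeCheckFailure(msg) => println(msg)
      case _ => println("type check passed")
    }
  }
}
```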