-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-24395][SQL] IN operator should return NULL when comparing struct with NULL fields #22029
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
54ee21a
5974aea
9b28842
655eaa4
062c9fd
3b00217
cf6b3b0
59ff46b
321b9a8
1933e9d
809e80c
389e6de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,9 +22,11 @@ import scala.collection.immutable.TreeSet | |
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.catalyst.analysis.TypeCheckResult | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral, GenerateSafeProjection, GenerateUnsafeProjection, Predicate => BasePredicate} | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.Block | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.Block._ | ||
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
| import org.apache.spark.sql.catalyst.util.TypeUtils | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
|
|
||
|
|
@@ -202,7 +204,11 @@ case class InSubquery(values: Seq[Expression], query: ListQuery) | |
| */ | ||
| // scalastyle:off line.size.limit | ||
| @ExpressionDescription( | ||
| usage = "expr1 _FUNC_(expr2, expr3, ...) - Returns true if `expr` equals to any valN.", | ||
| usage = """ | ||
| expr1 _FUNC_(expr2, expr3, ...) - Returns true if `expr` equals to any valN. Otherwise, if | ||
| spark.sql.legacy.inOperator.falseForNullField is false and any of the elements or fields of | ||
| the elements is null it returns null, else it returns false. | ||
|
||
| """, | ||
| arguments = """ | ||
| Arguments: | ||
| * expr1, expr2, expr3, ... - the arguments must be same type. | ||
|
|
@@ -238,20 +244,25 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { | |
| lazy val inSetConvertible = list.forall(_.isInstanceOf[Literal]) | ||
| private lazy val ordering = TypeUtils.getInterpretedOrdering(value.dataType) | ||
|
|
||
| override def nullable: Boolean = children.exists(_.nullable) | ||
| override def nullable: Boolean = value.dataType match { | ||
| case _: StructType if !SQLConf.get.inFalseForNullField => | ||
| children.exists(_.nullable) || | ||
| children.exists(_.dataType.asInstanceOf[StructType].exists(_.nullable)) | ||
|
||
| case _ => children.exists(_.nullable) | ||
| } | ||
| override def foldable: Boolean = children.forall(_.foldable) | ||
|
|
||
| override def toString: String = s"$value IN ${list.mkString("(", ",", ")")}" | ||
|
|
||
| override def eval(input: InternalRow): Any = { | ||
| val evaluatedValue = value.eval(input) | ||
| if (evaluatedValue == null) { | ||
| if (checkNullEval(evaluatedValue)) { | ||
| null | ||
| } else { | ||
| var hasNull = false | ||
| list.foreach { e => | ||
| val v = e.eval(input) | ||
| if (v == null) { | ||
| if (checkNullEval(v)) { | ||
| hasNull = true | ||
| } else if (ordering.equiv(v, evaluatedValue)) { | ||
| return true | ||
|
|
@@ -265,6 +276,18 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { | |
| } | ||
| } | ||
|
|
||
| @transient lazy val checkNullGenCode: (ExprCode) => Block = value.dataType match { | ||
| case _: StructType if !SQLConf.get.inFalseForNullField => | ||
| e => code"${e.isNull} || ${e.value}.anyNull()" | ||
| case _ => e => code"${e.isNull}" | ||
| } | ||
|
|
||
| @transient lazy val checkNullEval: (Any) => Boolean = value.dataType match { | ||
| case _: StructType if !SQLConf.get.inFalseForNullField => | ||
| input => input == null || input.asInstanceOf[InternalRow].anyNull | ||
| case _ => input => input == null | ||
| } | ||
|
|
||
| override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
| val javaDataType = CodeGenerator.javaType(value.dataType) | ||
| val valueGen = value.genCode(ctx) | ||
|
|
@@ -283,7 +306,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { | |
| val listCode = listGen.map(x => | ||
| s""" | ||
| |${x.code} | ||
| |if (${x.isNull}) { | ||
| |if (${checkNullGenCode(x)}) { | ||
| | $tmpResult = $HAS_NULL; // ${ev.isNull} = true; | ||
| |} else if (${ctx.genEqual(value.dataType, valueArg, x.value)}) { | ||
| | $tmpResult = $MATCHED; // ${ev.isNull} = false; ${ev.value} = true; | ||
|
|
@@ -316,7 +339,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { | |
| code""" | ||
| |${valueGen.code} | ||
| |byte $tmpResult = $HAS_NULL; | ||
| |if (!${valueGen.isNull}) { | ||
| |if (!(${checkNullGenCode(valueGen)})) { | ||
| | $tmpResult = $NOT_MATCHED; | ||
| | $javaDataType $valueArg = ${valueGen.value}; | ||
| | do { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1561,6 +1561,16 @@ object SQLConf { | |
| .booleanConf | ||
| .createWithDefault(false) | ||
|
|
||
| val LEGACY_IN_FALSE_FOR_NULL_FIELD = | ||
| buildConf("spark.sql.legacy.inOperator.falseForNullField") | ||
| .internal() | ||
| .doc("When set to true, the IN operator returns false when comparing literal structs " + | ||
| "containing a null field. When set to false (default), it returns null, instead. This is " + | ||
| "important especially when using NOT IN as in the second case, it filters out the rows " + | ||
| "when a null is present in a filed; while in the first one, those rows are returned.") | ||
| .booleanConf | ||
| .createWithDefault(true) | ||
|
||
|
|
||
| val LEGACY_INTEGRALDIVIDE_RETURN_LONG = buildConf("spark.sql.legacy.integralDivide.returnBigint") | ||
| .doc("If it is set to true, the div operator returns always a bigint. This behavior was " + | ||
| "inherited from Hive. Otherwise, the return type is the data type of the operands.") | ||
|
|
@@ -1978,6 +1988,8 @@ class SQLConf extends Serializable with Logging { | |
|
|
||
| def setOpsPrecedenceEnforced: Boolean = getConf(SQLConf.LEGACY_SETOPS_PRECEDENCE_ENABLED) | ||
|
|
||
| def inFalseForNullField: Boolean = getConf(SQLConf.LEGACY_IN_FALSE_FOR_NULL_FIELD) | ||
|
|
||
| def integralDivideReturnLong: Boolean = getConf(SQLConf.LEGACY_INTEGRALDIVIDE_RETURN_LONG) | ||
|
|
||
| /** ********************** SQLConf functionality methods ************ */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,8 +14,25 @@ CREATE TEMPORARY VIEW m AS SELECT * FROM VALUES | |
| -- Case 1 (not possible to write a literal with no rows, so we ignore it.) | ||
| -- (subquery is empty -> row is returned) | ||
|
|
||
| -- Cases 2, 3 and 4 are currently broken, so I have commented them out here. | ||
| -- Filed https://issues.apache.org/jira/browse/SPARK-24395 to fix and restore these test cases. | ||
| -- Case 2 | ||
| -- (subquery contains a row with null in all columns -> row not returned) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| SELECT * | ||
| FROM m | ||
| WHERE (a, b) NOT IN ((CAST (null AS INT), CAST (null AS DECIMAL(2, 1)))); | ||
|
|
||
| -- Case 3 | ||
| -- (probe-side columns are all null -> row not returned) | ||
| SELECT * | ||
| FROM m | ||
| WHERE a IS NULL AND b IS NULL -- Matches only (null, null) | ||
| AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))); | ||
|
|
||
| -- Case 4 | ||
| -- (one column null, other column matches a row in the subquery result -> row not returned) | ||
| SELECT * | ||
| FROM m | ||
| WHERE b = 1.0 -- Matches (null, 1.0) | ||
| AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))); | ||
|
|
||
| -- Case 5 | ||
| -- (one null column with no match -> row is returned) | ||
|
|
@@ -37,3 +54,26 @@ SELECT * | |
| FROM m | ||
| WHERE b = 5.0 -- Matches (4, 5.0) | ||
| AND (a, b) NOT IN ((2, 3.0)); | ||
|
|
||
|
|
||
| set spark.sql.legacy.inOperator.falseForNullField=true; | ||
|
|
||
| -- Case 2 (old behavior) | ||
| -- (subquery contains a row with null in all columns -> rows returned) | ||
| SELECT * | ||
| FROM m | ||
| WHERE (a, b) NOT IN ((CAST (null AS INT), CAST (null AS DECIMAL(2, 1)))); | ||
|
|
||
| -- Case 3 (old behavior) | ||
| -- (probe-side columns are all null -> row returned) | ||
| SELECT * | ||
| FROM m | ||
| WHERE a IS NULL AND b IS NULL -- Matches only (null, null) | ||
| AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))); | ||
|
|
||
| -- Case 4 (old behavior) | ||
| -- (one column null, other column matches a row in the subquery result -> row returned) | ||
| SELECT * | ||
| FROM m | ||
| WHERE b = 1.0 -- Matches (null, 1.0) | ||
| AND (a, b) NOT IN ((0, 1.0), (2, 3.0), (4, CAST(null AS DECIMAL(2, 1)))); | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
any of the elements or fields ...We should explicitly mention multi-column IN, which is different from
a in (b, c, ...)whileais struct type.