-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-24395][SQL] IN operator should return NULL when comparing struct with NULL fields #22029
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
54ee21a
5974aea
9b28842
655eaa4
062c9fd
3b00217
cf6b3b0
59ff46b
321b9a8
1933e9d
809e80c
389e6de
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,9 +22,11 @@ import scala.collection.immutable.TreeSet | |
| import org.apache.spark.sql.catalyst.InternalRow | ||
| import org.apache.spark.sql.catalyst.analysis.TypeCheckResult | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral, GenerateSafeProjection, GenerateUnsafeProjection, Predicate => BasePredicate} | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.Block | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.Block._ | ||
| import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
| import org.apache.spark.sql.catalyst.util.TypeUtils | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
|
|
||
|
|
@@ -138,21 +140,26 @@ case class Not(child: Expression) | |
| override def sql: String = s"(NOT ${child.sql})" | ||
| } | ||
|
|
||
| /** | ||
| * Evaluates to `true` if `values` are returned in `query`'s result set. | ||
| */ | ||
| case class InSubquery(values: Seq[Expression], query: ListQuery) | ||
| extends Predicate with Unevaluable { | ||
| abstract class InBase extends Predicate { | ||
| def values: Seq[Expression] | ||
|
|
||
| @transient protected lazy val isMultiValued = values.length > 1 | ||
|
|
||
| @transient private lazy val value: Expression = if (values.length > 1) { | ||
| @transient lazy val value: Expression = if (isMultiValued) { | ||
| CreateNamedStruct(values.zipWithIndex.flatMap { | ||
| case (v: NamedExpression, _) => Seq(Literal(v.name), v) | ||
| case (v, idx) => Seq(Literal(s"_$idx"), v) | ||
| }) | ||
| } else { | ||
| values.head | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Evaluates to `true` if `values` are returned in `query`'s result set. | ||
| */ | ||
| case class InSubquery(values: Seq[Expression], query: ListQuery) | ||
| extends InBase with Unevaluable { | ||
|
|
||
| override def checkInputDataTypes(): TypeCheckResult = { | ||
| if (values.length != query.childOutputs.length) { | ||
|
|
@@ -202,7 +209,11 @@ case class InSubquery(values: Seq[Expression], query: ListQuery) | |
| */ | ||
| // scalastyle:off line.size.limit | ||
| @ExpressionDescription( | ||
| usage = "expr1 _FUNC_(expr2, expr3, ...) - Returns true if `expr` equals to any valN.", | ||
| usage = """ | ||
| expr1 _FUNC_(expr2, expr3, ...) - Returns true if `expr` equals to any valN. Otherwise, if | ||
| spark.sql.legacy.inOperator.falseForNullField is false and any of the elements or fields of | ||
| the elements is null it returns null, else it returns false. | ||
|
||
| """, | ||
| arguments = """ | ||
| Arguments: | ||
| * expr1, expr2, expr3, ... - the arguments must be same type. | ||
|
|
@@ -219,7 +230,7 @@ case class InSubquery(values: Seq[Expression], query: ListQuery) | |
| true | ||
| """) | ||
| // scalastyle:on line.size.limit | ||
| case class In(value: Expression, list: Seq[Expression]) extends Predicate { | ||
| case class In(values: Seq[Expression], list: Seq[Expression]) extends InBase { | ||
|
|
||
| require(list != null, "list should not be null") | ||
|
|
||
|
|
@@ -234,24 +245,29 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { | |
| } | ||
| } | ||
|
|
||
| override def children: Seq[Expression] = value +: list | ||
| override def children: Seq[Expression] = values ++ list | ||
| lazy val inSetConvertible = list.forall(_.isInstanceOf[Literal]) | ||
| private lazy val ordering = TypeUtils.getInterpretedOrdering(value.dataType) | ||
|
|
||
| override def nullable: Boolean = children.exists(_.nullable) | ||
| override def nullable: Boolean = if (isMultiValued && !SQLConf.get.inFalseForNullField) { | ||
| children.exists(_.nullable) || | ||
| children.exists(_.dataType.asInstanceOf[StructType].exists(_.nullable)) | ||
| } else { | ||
| children.exists(_.nullable) | ||
| } | ||
| override def foldable: Boolean = children.forall(_.foldable) | ||
|
|
||
| override def toString: String = s"$value IN ${list.mkString("(", ",", ")")}" | ||
|
|
||
| override def eval(input: InternalRow): Any = { | ||
| val evaluatedValue = value.eval(input) | ||
| if (evaluatedValue == null) { | ||
| if (checkNullEval(evaluatedValue)) { | ||
| null | ||
| } else { | ||
| var hasNull = false | ||
| list.foreach { e => | ||
| val v = e.eval(input) | ||
| if (v == null) { | ||
| if (checkNullEval(v)) { | ||
| hasNull = true | ||
| } else if (ordering.equiv(v, evaluatedValue)) { | ||
| return true | ||
|
|
@@ -265,6 +281,22 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { | |
| } | ||
| } | ||
|
|
||
| @transient lazy val checkNullGenCode: (ExprCode) => Block = { | ||
| if (isMultiValued && !SQLConf.get.inFalseForNullField) { | ||
| e => code"${e.isNull} || ${e.value}.anyNull()" | ||
| } else { | ||
| e => code"${e.isNull}" | ||
| } | ||
| } | ||
|
|
||
| @transient lazy val checkNullEval: (Any) => Boolean = { | ||
| if (isMultiValued && !SQLConf.get.inFalseForNullField) { | ||
| input => input == null || input.asInstanceOf[InternalRow].anyNull | ||
| } else { | ||
| input => input == null | ||
| } | ||
| } | ||
|
|
||
| override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
| val javaDataType = CodeGenerator.javaType(value.dataType) | ||
| val valueGen = value.genCode(ctx) | ||
|
|
@@ -283,7 +315,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { | |
| val listCode = listGen.map(x => | ||
| s""" | ||
| |${x.code} | ||
| |if (${x.isNull}) { | ||
| |if (${checkNullGenCode(x)}) { | ||
| | $tmpResult = $HAS_NULL; // ${ev.isNull} = true; | ||
| |} else if (${ctx.genEqual(value.dataType, valueArg, x.value)}) { | ||
| | $tmpResult = $MATCHED; // ${ev.isNull} = false; ${ev.value} = true; | ||
|
|
@@ -316,7 +348,7 @@ case class In(value: Expression, list: Seq[Expression]) extends Predicate { | |
| code""" | ||
| |${valueGen.code} | ||
| |byte $tmpResult = $HAS_NULL; | ||
| |if (!${valueGen.isNull}) { | ||
| |if (!(${checkNullGenCode(valueGen)})) { | ||
| | $tmpResult = $NOT_MATCHED; | ||
| | $javaDataType $valueArg = ${valueGen.value}; | ||
| | do { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -212,27 +212,27 @@ object ReorderAssociativeOperator extends Rule[LogicalPlan] { | |
| * 1. Converts the predicate to false when the list is empty and | ||
| * the value is not nullable. | ||
| * 2. Removes literal repetitions. | ||
| * 3. Replaces [[In (value, seq[Literal])]] with optimized version | ||
| * 3. Replaces [[In (values, seq[Literal])]] with optimized version | ||
| * [[InSet (value, HashSet[Literal])]] which is much faster. | ||
| */ | ||
| object OptimizeIn extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| case q: LogicalPlan => q transformExpressionsDown { | ||
| case In(v, list) if list.isEmpty => | ||
| case i @ In(_, list) if list.isEmpty => | ||
| // When v is not nullable, the following expression will be optimized | ||
| // to FalseLiteral which is tested in OptimizeInSuite.scala | ||
| If(IsNotNull(v), FalseLiteral, Literal(null, BooleanType)) | ||
| case expr @ In(v, list) if expr.inSetConvertible => | ||
| If(IsNotNull(i.value), FalseLiteral, Literal(null, BooleanType)) | ||
|
||
| case expr @ In(_, list) if expr.inSetConvertible => | ||
| val newList = ExpressionSet(list).toSeq | ||
| if (newList.length == 1 | ||
| // TODO: `EqualTo` for structural types are not working. Until SPARK-24443 is addressed, | ||
| // TODO: we exclude them in this rule. | ||
| && !v.isInstanceOf[CreateNamedStructLike] | ||
| && !expr.value.isInstanceOf[CreateNamedStructLike] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to the implementation,
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, rigth
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shall we use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not really, because
Basically there are 2 cases: one where we have several attributes in the value before IN; the other when there is a single value before IN but the value is a struct.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IIUC, you mean Can you give an example? Based on my understanding, the code here is trying to optimize a case when it's not a multi-value in and the list has only one element.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, I mean that. An example is:
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for your case, it's not
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no, because of optimizations, it is a
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. well, I think for this case we should optimize it. Anyway it follows the previous behavior, we can change it later. |
||
| && !newList.head.isInstanceOf[CreateNamedStructLike]) { | ||
| EqualTo(v, newList.head) | ||
| EqualTo(expr.value, newList.head) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no, we do this only when
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. shall we update the match here? I think it should be
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no, sorry, we can't do that, otherwise we would skip the other possible optimizations here, eg. converting to What should be done, instead, is doing the same change to |
||
| } else if (newList.length > SQLConf.get.optimizerInSetConversionThreshold) { | ||
| val hSet = newList.map(e => e.eval(EmptyRow)) | ||
| InSet(v, HashSet() ++ hSet) | ||
| InSet(expr.value, HashSet() ++ hSet) | ||
| } else if (newList.length < list.length) { | ||
| expr.copy(list = newList) | ||
| } else { // newList.length == list.length && newList.length > 1 | ||
|
|
@@ -527,7 +527,7 @@ object NullPropagation extends Rule[LogicalPlan] { | |
| } | ||
|
|
||
| // If the value expression is NULL then transform the In expression to null literal. | ||
| case In(Literal(null, _), _) => Literal.create(null, BooleanType) | ||
| case In(Seq(Literal(null, _)), _) => Literal.create(null, BooleanType) | ||
| case InSubquery(Seq(Literal(null, _)), _) => Literal.create(null, BooleanType) | ||
|
|
||
| // Non-leaf NullIntolerant expressions will return null, if at least one of its children is | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1561,6 +1561,16 @@ object SQLConf { | |
| .booleanConf | ||
| .createWithDefault(false) | ||
|
|
||
| val LEGACY_IN_FALSE_FOR_NULL_FIELD = | ||
| buildConf("spark.sql.legacy.inOperator.falseForNullField") | ||
| .internal() | ||
| .doc("When set to true (default), the IN operator returns false when comparing literal " + | ||
mgaido91 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| "structs containing a null field. When set to false, it returns null, instead. This is " + | ||
| "important especially when using NOT IN as in the second case, it filters out the rows " + | ||
| "when a null is present in a filed; while in the first one, those rows are returned.") | ||
| .booleanConf | ||
| .createWithDefault(true) | ||
|
||
|
|
||
| val LEGACY_INTEGRALDIVIDE_RETURN_LONG = buildConf("spark.sql.legacy.integralDivide.returnBigint") | ||
| .doc("If it is set to true, the div operator returns always a bigint. This behavior was " + | ||
| "inherited from Hive. Otherwise, the return type is the data type of the operands.") | ||
|
|
@@ -1978,6 +1988,8 @@ class SQLConf extends Serializable with Logging { | |
|
|
||
| def setOpsPrecedenceEnforced: Boolean = getConf(SQLConf.LEGACY_SETOPS_PRECEDENCE_ENABLED) | ||
|
|
||
| def inFalseForNullField: Boolean = getConf(SQLConf.LEGACY_IN_FALSE_FOR_NULL_FIELD) | ||
|
|
||
| def integralDivideReturnLong: Boolean = getConf(SQLConf.LEGACY_INTEGRALDIVIDE_RETURN_LONG) | ||
|
|
||
| /** ********************** SQLConf functionality methods ************ */ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
any of the elements or fields ...We should explicitly mention multi-column IN, which is different from
a in (b, c, ...)whileais struct type.