Skip to content
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.expressions.{And, ArrayExists, ArrayFilter, CaseWhen, Expression, If}
import org.apache.spark.sql.catalyst.expressions.{LambdaFunction, Literal, MapFilter, Or}
import org.apache.spark.sql.catalyst.expressions.Literal.FalseLiteral
import org.apache.spark.sql.catalyst.plans.logical.{Filter, Join, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.types.BooleanType


/**
* A rule that replaces `Literal(null, BooleanType)` with `FalseLiteral`, if possible, in the search
* condition of the WHERE/HAVING/ON(JOIN) clauses, which contain an implicit Boolean operator
Copy link
Contributor

@aokolnychyi aokolnychyi Nov 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the scope of this rule is a bit bigger. For example, some higher-order functions, conditions of all If and CaseWhen expressions. Would it make sense to replace "in the search condition of the WHERE/HAVING/ON(JOIN) clauses" with "in predicates"?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The extra scope is covered by "Moreover, ..."

* "(search condition) = TRUE". The replacement is only valid when `Literal(null, BooleanType)` is
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure I understand "which contain an implicit Boolean operator "(search condition) = TRUE"". Could you, please, elaborate a bit?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is based on the ANSI SQL. All these clauses have the implicit Boolean operator "(search condition) = TRUE". That is why NULL and FALSE do not satisfy the condition in these clauses

* semantically equivalent to `FalseLiteral` when evaluating the whole search condition.
*
* Please note that FALSE and NULL are not exchangeable in most cases, when the search condition
* contains NOT and NULL-tolerant expressions. Thus, the rule is very conservative and applicable
* in very limited cases.
*
* For example, `Filter(Literal(null, BooleanType))` is equal to `Filter(FalseLiteral)`.
*
* Another example containing branches is `Filter(If(cond, FalseLiteral, Literal(null, _)))`;
* this can be optimized to `Filter(If(cond, FalseLiteral, FalseLiteral))`, and eventually
* `Filter(FalseLiteral)`.
*
* Moreover, this rule also transforms predicates in all [[If]] expressions as well as branch
* conditions in all [[CaseWhen]] expressions, even if they are not part of the search conditions.
*
* For example, `Project(If(And(cond, Literal(null)), Literal(1), Literal(2)))` can be simplified
* into `Project(Literal(2))`.
*/
object ReplaceNullWithFalseInPredicate extends Rule[LogicalPlan] {

def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case f @ Filter(cond, _) => f.copy(condition = replaceNullWithFalse(cond))
case j @ Join(_, _, _, Some(cond)) => j.copy(condition = Some(replaceNullWithFalse(cond)))
case p: LogicalPlan => p transformExpressions {
case i @ If(pred, _, _) => i.copy(predicate = replaceNullWithFalse(pred))
case cw @ CaseWhen(branches, _) =>
val newBranches = branches.map { case (cond, value) =>
replaceNullWithFalse(cond) -> value
}
cw.copy(branches = newBranches)
case af @ ArrayFilter(_, lf @ LambdaFunction(func, _, _)) =>
val newLambda = lf.copy(function = replaceNullWithFalse(func))
af.copy(function = newLambda)
case ae @ ArrayExists(_, lf @ LambdaFunction(func, _, _)) =>
val newLambda = lf.copy(function = replaceNullWithFalse(func))
ae.copy(function = newLambda)
case mf @ MapFilter(_, lf @ LambdaFunction(func, _, _)) =>
val newLambda = lf.copy(function = replaceNullWithFalse(func))
mf.copy(function = newLambda)
}
}

/**
* Recursively traverse the Boolean-type expression to replace
* `Literal(null, BooleanType)` with `FalseLiteral`, if possible.
*
* Note that `transformExpressionsDown` can not be used here as we must stop as soon as we hit
* an expression that is not [[CaseWhen]], [[If]], [[And]], [[Or]] or
* `Literal(null, BooleanType)`.
*/
private def replaceNullWithFalse(e: Expression): Expression = {
if (e.dataType != BooleanType) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need this? And, Or, If all return boolean, and we already requires boolean type for literal case.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about the LambdaFunction? My major concern is the future changes might forget to add it?

Copy link
Contributor

@cloud-fan cloud-fan Nov 26, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't handle LambdaFunction inside this method, it's caller side.

This method is to deal with optimizable boolean expressions, and return the original expression if it's not: https://github.com/apache/spark/pull/23139/files#diff-0bb4fc0a3c867b855f84dd1db8867139R103

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See this line https://github.com/apache/spark/pull/23139/files/e41681096867cbc6d2556da83ce733092d6df841#diff-a1acb054bc8888376603ef510e6d0ee0

My major concern is we should not completely rely on the caller to ensure the data type is Boolean. In the future, the new code changes might not completely follow our current assumption.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Had an offline discussion with @rednaxelafx . We can issue an exception instead of silently bypass it.

e
} else {
e match {
case Literal(null, BooleanType) =>
FalseLiteral
case And(left, right) =>
And(replaceNullWithFalse(left), replaceNullWithFalse(right))
case Or(left, right) =>
Or(replaceNullWithFalse(left), replaceNullWithFalse(right))
case cw: CaseWhen =>
val newBranches = cw.branches.map { case (cond, value) =>
replaceNullWithFalse(cond) -> replaceNullWithFalse(value)
}
val newElseValue = cw.elseValue.map(replaceNullWithFalse)
CaseWhen(newBranches, newElseValue)
case If(pred, trueVal, falseVal) =>
If(replaceNullWithFalse(pred),
replaceNullWithFalse(trueVal),
replaceNullWithFalse(falseVal))
case _ => e
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -736,69 +736,3 @@ object CombineConcats extends Rule[LogicalPlan] {
flattenConcats(concat)
}
}

/**
* A rule that replaces `Literal(null, _)` with `FalseLiteral` for further optimizations.
*
* This rule applies to conditions in [[Filter]] and [[Join]]. Moreover, it transforms predicates
* in all [[If]] expressions as well as branch conditions in all [[CaseWhen]] expressions.
*
* For example, `Filter(Literal(null, _))` is equal to `Filter(FalseLiteral)`.
*
* Another example containing branches is `Filter(If(cond, FalseLiteral, Literal(null, _)))`;
* this can be optimized to `Filter(If(cond, FalseLiteral, FalseLiteral))`, and eventually
* `Filter(FalseLiteral)`.
*
* As this rule is not limited to conditions in [[Filter]] and [[Join]], arbitrary plans can
* benefit from it. For example, `Project(If(And(cond, Literal(null)), Literal(1), Literal(2)))`
* can be simplified into `Project(Literal(2))`.
*
* As a result, many unnecessary computations can be removed in the query optimization phase.
*/
object ReplaceNullWithFalseInPredicate extends Rule[LogicalPlan] {

def apply(plan: LogicalPlan): LogicalPlan = plan transform {
case f @ Filter(cond, _) => f.copy(condition = replaceNullWithFalse(cond))
case j @ Join(_, _, _, Some(cond)) => j.copy(condition = Some(replaceNullWithFalse(cond)))
case p: LogicalPlan => p transformExpressions {
case i @ If(pred, _, _) => i.copy(predicate = replaceNullWithFalse(pred))
case cw @ CaseWhen(branches, _) =>
val newBranches = branches.map { case (cond, value) =>
replaceNullWithFalse(cond) -> value
}
cw.copy(branches = newBranches)
case af @ ArrayFilter(_, lf @ LambdaFunction(func, _, _)) =>
val newLambda = lf.copy(function = replaceNullWithFalse(func))
af.copy(function = newLambda)
case ae @ ArrayExists(_, lf @ LambdaFunction(func, _, _)) =>
val newLambda = lf.copy(function = replaceNullWithFalse(func))
ae.copy(function = newLambda)
case mf @ MapFilter(_, lf @ LambdaFunction(func, _, _)) =>
val newLambda = lf.copy(function = replaceNullWithFalse(func))
mf.copy(function = newLambda)
}
}

/**
* Recursively replaces `Literal(null, _)` with `FalseLiteral`.
*
* Note that `transformExpressionsDown` can not be used here as we must stop as soon as we hit
* an expression that is not [[CaseWhen]], [[If]], [[And]], [[Or]] or `Literal(null, _)`.
*/
private def replaceNullWithFalse(e: Expression): Expression = e match {
case cw: CaseWhen if cw.dataType == BooleanType =>
val newBranches = cw.branches.map { case (cond, value) =>
replaceNullWithFalse(cond) -> replaceNullWithFalse(value)
}
val newElseValue = cw.elseValue.map(replaceNullWithFalse)
CaseWhen(newBranches, newElseValue)
case i @ If(pred, trueVal, falseVal) if i.dataType == BooleanType =>
If(replaceNullWithFalse(pred), replaceNullWithFalse(trueVal), replaceNullWithFalse(falseVal))
case And(left, right) =>
And(replaceNullWithFalse(left), replaceNullWithFalse(right))
case Or(left, right) =>
Or(replaceNullWithFalse(left), replaceNullWithFalse(right))
case Literal(null, _) => FalseLiteral
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about this line? What happened if the input data type of e is not Boolean?

case _ => e
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ class ReplaceNullWithFalseInPredicateSuite extends PlanTest {
private val anotherTestRelation = LocalRelation('d.int)

test("replace null inside filter and join conditions") {
testFilter(originalCond = Literal(null), expectedCond = FalseLiteral)
testJoin(originalCond = Literal(null), expectedCond = FalseLiteral)
testFilter(originalCond = Literal(null, BooleanType), expectedCond = FalseLiteral)
testJoin(originalCond = Literal(null, BooleanType), expectedCond = FalseLiteral)
}

test("replace null in branches of If") {
Expand Down