Skip to content

Commit ee1c1f3

Browse files
scwfmarmbrus
authored andcommitted
[SPARK-4937][SQL] Adding optimization to simplify the And, Or condition in spark sql
Adding optimization to simplify the And/Or condition in spark sql. There are two kinds of Optimization 1 Numeric condition optimization, such as: a < 3 && a > 5 ---- False a < 1 || a > 0 ---- True a > 3 && a > 5 => a > 5 (a < 2 || b > 5) && a < 2 => a < 2 2 optimizing the some query from a cartesian product into equi-join, such as this sql (one of hive-testbench): ``` select sum(l_extendedprice* (1 - l_discount)) as revenue from lineitem, part where ( p_partkey = l_partkey and p_brand = 'Brand#32' and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG') and l_quantity >= 7 and l_quantity <= 7 + 10 and p_size between 1 and 5 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' ) or ( p_partkey = l_partkey and p_brand = 'Brand#35' and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK') and l_quantity >= 15 and l_quantity <= 15 + 10 and p_size between 1 and 10 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' ) or ( p_partkey = l_partkey and p_brand = 'Brand#24' and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG') and l_quantity >= 26 and l_quantity <= 26 + 10 and p_size between 1 and 15 and l_shipmode in ('AIR', 'AIR REG') and l_shipinstruct = 'DELIVER IN PERSON' ) ``` It has a repeated expression in Or, so we can optimize it by ``` (a && b) || (a && c) = a && (b || c)``` Before optimization, this sql hang in my locally test, and the physical plan is: ![image](https://cloud.githubusercontent.com/assets/7018048/5539175/31cf38e8-8af9-11e4-95e3-336f9b3da4a4.png) After optimization, this sql run successfully in 20+ seconds, and its physical plan is: ![image](https://cloud.githubusercontent.com/assets/7018048/5539176/39a558e0-8af9-11e4-912b-93de94b20075.png) This PR focus on the second optimization and some simple ones of the first. For complex Numeric condition optimization, I will make a follow up PR. Author: scwf <[email protected]> Author: wangfei <[email protected]> Closes #3778 from scwf/filter1 and squashes the following commits: 58bcbc2 [scwf] minor format fix 9570211 [scwf] conflicts fix 527e6ce [scwf] minor comment improvements 5c6f134 [scwf] remove numeric optimizations and move to BooleanSimplification 546a82b [wangfei] style fix 825fa69 [wangfei] adding more tests a001e8c [wangfei] revert pom changes 32a595b [scwf] improvement and test fix e99a26c [wangfei] refactory And/Or optimization to make it more readable and clean
1 parent fd3a8a1 commit ee1c1f3

File tree

3 files changed

+131
-90
lines changed

3 files changed

+131
-90
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 39 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -294,13 +294,10 @@ object OptimizeIn extends Rule[LogicalPlan] {
294294

295295
/**
296296
* Simplifies boolean expressions:
297-
*
298297
* 1. Simplifies expressions whose answer can be determined without evaluating both sides.
299298
* 2. Eliminates / extracts common factors.
300-
* 3. Removes `Not` operator.
301-
*
302-
* Note that this rule can eliminate expressions that might otherwise have been evaluated and thus
303-
* is only safe when evaluations of expressions does not result in side effects.
299+
* 3. Merge same expressions
300+
* 4. Removes `Not` operator.
304301
*/
305302
object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper {
306303
def apply(plan: LogicalPlan): LogicalPlan = plan transform {
@@ -311,9 +308,26 @@ object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper {
311308
case (l, Literal(true, BooleanType)) => l
312309
case (Literal(false, BooleanType), _) => Literal(false)
313310
case (_, Literal(false, BooleanType)) => Literal(false)
314-
// a && a && a ... => a
315-
case _ if splitConjunctivePredicates(and).distinct.size == 1 => left
316-
case _ => and
311+
// a && a => a
312+
case (l, r) if l fastEquals r => l
313+
case (_, _) =>
314+
val lhsSet = splitDisjunctivePredicates(left).toSet
315+
val rhsSet = splitDisjunctivePredicates(right).toSet
316+
val common = lhsSet.intersect(rhsSet)
317+
val ldiff = lhsSet.diff(common)
318+
val rdiff = rhsSet.diff(common)
319+
if (ldiff.size == 0 || rdiff.size == 0) {
320+
// a && (a || b)
321+
common.reduce(Or)
322+
} else {
323+
// (a || b || c || ...) && (a || b || d || ...) && (a || b || e || ...) ... =>
324+
// (a || b) || ((c || ...) && (f || ...) && (e || ...) && ...)
325+
(ldiff.reduceOption(Or) ++ rdiff.reduceOption(Or))
326+
.reduceOption(And)
327+
.map(_ :: common.toList)
328+
.getOrElse(common.toList)
329+
.reduce(Or)
330+
}
317331
}
318332

319333
case or @ Or(left, right) =>
@@ -322,19 +336,26 @@ object BooleanSimplification extends Rule[LogicalPlan] with PredicateHelper {
322336
case (_, Literal(true, BooleanType)) => Literal(true)
323337
case (Literal(false, BooleanType), r) => r
324338
case (l, Literal(false, BooleanType)) => l
325-
// a || a || a ... => a
326-
case _ if splitDisjunctivePredicates(or).distinct.size == 1 => left
327-
// (a && b && c && ...) || (a && b && d && ...) => a && b && (c || d || ...)
328-
case _ =>
339+
// a || a => a
340+
case (l, r) if l fastEquals r => l
341+
case (_, _) =>
329342
val lhsSet = splitConjunctivePredicates(left).toSet
330343
val rhsSet = splitConjunctivePredicates(right).toSet
331344
val common = lhsSet.intersect(rhsSet)
332-
333-
(lhsSet.diff(common).reduceOption(And) ++ rhsSet.diff(common).reduceOption(And))
334-
.reduceOption(Or)
335-
.map(_ :: common.toList)
336-
.getOrElse(common.toList)
337-
.reduce(And)
345+
val ldiff = lhsSet.diff(common)
346+
val rdiff = rhsSet.diff(common)
347+
if ( ldiff.size == 0 || rdiff.size == 0) {
348+
// a || (b && a)
349+
common.reduce(And)
350+
} else {
351+
// (a && b && c && ...) || (a && b && d && ...) || (a && b && e && ...) ... =>
352+
// a && b && ((c && ...) || (d && ...) || (e && ...) || ...)
353+
(ldiff.reduceOption(And) ++ rdiff.reduceOption(And))
354+
.reduceOption(Or)
355+
.map(_ :: common.toList)
356+
.getOrElse(common.toList)
357+
.reduce(And)
358+
}
338359
}
339360

340361
case not @ Not(exp) =>
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.spark.sql.catalyst.optimizer
19+
20+
import org.apache.spark.sql.catalyst.analysis.EliminateAnalysisOperators
21+
import org.apache.spark.sql.catalyst.expressions.{Literal, Expression}
22+
import org.apache.spark.sql.catalyst.plans.logical._
23+
import org.apache.spark.sql.catalyst.plans.PlanTest
24+
import org.apache.spark.sql.catalyst.rules._
25+
import org.apache.spark.sql.catalyst.dsl.plans._
26+
import org.apache.spark.sql.catalyst.dsl.expressions._
27+
28+
class BooleanSimplificationSuite extends PlanTest {
29+
30+
object Optimize extends RuleExecutor[LogicalPlan] {
31+
val batches =
32+
Batch("AnalysisNodes", Once,
33+
EliminateAnalysisOperators) ::
34+
Batch("Constant Folding", FixedPoint(50),
35+
NullPropagation,
36+
ConstantFolding,
37+
BooleanSimplification,
38+
SimplifyFilters) :: Nil
39+
}
40+
41+
val testRelation = LocalRelation('a.int, 'b.int, 'c.int, 'd.string)
42+
43+
def checkCondition(originCondition: Expression, optimizedCondition: Expression): Unit = {
44+
val originQuery = testRelation.where(originCondition).analyze
45+
val optimized = Optimize(originQuery)
46+
val expected = testRelation.where(optimizedCondition).analyze
47+
comparePlans(optimized, expected)
48+
}
49+
50+
test("a && a => a") {
51+
checkCondition(Literal(1) < 'a && Literal(1) < 'a, Literal(1) < 'a)
52+
checkCondition(Literal(1) < 'a && Literal(1) < 'a && Literal(1) < 'a, Literal(1) < 'a)
53+
}
54+
55+
test("a || a => a") {
56+
checkCondition(Literal(1) < 'a || Literal(1) < 'a, Literal(1) < 'a)
57+
checkCondition(Literal(1) < 'a || Literal(1) < 'a || Literal(1) < 'a, Literal(1) < 'a)
58+
}
59+
60+
test("(a && b && c && ...) || (a && b && d && ...) || (a && b && e && ...) ...") {
61+
checkCondition('b > 3 || 'c > 5, 'b > 3 || 'c > 5)
62+
63+
checkCondition(('a < 2 && 'a > 3 && 'b > 5) || 'a < 2, 'a < 2)
64+
65+
checkCondition('a < 2 || ('a < 2 && 'a > 3 && 'b > 5), 'a < 2)
66+
67+
val input = ('a === 'b && 'b > 3 && 'c > 2) ||
68+
('a === 'b && 'c < 1 && 'a === 5) ||
69+
('a === 'b && 'b < 5 && 'a > 1)
70+
71+
val expected =
72+
(((('b > 3) && ('c > 2)) ||
73+
(('c < 1) && ('a === 5))) ||
74+
(('b < 5) && ('a > 1))) && ('a === 'b)
75+
checkCondition(input, expected)
76+
77+
}
78+
79+
test("(a || b || c || ...) && (a || b || d || ...) && (a || b || e || ...) ...") {
80+
checkCondition('b > 3 && 'c > 5, 'b > 3 && 'c > 5)
81+
82+
checkCondition(('a < 2 || 'a > 3 || 'b > 5) && 'a < 2, 'a < 2)
83+
84+
checkCondition('a < 2 && ('a < 2 || 'a > 3 || 'b > 5) , 'a < 2)
85+
86+
checkCondition(('a < 2 || 'b > 3) && ('a < 2 || 'c > 5), ('b > 3 && 'c > 5) || 'a < 2)
87+
88+
var input: Expression = ('a === 'b || 'b > 3) && ('a === 'b || 'a > 3) && ('a === 'b || 'a < 5)
89+
var expected: Expression = ('b > 3 && 'a > 3 && 'a < 5) || 'a === 'b
90+
checkCondition(input, expected)
91+
}
92+
}

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFiltersSuite.scala

Lines changed: 0 additions & 72 deletions
This file was deleted.

0 commit comments

Comments
 (0)