-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-29343][SQL] Eliminate sorts without limit in the subquery of Join/Aggregation #26011
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
75b43f5
e29b323
25a3bb8
d21c683
d2328a0
a9e9be9
425f76d
9072db7
8082324
f2d9ec1
4eccd2a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -97,7 +97,6 @@ abstract class Optimizer(catalogManager: CatalogManager) | |
| SimplifyBinaryComparison, | ||
| ReplaceNullWithFalseInPredicate, | ||
| PruneFilters, | ||
| EliminateSorts, | ||
| SimplifyCasts, | ||
| SimplifyCaseConversionExpressions, | ||
| RewriteCorrelatedScalarSubquery, | ||
|
|
@@ -174,8 +173,8 @@ abstract class Optimizer(catalogManager: CatalogManager) | |
| // idempotence enforcement on this batch. We thus make it FixedPoint(1) instead of Once. | ||
| Batch("Join Reorder", FixedPoint(1), | ||
| CostBasedJoinReorder) :+ | ||
| Batch("Remove Redundant Sorts", Once, | ||
| RemoveRedundantSorts) :+ | ||
| Batch("Eliminate Sorts", Once, | ||
| EliminateSorts) :+ | ||
| Batch("Decimal Optimizations", fixedPoint, | ||
| DecimalAggregates) :+ | ||
| Batch("Object Expressions Optimization", fixedPoint, | ||
|
|
@@ -953,26 +952,25 @@ object CombineFilters extends Rule[LogicalPlan] with PredicateHelper { | |
| } | ||
|
|
||
| /** | ||
| * Removes no-op SortOrder from Sort | ||
| * Removes Sort operation. This can happen: | ||
| * 1) if the sort is noop | ||
| * 2) if the child is already sorted | ||
| * 3) if there is another Sort operator separated by 0...n Project/Filter operators | ||
| * 4) if the Sort operator is within Join and without Limit | ||
|
||
| * 5) if the Sort operator is within GroupBy and the aggregate function is order irrelevant | ||
| */ | ||
| object EliminateSorts extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| case s @ Sort(orders, _, child) if orders.isEmpty || orders.exists(_.child.foldable) => | ||
| val newOrders = orders.filterNot(_.child.foldable) | ||
| if (newOrders.isEmpty) child else s.copy(order = newOrders) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Removes redundant Sort operation. This can happen: | ||
| * 1) if the child is already sorted | ||
| * 2) if there is another Sort operator separated by 0...n Project/Filter operators | ||
| */ | ||
| object RemoveRedundantSorts extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan transformDown { | ||
| case Sort(orders, true, child) if SortOrder.orderingSatisfies(child.outputOrdering, orders) => | ||
| child | ||
| case s @ Sort(_, _, child) => s.copy(child = recursiveRemoveSort(child)) | ||
| case j @ Join(originLeft, originRight, _, cond, _) if cond.forall(_.deterministic) => | ||
| j.copy(left = recursiveRemoveSort(originLeft), right = recursiveRemoveSort(originRight)) | ||
| case g @ Aggregate(_, aggs, originChild) if isOrderIrrelevantAggs(aggs) => | ||
| g.copy(child = recursiveRemoveSort(originChild)) | ||
| } | ||
|
|
||
| def recursiveRemoveSort(plan: LogicalPlan): LogicalPlan = plan match { | ||
|
||
|
|
@@ -987,6 +985,24 @@ object RemoveRedundantSorts extends Rule[LogicalPlan] { | |
| case f: Filter => f.condition.deterministic | ||
| case _ => false | ||
| } | ||
|
|
||
| def isOrderIrrelevantAggs(aggs: Seq[NamedExpression]): Boolean = { | ||
|
||
| def isOrderIrrelevantAggFunction(func: AggregateFunction): Boolean = func match { | ||
| case _: Sum => true | ||
| case _: Min => true | ||
| case _: Max => true | ||
| case _: Count => true | ||
| case _: Average => true | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We could still have a precision difference after eliminating the sort for the floating point data type. I am afraid some end users might prefer to adding a sort in these cases to ensure the results are consistent?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ah that's a good point. AVG over floating values is order sensitive. Not sure if this can really affect queries in practice, but better to be conservative here. @WangGuangxin can you fix it in a followup?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, I'll fix it in a followup
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @WangGuangxin do you have no time for the follow-up now? Could I take this over?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same will be true of all central moments, BTW
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I revisit this and have a small question. In fact
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| case _: CentralMomentAgg => true | ||
| case _ => false | ||
| } | ||
|
|
||
| aggs.flatMap { e => | ||
|
||
| e.collect { | ||
| case ae: AggregateExpression => ae.aggregateFunction | ||
| } | ||
| }.forall(isOrderIrrelevantAggFunction) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this statement is a little ambiguous, so could you make it more precise?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated