-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-18609][SPARK-18841][SQL] Fix redundant Alias removal in the optimizer #16757
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6c89a15
dac7ec9
6aad5d8
81f2fa5
acbb9e0
3103d1b
23743e1
29c4696
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -110,7 +110,8 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: CatalystConf) | |
| SimplifyCaseConversionExpressions, | ||
| RewriteCorrelatedScalarSubquery, | ||
| EliminateSerialization, | ||
| RemoveAliasOnlyProject, | ||
| RemoveRedundantAliases, | ||
| RemoveRedundantProject, | ||
| SimplifyCreateStructOps, | ||
| SimplifyCreateArrayOps, | ||
| SimplifyCreateMapOps) :: | ||
|
|
@@ -157,56 +158,98 @@ class SimpleTestOptimizer extends Optimizer( | |
| new SimpleCatalystConf(caseSensitiveAnalysis = true)) | ||
|
|
||
| /** | ||
| * Removes the Project only conducting Alias of its child node. | ||
| * It is created mainly for removing extra Project added in EliminateSerialization rule, | ||
| * but can also benefit other operators. | ||
| * Remove redundant aliases from a query plan. A redundant alias is an alias that does not change | ||
| * the name or metadata of a column, and does not deduplicate it. | ||
| */ | ||
| object RemoveAliasOnlyProject extends Rule[LogicalPlan] { | ||
| object RemoveRedundantAliases extends Rule[LogicalPlan] { | ||
|
|
||
| /** | ||
| * Returns true if the project list is semantically same as child output, after strip alias on | ||
| * attribute. | ||
| * Create an attribute mapping from the old to the new attributes. This function will only | ||
| * return the attribute pairs that have changed. | ||
| */ | ||
| private def isAliasOnly( | ||
| projectList: Seq[NamedExpression], | ||
| childOutput: Seq[Attribute]): Boolean = { | ||
| if (projectList.length != childOutput.length) { | ||
| false | ||
| } else { | ||
| stripAliasOnAttribute(projectList).zip(childOutput).forall { | ||
| case (a: Attribute, o) if a semanticEquals o => true | ||
| case _ => false | ||
| } | ||
| private def createAttributeMapping(current: LogicalPlan, next: LogicalPlan) | ||
| : Seq[(Attribute, Attribute)] = { | ||
| current.output.zip(next.output).filterNot { | ||
| case (a1, a2) => a1.semanticEquals(a2) | ||
| } | ||
| } | ||
|
|
||
| private def stripAliasOnAttribute(projectList: Seq[NamedExpression]) = { | ||
| projectList.map { | ||
| // Alias with metadata can not be stripped, or the metadata will be lost. | ||
| // If the alias name is different from attribute name, we can't strip it either, or we may | ||
| // accidentally change the output schema name of the root plan. | ||
| case a @ Alias(attr: Attribute, name) if a.metadata == Metadata.empty && name == attr.name => | ||
| attr | ||
| case other => other | ||
| } | ||
| /** | ||
| * Remove the top-level alias from an expression when it is redundant. | ||
| */ | ||
| private def removeRedundantAlias(e: Expression, blacklist: AttributeSet): Expression = e match { | ||
| // Alias with metadata can not be stripped, or the metadata will be lost. | ||
| // If the alias name is different from attribute name, we can't strip it either, or we | ||
| // may accidentally change the output schema name of the root plan. | ||
| case a @ Alias(attr: Attribute, name) | ||
| if a.metadata == Metadata.empty && name == attr.name && !blacklist.contains(attr) => | ||
| attr | ||
| case a => a | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = { | ||
| val aliasOnlyProject = plan.collectFirst { | ||
| case p @ Project(pList, child) if isAliasOnly(pList, child.output) => p | ||
| } | ||
| /** | ||
| * Remove redundant alias expression from a LogicalPlan and its subtree. A blacklist is used to | ||
| * prevent the removal of seemingly redundant aliases used to deduplicate the input for a (self) | ||
| * join. | ||
| */ | ||
| private def removeRedundantAliases(plan: LogicalPlan, blacklist: AttributeSet): LogicalPlan = { | ||
| plan match { | ||
| // A join has to be treated differently, because the left and the right side of the join are | ||
| // not allowed to use the same attributes. We use a blacklist to prevent us from creating a | ||
| // situation in which this happens; the rule will only remove an alias if its child | ||
| // attribute is not on the black list. | ||
| case Join(left, right, joinType, condition) => | ||
| val newLeft = removeRedundantAliases(left, blacklist ++ right.outputSet) | ||
| val newRight = removeRedundantAliases(right, blacklist ++ newLeft.outputSet) | ||
| val mapping = AttributeMap( | ||
| createAttributeMapping(left, newLeft) ++ | ||
| createAttributeMapping(right, newRight)) | ||
| val newCondition = condition.map(_.transform { | ||
| case a: Attribute => mapping.getOrElse(a, a) | ||
| }) | ||
| Join(newLeft, newRight, joinType, newCondition) | ||
|
|
||
| case _ => | ||
| // Remove redundant aliases in the subtree(s). | ||
| val currentNextAttrPairs = mutable.Buffer.empty[(Attribute, Attribute)] | ||
| val newNode = plan.mapChildren { child => | ||
| val newChild = removeRedundantAliases(child, blacklist) | ||
| currentNextAttrPairs ++= createAttributeMapping(child, newChild) | ||
| newChild | ||
| } | ||
|
|
||
| aliasOnlyProject.map { case proj => | ||
| val attributesToReplace = proj.output.zip(proj.child.output).filterNot { | ||
| case (a1, a2) => a1 semanticEquals a2 | ||
| } | ||
| val attrMap = AttributeMap(attributesToReplace) | ||
| plan transform { | ||
| case plan: Project if plan eq proj => plan.child | ||
| case plan => plan transformExpressions { | ||
| case a: Attribute if attrMap.contains(a) => attrMap(a) | ||
| // Create the attribute mapping. Note that the currentNextAttrPairs can contain duplicate | ||
| // keys in case of Union (this is caused by the PushProjectionThroughUnion rule); in this | ||
| // case we use the the first mapping (which should be provided by the first child). | ||
| val mapping = AttributeMap(currentNextAttrPairs) | ||
|
|
||
| // Create a an expression cleaning function for nodes that can actually produce redundant | ||
| // aliases, use identity otherwise. | ||
| val clean: Expression => Expression = plan match { | ||
| case _: Project => removeRedundantAlias(_, blacklist) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what if we clean expressions for all nodes? Or like rule
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually can we merge this rule into |
||
| case _: Aggregate => removeRedundantAlias(_, blacklist) | ||
| case _: Window => removeRedundantAlias(_, blacklist) | ||
| case _ => identity[Expression] | ||
| } | ||
| } | ||
| }.getOrElse(plan) | ||
|
|
||
| // Transform the expressions. | ||
| newNode.mapExpressions { expr => | ||
| clean(expr.transform { | ||
| case a: Attribute => mapping.getOrElse(a, a) | ||
| }) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = removeRedundantAliases(plan, AttributeSet.empty) | ||
| } | ||
|
|
||
| /** | ||
| * Remove projections from the query plan that do not make any modifications. | ||
| */ | ||
| object RemoveRedundantProject extends Rule[LogicalPlan] { | ||
| def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| case p @ Project(_, child) if p.output == child.output => child | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you explain what
currentandnextmeans here?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Current is plan before we remove redundant aliases, and next is the plan after we have remove the redundant aliases. I'll update the doc.