-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-20392][SQL] Set barrier to prevent re-entering a tree #17770
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
82978d7
24905e3
e15b001
a076d83
b29ded3
8c8fe1e
a855182
4ff9610
d0a94f4
02e11f9
17f1a02
4629959
c313e35
7e9dfac
fba3690
f63ea0b
b9d03cd
6a7204c
3437ae0
555fa8e
505aba6
f3e4208
c0bee01
1c1cc9d
eb0598e
cba784b
b478e55
8314cc3
6add9ec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -165,7 +165,8 @@ class Analyzer( | |
| Batch("Subquery", Once, | ||
| UpdateOuterReferences), | ||
| Batch("Cleanup", fixedPoint, | ||
| CleanupAliases) | ||
| CleanupAliases, | ||
| CleanupBarriers) | ||
| ) | ||
|
|
||
| /** | ||
|
|
@@ -2435,6 +2436,13 @@ object CleanupAliases extends Rule[LogicalPlan] { | |
| } | ||
| } | ||
|
|
||
| /** Remove the barrier nodes of analysis */ | ||
| object CleanupBarriers extends Rule[LogicalPlan] { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. |
||
| override def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok. |
||
| case AnalysisBarrier(child) => child | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Maps a time column to multiple time windows using the Expand operator. Since it's non-trivial to | ||
| * figure out how many windows a time column can map to, we over-estimate the number of windows and | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -441,4 +441,17 @@ class AnalysisSuite extends AnalysisTest with ShouldMatchers { | |
|
|
||
| checkAnalysis(SubqueryAlias("tbl", testRelation).as("tbl2"), testRelation) | ||
| } | ||
|
|
||
| test("analysis barrier") { | ||
| // [[AnalysisBarrier]] will be removed after analysis | ||
| checkAnalysis( | ||
| Project(Seq(UnresolvedAttribute("tbl.a")), | ||
| AnalysisBarrier(SubqueryAlias("tbl", testRelation))), | ||
| Project(testRelation.output, SubqueryAlias("tbl", testRelation))) | ||
|
|
||
| // Make sure we won't resolve the plans wrapped in an [[AnalysisBarrier]] | ||
| val barrier = AnalysisBarrier(Project(Seq(UnresolvedAttribute("tbl.b")), | ||
| SubqueryAlias("tbl", testRelation))) | ||
| assertAnalysisError(barrier, Seq("cannot resolve '`tbl.b`'")) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where is this exception thrown?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -356,7 +356,10 @@ class Dataset[T] private[sql]( | |
| */ | ||
| // This is declared with parentheses to prevent the Scala compiler from treating | ||
| // `ds.toDF("1")` as invoking this toDF and then apply on the returned DataFrame. | ||
| def toDF(): DataFrame = new Dataset[Row](sparkSession, queryExecution, RowEncoder(schema)) | ||
| def toDF(): DataFrame = { | ||
| val plan = AnalysisBarrier(logicalPlan) | ||
| new Dataset[Row](sparkSession, plan, RowEncoder(schema)) | ||
| } | ||
|
|
||
| /** | ||
| * :: Experimental :: | ||
|
|
@@ -702,7 +705,7 @@ class Dataset[T] private[sql]( | |
| * @since 2.0.0 | ||
| */ | ||
| def join(right: Dataset[_]): DataFrame = withPlan { | ||
| Join(logicalPlan, right.logicalPlan, joinType = Inner, None) | ||
| Join(AnalysisBarrier(logicalPlan), right.logicalPlan, joinType = Inner, None) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For self-join de-duplication, we only set barrier for left side.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am wondering if we should check there's duplication between right and left sides and decide using barrier or not for right side. |
||
| } | ||
|
|
||
| /** | ||
|
|
@@ -785,8 +788,8 @@ class Dataset[T] private[sql]( | |
|
|
||
| withPlan { | ||
| Join( | ||
| joined.left, | ||
| joined.right, | ||
| AnalysisBarrier(joined.left), | ||
| AnalysisBarrier(joined.right), | ||
| UsingJoin(JoinType(joinType), usingColumns), | ||
| None) | ||
| } | ||
|
|
@@ -841,17 +844,18 @@ class Dataset[T] private[sql]( | |
| // Trigger analysis so in the case of self-join, the analyzer will clone the plan. | ||
| // After the cloning, left and right side will have distinct expression ids. | ||
| val plan = withPlan( | ||
| Join(logicalPlan, right.logicalPlan, JoinType(joinType), Some(joinExprs.expr))) | ||
| .queryExecution.analyzed.asInstanceOf[Join] | ||
| Join(AnalysisBarrier(logicalPlan), right.logicalPlan, JoinType(joinType), | ||
| Some(joinExprs.expr))) | ||
| .queryExecution.analyzed.asInstanceOf[Join] | ||
|
|
||
| // If auto self join alias is disabled, return the plan. | ||
| if (!sparkSession.sessionState.conf.dataFrameSelfJoinAutoResolveAmbiguity) { | ||
| return withPlan(plan) | ||
| } | ||
|
|
||
| // If left/right have no output set intersection, return the plan. | ||
| val lanalyzed = withPlan(this.logicalPlan).queryExecution.analyzed | ||
| val ranalyzed = withPlan(right.logicalPlan).queryExecution.analyzed | ||
| val lanalyzed = withPlan(AnalysisBarrier(this.logicalPlan)).queryExecution.analyzed | ||
| val ranalyzed = withPlan(AnalysisBarrier(right.logicalPlan)).queryExecution.analyzed | ||
| if (lanalyzed.outputSet.intersect(ranalyzed.outputSet).isEmpty) { | ||
| return withPlan(plan) | ||
| } | ||
|
|
@@ -883,7 +887,7 @@ class Dataset[T] private[sql]( | |
| * @since 2.1.0 | ||
| */ | ||
| def crossJoin(right: Dataset[_]): DataFrame = withPlan { | ||
| Join(logicalPlan, right.logicalPlan, joinType = Cross, None) | ||
| Join(AnalysisBarrier(logicalPlan), right.logicalPlan, joinType = Cross, None) | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -1134,7 +1138,7 @@ class Dataset[T] private[sql]( | |
| */ | ||
| @scala.annotation.varargs | ||
| def select(cols: Column*): DataFrame = withPlan { | ||
| Project(cols.map(_.named), logicalPlan) | ||
| Project(cols.map(_.named), AnalysisBarrier(logicalPlan)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this work if we turn off eager analysis?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we still have eager analysis? I remember it is removed before.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After this PR #11443, we always do the eager analysis. |
||
| } | ||
|
|
||
| /** | ||
|
|
@@ -1812,7 +1816,7 @@ class Dataset[T] private[sql]( | |
|
|
||
| withPlan { | ||
| Generate(generator, join = true, outer = false, | ||
| qualifier = None, generatorOutput = Nil, logicalPlan) | ||
| qualifier = None, generatorOutput = Nil, AnalysisBarrier(logicalPlan)) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -1853,7 +1857,7 @@ class Dataset[T] private[sql]( | |
|
|
||
| withPlan { | ||
| Generate(generator, join = true, outer = false, | ||
| qualifier = None, generatorOutput = Nil, logicalPlan) | ||
| qualifier = None, generatorOutput = Nil, AnalysisBarrier(logicalPlan)) | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How about moving this rule to
Batch("Finish Analysis", ...?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We do cleaning up the barriers in the end of Analysis is because we don't want to show it in analyzed plan. If we move it the "Finish Analysis" batch, it will show up.