[SPARK-32332][SQL] Support columnar exchanges when AQE is enabled #29134
```diff
@@ -202,7 +202,7 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] {
       val leftParts = if (isLeftSkew && !isLeftCoalesced) {
         val reducerId = leftPartSpec.startReducerIndex
         val skewSpecs = createSkewPartitionSpecs(
-          left.shuffleStage.shuffle.shuffleDependency.shuffleId, reducerId, leftTargetSize)
+          left.shuffleStage.shuffle.shuffleId, reducerId, leftTargetSize)
         if (skewSpecs.isDefined) {
           logDebug(s"Left side partition $partitionIndex " +
             s"(${FileUtils.byteCountToDisplaySize(leftActualSize)}) is skewed, " +
```

Contributor (on `left.shuffleStage.shuffle.shuffleId`): Get shuffleId from mapStats.

```diff
@@ -218,7 +218,7 @@ case class OptimizeSkewedJoin(conf: SQLConf) extends Rule[SparkPlan] {
       val rightParts = if (isRightSkew && !isRightCoalesced) {
         val reducerId = rightPartSpec.startReducerIndex
         val skewSpecs = createSkewPartitionSpecs(
-          right.shuffleStage.shuffle.shuffleDependency.shuffleId, reducerId, rightTargetSize)
+          right.shuffleStage.shuffle.shuffleId, reducerId, rightTargetSize)
         if (skewSpecs.isDefined) {
           logDebug(s"Right side partition $partitionIndex " +
             s"(${FileUtils.byteCountToDisplaySize(rightActualSize)}) is skewed, " +
```
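The comment above refers to `MapOutputStatistics`, which already pairs the shuffle id with per-partition output sizes. A minimal sketch of that shape, with a hypothetical helper name (not the rule's actual code):

```scala
import org.apache.spark.MapOutputStatistics

// MapOutputStatistics carries the shuffle id alongside the output bytes of
// every reduce partition, so a rule that already holds the statistics can
// read the id directly instead of reaching through the exchange's
// ShuffleDependency. `skewCandidate` is an illustrative helper, not Spark API.
def skewCandidate(mapStats: MapOutputStatistics, reducerId: Int): (Int, Long) =
  (mapStats.shuffleId, mapStats.bytesByPartitionId(reducerId))
```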
```diff
@@ -37,16 +37,30 @@ import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics, SQLShuffleReadMetricsReporter, SQLShuffleWriteMetricsReporter}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.StructType
+import org.apache.spark.sql.vectorized.ColumnarBatch
 import org.apache.spark.util.MutablePair
 import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, RecordComparator}

+/**
+ * Base class for implementations of shuffle exchanges. This was added to enable plugins to
+ * provide columnar implementations of shuffle exchanges when Adaptive Query Execution is
+ * enabled.
+ */
+abstract class ShuffleExchange extends Exchange {
+  def shuffleId: Int
+  def getNumMappers: Int
+  def getNumReducers: Int
+  def canChangeNumPartitions: Boolean
+  def mapOutputStatisticsFuture: Future[MapOutputStatistics]
+}
+
 /**
  * Performs a shuffle that will result in the desired partitioning.
  */
 case class ShuffleExchangeExec(
     override val outputPartitioning: Partitioning,
     child: SparkPlan,
-    canChangeNumPartitions: Boolean = true) extends Exchange {
+    canChangeNumPartitions: Boolean = true) extends ShuffleExchange {

   private lazy val writeMetrics =
     SQLShuffleWriteMetricsReporter.createShuffleWriteMetrics(sparkContext)
```

Contributor (on `def shuffleId: Int`): This is available in […]

Contributor (on `def getNumMappers: Int`): We can get this through MapOutputTracker.
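On the `getNumMappers` comment: the mapper count can indeed be recovered from the driver-side tracker. A rough sketch under the assumption that it runs on the driver after the map stage completes; the helper name is hypothetical:

```scala
import org.apache.spark.{MapOutputTrackerMaster, SparkEnv}

// Illustrative helper: the driver-side MapOutputTracker counts the map
// outputs registered for a shuffle, which equals the number of mappers once
// the map stage has finished, so the value need not be part of the
// ShuffleExchange contract.
def numMappersFor(shuffleId: Int): Int =
  SparkEnv.get.mapOutputTracker
    .asInstanceOf[MapOutputTrackerMaster]
    .getNumAvailableOutputs(shuffleId)
```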
```diff
@@ -63,6 +77,12 @@ case class ShuffleExchangeExec(

   @transient lazy val inputRDD: RDD[InternalRow] = child.execute()

+  override def shuffleId: Int = shuffleDependency.shuffleId
+
+  override def getNumMappers: Int = shuffleDependency.rdd.getNumPartitions
+
+  override def getNumReducers: Int = shuffleDependency.partitioner.numPartitions
+
   // 'mapOutputStatisticsFuture' is only needed when enable AQE.
   @transient lazy val mapOutputStatisticsFuture: Future[MapOutputStatistics] = {
     if (inputRDD.getNumPartitions == 0) {
```
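For context on why `mapOutputStatisticsFuture` sits on the new base class: when AQE materializes a shuffle stage it only needs the map output sizes, not the row or batch data, so any `ShuffleExchange` implementation can drive re-optimization. A simplified sketch of that consumption pattern (not AQE's actual code):

```scala
import scala.concurrent.Await
import scala.concurrent.duration.Duration
import org.apache.spark.MapOutputStatistics
import org.apache.spark.sql.execution.exchange.ShuffleExchange

// Sketch: a row-based or columnar ShuffleExchange plugs into adaptive
// planning the same way, because the planner only inspects the statistics.
def awaitStats(exchange: ShuffleExchange): MapOutputStatistics =
  Await.result(exchange.mapOutputStatisticsFuture, Duration.Inf)
```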
```diff
@@ -18,15 +18,20 @@ package org.apache.spark.sql

 import java.util.Locale

-import org.apache.spark.{SparkFunSuite, TaskContext}
+import scala.concurrent.Future
+
+import org.apache.spark.{MapOutputStatistics, SparkFunSuite, TaskContext}
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
+import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow, TableIdentifier}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParserInterface}
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, UnresolvedHint}
+import org.apache.spark.sql.catalyst.plans.physical.BroadcastMode
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution._
+import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, QueryStageExec}
+import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchange, ShuffleExchangeExec}
 import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.COLUMN_BATCH_SIZE
```
```diff
@@ -145,33 +150,56 @@ class SparkSessionExtensionSuite extends SparkFunSuite {
     }
   }

-  test("inject columnar") {
+  test("inject columnar AQE on") {
+    testInjectColumnar(true)
+  }
+
+  test("inject columnar AQE off") {
+    testInjectColumnar(false)
+  }
+
+  private def testInjectColumnar(adaptiveEnabled: Boolean) {
+    def collectPlanSteps(plan: SparkPlan): Seq[Int] = plan match {
+      case a: AdaptiveSparkPlanExec =>
+        assert(a.toString.startsWith("AdaptiveSparkPlan isFinalPlan=true"))
+        collectPlanSteps(a.executedPlan)
+      case _ => plan.collect {
+        case _: ReplacedRowToColumnarExec => 1
+        case _: ColumnarProjectExec => 10
+        case _: ColumnarToRowExec => 100
+        case s: QueryStageExec => collectPlanSteps(s.plan).sum
+        case _: MyShuffleExchangeExec => 1000
+        case _: MyBroadcastExchangeExec => 10000
+      }
+    }
+
     val extensions = create { extensions =>
       extensions.injectColumnar(session =>
         MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))
     }
     withSession(extensions) { session =>
-      // The ApplyColumnarRulesAndInsertTransitions rule is not applied when enable AQE
-      session.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, false)
+      session.sessionState.conf.setConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED, adaptiveEnabled)
       assert(session.sessionState.columnarRules.contains(
         MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule())))
       import session.sqlContext.implicits._
-      // repartitioning avoids having the add operation pushed up into the LocalTableScan
-      val data = Seq((100L), (200L), (300L)).toDF("vals").repartition(1)
-      val df = data.selectExpr("vals + 1")
+      // perform a join to inject a broadcast exchange
+      val left = Seq((1, 50L), (2, 100L), (3, 150L)).toDF("l1", "l2")
+      val right = Seq((1, 50L), (2, 100L), (3, 150L)).toDF("r1", "r2")
+      val data = left.join(right, $"l1" === $"r1")
+        // repartitioning avoids having the add operation pushed up into the LocalTableScan
+        .repartition(1)
+      val df = data.selectExpr("l2 + r2")
+      // execute the plan so that the final adaptive plan is available when AQE is on
+      df.collect()
       // Verify that both pre and post processing of the plan worked.
-      val found = df.queryExecution.executedPlan.collect {
-        case rep: ReplacedRowToColumnarExec => 1
-        case proj: ColumnarProjectExec => 10
-        case c2r: ColumnarToRowExec => 100
-      }.sum
-      assert(found == 111)
+      val found = collectPlanSteps(df.queryExecution.executedPlan).sum
+      assert(found == 11121)
       // Verify that we get back the expected, wrong, result
       val result = df.collect()
-      assert(result(0).getLong(0) == 102L) // Check that broken columnar Add was used.
-      assert(result(1).getLong(0) == 202L)
-      assert(result(2).getLong(0) == 302L)
+      assert(result(0).getLong(0) == 101L) // Check that broken columnar Add was used.
+      assert(result(1).getLong(0) == 201L)
+      assert(result(2).getLong(0) == 301L)
     }
   }
```

Contributor (on `assert(found == 11121)`): might be nice to comment what 11121 equals in terms of the execs - MyBroadcastExchangeExec, etc.
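Reading 11121 against the weights in `collectPlanSteps`, the digits decompose as 11121 = 1 × 10000 + 1 × 1000 + 1 × 100 + 2 × 10 + 1 × 1, i.e. one MyBroadcastExchangeExec, one MyShuffleExchangeExec, one ColumnarToRowExec, two ColumnarProjectExec, and one ReplacedRowToColumnarExec in the final plan.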
```diff
@@ -671,6 +699,15 @@ case class PreRuleReplaceAddWithBrokenVersion() extends Rule[SparkPlan] {
   def replaceWithColumnarPlan(plan: SparkPlan): SparkPlan =
     try {
       plan match {
+        case e: ShuffleExchangeExec =>
+          // note that this is not actually columnar but demonstrates that exchanges can
+          // be replaced, particularly when adaptive query is enabled
+          val replaced = e.withNewChildren(e.children.map(replaceWithColumnarPlan))
+          MyShuffleExchangeExec(replaced.asInstanceOf[ShuffleExchangeExec])
+        case e: BroadcastExchangeExec =>
+          // note that this is not actually columnar but demonstrates that exchanges can
+          // be replaced, particularly when adaptive query is enabled
+          new MyBroadcastExchangeExec(e.mode, e.child)
         case plan: ProjectExec =>
           new ColumnarProjectExec(plan.projectList.map((exp) =>
             replaceWithColumnarExpression(exp).asInstanceOf[NamedExpression]),
```
```diff
@@ -689,6 +726,37 @@ case class PreRuleReplaceAddWithBrokenVersion() extends Rule[SparkPlan] {
   override def apply(plan: SparkPlan): SparkPlan = replaceWithColumnarPlan(plan)
 }

+/**
+ * Custom Exchange used in tests to demonstrate that shuffles can be replaced regardless of
+ * whether adaptive query is enabled.
+ */
+case class MyShuffleExchangeExec(delegate: ShuffleExchangeExec) extends ShuffleExchange {
+  override def shuffleId: Int = delegate.shuffleId
+  override def getNumMappers: Int = delegate.getNumMappers
+  override def getNumReducers: Int = delegate.getNumReducers
+  override def canChangeNumPartitions: Boolean = delegate.canChangeNumPartitions
+  override def mapOutputStatisticsFuture: Future[MapOutputStatistics] =
+    delegate.mapOutputStatisticsFuture
+  override def child: SparkPlan = delegate.child
+  override protected def doExecute(): RDD[InternalRow] = delegate.execute()
+}
+
+/**
+ * Custom Exchange used in tests to demonstrate that broadcasts can be replaced regardless of
+ * whether adaptive query is enabled.
+ *
+ * Note that extending a Spark case class is not recommended, but this was the easiest way to
+ * implement these tests.
+ */
+class MyBroadcastExchangeExec(mode: BroadcastMode,
+    child: SparkPlan) extends BroadcastExchangeExec(mode, child) {
+  override def equals(o: Any): Boolean = o match {
+    case o: MyBroadcastExchangeExec => mode.equals(o.mode) && child.equals(o.child)
+    case _ => false
+  }
+  override def hashCode(): Int = mode.hashCode() + child.hashCode()
+}
+
 class ReplacedRowToColumnarExec(override val child: SparkPlan)
   extends RowToColumnarExec(child) {
```
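To place these test classes in context, the usual deployment path for such a rule is a `SparkSessionExtensions` callback registered via `spark.sql.extensions`. A minimal sketch that reuses the suite's rule classes; the `MyExtensions` name and the local master URL are assumptions for illustration:

```scala
import org.apache.spark.sql.{SparkSession, SparkSessionExtensions}

// Hypothetical extension entry point that injects the suite's columnar rule.
// injectColumnar is the extension hook the replacements in this PR rely on.
class MyExtensions extends (SparkSessionExtensions => Unit) {
  override def apply(extensions: SparkSessionExtensions): Unit = {
    extensions.injectColumnar(session =>
      MyColumarRule(PreRuleReplaceAddWithBrokenVersion(), MyPostRule()))
  }
}

object MyExtensionsDemo {
  def main(args: Array[String]): Unit = {
    // Every session built with this config can have its shuffle and broadcast
    // exchanges swapped for plugin implementations, with AQE on or off.
    val spark = SparkSession.builder()
      .master("local[2]") // assumption: local testing
      .config("spark.sql.extensions", classOf[MyExtensions].getName)
      .getOrCreate()
    spark.stop()
  }
}
```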
Review comment: How can we guarantee the top node is still an Exchange after applying physical rules?

Review comment: I think it's safer to use `s.withNewChildren(optimizedChildPlan)` than adding special handling in those physical rules: https://github.com/apache/spark/pull/29134/files#diff-a30c7a6fcdcdd13e57135fd04d05f3b7R115-R117. That saves you the trouble of worrying about certain assumptions being broken in an arbitrary rule.
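A small sketch of the API the second comment points at; `exchange` and `optimizedChild` stand in for AQE-internal values:

```scala
import org.apache.spark.sql.execution.SparkPlan

// withNewChildren is generic TreeNode API: it rebuilds a node around new
// children without matching on the node's concrete class, so a plugin's
// replacement Exchange survives re-planning untouched.
def reinsertChild(exchange: SparkPlan, optimizedChild: SparkPlan): SparkPlan =
  exchange.withNewChildren(Seq(optimizedChild))
```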