[CARMEL-7546][CARMEL-3523] Optimize skewed insert (apache#324)

fenzhu · GitHub Enterprise · commit 424feb32d8b0 · 2024-04-27T01:42:37.000-05:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala
@@ -131,6 +131,7 @@ case class AdaptiveSparkPlanExec(
       CombineAdjacentAggregation,
       RemoveRedundantWindowGroupLimits,
       DisableUnnecessaryBucketedScan,
+      OptimizeSkewedInsert,
       OptimizeSkewedJoin(ensureRequirements)
     ) ++ context.session.sessionState.adaptiveRulesHolder.queryStagePrepRules
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedInsert.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeSkewedInsert.scala
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.adaptive
+
+import scala.collection.mutable
+
+import org.apache.commons.io.FileUtils
+
+import org.apache.spark.sql.catalyst.plans.physical.UnspecifiedDistribution
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution._
+import org.apache.spark.sql.execution.command.DataWritingCommandExec
+import org.apache.spark.sql.execution.datasources.WriteFilesExec
+import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, REPARTITION_BY_COL, ShuffleExchangeLike}
+import org.apache.spark.sql.internal.SQLConf
+
+/**
+ * Splits skewed shuffle partitions that feed a data-writing command into several smaller
+ * reads. The rule does nothing unless `SQLConf.AUTO_REPARTITION_BEFORE_WRITING_ENABLED` is set.
+ */
+object OptimizeSkewedInsert extends Rule[SparkPlan] {
+
+  /** Formats partition size statistics for logging; `sizes` must be non-empty. */
+  private def getSizeInfo(medianSize: Long, sizes: Seq[Long], targetSize: Long): String = {
+    s"median size: $medianSize, max size: ${sizes.max}, min size: ${sizes.min}, avg size: " +
+      s"${sizes.sum / sizes.length}, target size: $targetSize"
+  }
+
+  override def apply(plan: SparkPlan): SparkPlan = {
+    if (!conf.getConf(SQLConf.AUTO_REPARTITION_BEFORE_WRITING_ENABLED)) {
+      plan
+    } else {
+      plan.transformUp {
+        case w @ DataWritingCommandExec(_, writeFiles @ WriteFilesExec(child, _, _, _, _, _))
+          if supportOptimization(w) =>
+          // Put the rewritten input back under WriteFilesExec: replacing the command's child
+          // directly would silently remove the WriteFilesExec node from the plan.
+          handleSkewed(child).fold[SparkPlan](w) { newChild =>
+            w.withNewChildren(writeFiles.withNewChildren(newChild :: Nil) :: Nil)
+          }
+      }
+    }
+  }
+
+  /**
+   * Returns `child` rewritten to read the skewed partitions of its shuffle stage in several
+   * splits. Returns None when `child` is not a supported shuffle stage (optionally below a
+   * local sort) or when no partition was split.
+   */
+  private def handleSkewed(child: SparkPlan): Option[SparkPlan] = {
+    val stage = child match {
+      case SortExec(_, false, ShuffleStage(s), _) => Some(s)
+      case ShuffleStage(s) => Some(s)
+      case _ => None
+    }
+    stage.filter(s => supportOptimizeSkew(s.shuffle)).flatMap(createSkewedReader).map { reader =>
+      child match {
+        // Keep the local sort on top of the new reader.
+        case sort: SortExec => sort.withNewChildren(Seq(reader))
+        case _ => reader
+      }
+    }
+  }
+
+  /** Builds a reader that splits the skewed partitions of `queryStage`, if there are any. */
+  private def createSkewedReader(queryStage: ShuffleQueryStageExec): Option[SparkPlan] = {
+    queryStage.mapStats.flatMap { mapStats =>
+      val sizes = mapStats.bytesByPartitionId
+      // We use the median size of the original shuffle partitions to detect skewed partitions.
+      val medSize = SkewHandlingUtil.medianSize(mapStats)
+      val targetSize = SkewHandlingUtil.targetSize(sizes, medSize, conf)
+      logInfo(
+        s"""
+           |Optimizing skewed insert, partition size info:
+           |${getSizeInfo(medSize, sizes.toSeq, targetSize)}
+        """.stripMargin)
+
+      var numSkewed = 0
+      val shufflePartitions = mutable.ArrayBuffer.empty[ShufflePartitionSpec]
+      for (partitionId <- sizes.indices) {
+        val size = sizes(partitionId)
+        val skewSpecs = if (SkewHandlingUtil.isSkewed(size, medSize, conf)) {
+          ShufflePartitionsUtil.createSkewPartitionSpecs(
+            mapStats.shuffleId, partitionId, targetSize)
+        } else {
+          None
+        }
+        skewSpecs.foreach { specs =>
+          logInfo(s"Partition $partitionId " +
+            s"(${FileUtils.byteCountToDisplaySize(size)}) is skewed, " +
+            s"split it into ${specs.length} parts.")
+          numSkewed += 1
+        }
+        val wholePartition = CoalescedPartitionSpec(partitionId, partitionId + 1, size)
+        shufflePartitions ++= skewSpecs.getOrElse(Seq(wholePartition))
+      }
+
+      logInfo(s"number of skewed partitions: $numSkewed")
+      // Scala 2.13's default Seq is immutable, so the buffer is converted explicitly.
+      if (numSkewed > 0) Some(AQEShuffleReadExec(queryStage, shufflePartitions.toSeq)) else None
+    }
+  }
+
+  private def supportOptimizeSkew(s: ShuffleExchangeLike): Boolean = {
+    s.shuffleOrigin == REPARTITION_BY_COL || s.shuffleOrigin == ENSURE_REQUIREMENTS
+  }
+
+  private def supportOptimization(plan: SparkPlan): Boolean = {
+    plan.requiredChildDistribution.forall(_ == UnspecifiedDistribution)
+  }
+
+  /** Matches a materialized shuffle query stage whose map output statistics are defined. */
+  private object ShuffleStage {
+    def unapply(plan: SparkPlan): Option[ShuffleQueryStageExec] = plan match {
+      case s: ShuffleQueryStageExec if s.isMaterialized && s.mapStats.isDefined => Some(s)
+      case _ => None
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/adaptive/AdaptiveQueryExecSuite.scala
@@ -3046,6 +3046,172 @@ class AdaptiveQueryExecSuite
       }
     }
   }
+
+  def checkSkewInsert(plan: SparkPlan, expectedSkewPartitions: Int): Unit = {
+    val readers = plan.collect { case r: AQEShuffleReadExec => r }
+    // Report a missing reader as an assertion failure rather than an exception from `head`.
+    assert(readers.nonEmpty, s"No AQEShuffleReadExec found in plan:\n$plan")
+    assert(readers.head.hasSkewedPartition)
+    // hasCoalescedPartition is deliberately not asserted: 0-size partitions are ignored.
+    val numSkewedPartitions = readers.head.partitionSpecs.collect {
+      case p: PartialReducerPartitionSpec => p.reducerIndex
+    }.distinct.length
+    assert(numSkewedPartitions == expectedSkewPartitions)
+  }
+
+  /** Unwraps nested command results; for an adaptive plan, returns its final physical plan. */
+  protected def getCorePlan(plan: SparkPlan): SparkPlan = plan match {
+    case org.apache.spark.sql.execution.CommandResultExec(_, wrapped, _) =>
+      // Recurse: the wrapped plan may itself be a command result or an adaptive plan.
+      getCorePlan(wrapped)
+    case adaptive: AdaptiveSparkPlanExec => adaptive.finalPhysicalPlan
+    case other => other
+  }
+
+  /** Removes a top-level `CommandResultExec` wrapper from `plan`, if there is one. */
+  protected def stripCommandResultExec(plan: SparkPlan): SparkPlan = plan match {
+    // Only one level is removed; anything below the wrapper is returned untouched.
+    case org.apache.spark.sql.execution.CommandResultExec(_, wrapped, _) => wrapped
+    case other => other
+  }
+
+  test("adaptive skewed insert: create as select command") {
+    withTable("tbl", "tbl2") {
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
+        SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true",
+        SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1",
+        SQLConf.AUTO_REPARTITION_BEFORE_WRITING_ENABLED.key -> "true",
+        SQLConf.SHUFFLE_TARGET_POSTSHUFFLE_INPUT_SIZE.key -> "100",
+        SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "100") {
+        spark.range(0, 1000, 1, 10).selectExpr("id % 1 as key", "id as value")
+          .write.saveAsTable("tbl")
+        // Asserts inside a listener cannot fail the test; record plans after draining old events.
+        spark.sparkContext.listenerBus.waitUntilEmpty()
+        @volatile var finalPlans = Seq.empty[SparkPlan]
+        val listener = new QueryExecutionListener {
+          override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit =
+            stripCommandResultExec(qe.executedPlan) match {
+              case ae: AdaptiveSparkPlanExec => finalPlans :+= ae.finalPhysicalPlan
+              case _ =>
+            }
+          override def onFailure(funcName: String, qe: QueryExecution,
+              exception: Exception): Unit = {}
+        }
+        spark.listenerManager.register(listener)
+        try {
+          spark.sql("create table tbl2 using parquet partitioned by (key) select * from tbl")
+          spark.sparkContext.listenerBus.waitUntilEmpty()
+        } finally {
+          spark.listenerManager.unregister(listener)
+        }
+        assert(finalPlans.nonEmpty, "the listener did not observe any adaptive plan")
+        finalPlans.foreach { plan =>
+          assert(plan.collect { case qs: ShuffleQueryStageExec => qs }.length == 1)
+          checkSkewInsert(plan, 1)
+        }
+        assert(sql("select count(*) from tbl2").collect().head.getLong(0) == 1000)
+      }
+    }
+  }
+
+  test("adaptive skewed insert: insert into command") {
+    withTable("tbl", "tbl2") {
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
+        SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true",
+        SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100",
+        SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "100",
+        SQLConf.PARTITION_OVERWRITE_MODE.key -> "dynamic",
+        SQLConf.AUTO_REPARTITION_BEFORE_WRITING_ENABLED.key -> "true") {
+
+        // `id % 1` is always 0, so all 1000 source rows share the same key.
+        val source = spark
+          .range(0, 1000, 1, 10)
+          .selectExpr("id % 1 as key", "id % 1 as value")
+        source.write.saveAsTable("tbl")
+        spark.sql(
+          "create table tbl2(key int, value int) using parquet partitioned by (key)")
+        val insertDf = spark.sql("insert overwrite table tbl2 partition(key) select * from tbl")
+
+        // Inspect the plan returned by getCorePlan for the executed insert.
+        val finalPlan = getCorePlan(insertDf.queryExecution.sparkPlan)
+        val numWriteOps = finalPlan.collect { case w: DataWritingCommandExec => w }.size
+        assert(numWriteOps == 1)
+        val numShuffleStages = finalPlan.collect { case qs: ShuffleQueryStageExec => qs }.length
+        assert(numShuffleStages == 1)
+        checkSkewInsert(finalPlan, 1)
+
+        // All source rows must have been written to the target table.
+        val writtenRows = sql("select count(*) from tbl2").collect().head.getLong(0)
+        assert(writtenRows == 1000)
+      }
+    }
+  }
+
+  test("CARMEL-2389 adaptive skewed insert: ArrayIndexOutOfBoundsException exception") {
+    withTable("tbl", "tbl2") {
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
+        SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true",
+        SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100",
+        SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "100",
+        SQLConf.PARTITION_OVERWRITE_MODE.key -> "dynamic",
+        SQLConf.AUTO_REPARTITION_BEFORE_WRITING_ENABLED.key -> "true",
+        SQLConf.SHUFFLE_PARTITIONS.key -> "8") {
+
+        // `id % 3` yields three distinct keys while 8 shuffle partitions are configured.
+        val source = spark
+          .range(0, 1000, 1, 10)
+          .selectExpr("id % 3 as key", "id % 1 as value")
+        source.write.saveAsTable("tbl")
+        spark.sql(
+          "create table tbl2(key int, value int) using parquet partitioned by (key)")
+        val insertDf = spark.sql("insert overwrite table tbl2 partition(key) select * from tbl")
+
+        val finalPlan = getCorePlan(insertDf.queryExecution.sparkPlan)
+        val numWriteOps = finalPlan.collect { case w: DataWritingCommandExec => w }.size
+        assert(numWriteOps == 1)
+        val numShuffleStages = finalPlan.collect { case qs: ShuffleQueryStageExec => qs }.length
+        assert(numShuffleStages == 1)
+
+        // The insert must complete without error and write every source row.
+        val writtenRows = sql("select count(*) from tbl2").collect().head.getLong(0)
+        assert(writtenRows == 1000)
+      }
+    }
+  }
+
+  test("adaptive skewed insert: insert into command, source table is empty") {
+    withTable("tbl", "tbl2") {
+      withSQLConf(
+        SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "true",
+        SQLConf.ADAPTIVE_EXECUTION_FORCE_APPLY.key -> "true",
+        SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES.key -> "100",
+        SQLConf.SKEW_JOIN_SKEWED_PARTITION_THRESHOLD.key -> "100",
+        SQLConf.PARTITION_OVERWRITE_MODE.key -> "dynamic",
+        SQLConf.AUTO_REPARTITION_BEFORE_WRITING_ENABLED.key -> "true") {
+
+        // Source and target share the same partitioned schema; the source is never populated.
+        Seq("tbl", "tbl2").foreach { name =>
+          spark.sql(s"create table $name(key int, value int) using parquet " +
+            "partitioned by (key)")
+        }
+        val insertDf = spark.sql("insert overwrite table tbl2 partition(key) select * from tbl")
+
+        val finalPlan = getCorePlan(insertDf.queryExecution.sparkPlan)
+        val numWriteOps = finalPlan.collect { case w: DataWritingCommandExec => w }.size
+        assert(numWriteOps == 1)
+
+        // No shuffle query stage is expected in the plan for an empty source.
+        val shuffleStages = finalPlan.collect { case qs: ShuffleQueryStageExec => qs }
+        assert(shuffleStages.isEmpty)
+
+        val writtenRows = sql("select count(*) from tbl2").collect().head.getLong(0)
+        assert(writtenRows == 0)
+      }
+    }
+  }
 }
 
 /**

Original file line number	Diff line number	Diff line change
`@@ -131,6 +131,7 @@ case class AdaptiveSparkPlanExec(`
`131`	`131`	`CombineAdjacentAggregation,`
`132`	`132`	`RemoveRedundantWindowGroupLimits,`
`133`	`133`	`DisableUnnecessaryBucketedScan,`
	`134`	`+ OptimizeSkewedInsert,`
`134`	`135`	`OptimizeSkewedJoin(ensureRequirements)`
`135`	`136`	`) ++ context.session.sessionState.adaptiveRulesHolder.queryStagePrepRules`
`136`	`137`	`}`