
Commit d177485

wangyum authored and GitHub Enterprise committed

[HADP-55008] Repartition before writing also support sort columns (apache#546)

1 parent eda8544 · commit d177485

File tree

3 files changed: +45, −8 lines

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 2 additions & 2 deletions
@@ -5136,10 +5136,10 @@ object SQLConf {
       .createWithDefault(2000)
 
   val AUTO_REPARTITION_BEFORE_WRITING_ENABLED =
-    buildConf("spark.carmel.sql.repartition.writing.enabled")
+    buildConf("spark.sql.repartition.writing.enabled")
       .internal()
       .doc("When true, add a shuffle before writing data into partitioned table or bucket table.")
-      .version("4.0.0")
+      .version("3.5.0")
       .booleanConf
       .createWithDefault(false)

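The renamed conf keeps its default of false, so the rule has to be switched on explicitly. A minimal sketch of toggling it for a session (assuming a local SparkSession; the conf is marked internal, so it is not in the public configuration docs):

// Sketch: enable the repartition-before-writing rule for this session.
// The key matches the renamed conf above; the rest is generic setup.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("repartition-before-writing")
  .master("local[*]")
  .getOrCreate()

spark.conf.set("spark.sql.repartition.writing.enabled", "true")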
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/RepartitionBeforeWriting.scala

Lines changed: 14 additions & 5 deletions
@@ -17,8 +17,8 @@
 
 package org.apache.spark.sql.execution.datasources
 
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet}
-import org.apache.spark.sql.catalyst.plans.logical.{AppendData, HasPartitionExpressions, LogicalPlan, RebalancePartitions, RepartitionByExpression, RepartitionOperation}
+import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, AttributeSet, SortOrder}
+import org.apache.spark.sql.catalyst.plans.logical.{AppendData, HasPartitionExpressions, LogicalPlan, RebalancePartitions, RepartitionByExpression, RepartitionOperation, Sort}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.connector.expressions.{BucketTransform, IdentityTransform}
 import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
@@ -34,6 +34,7 @@ object RepartitionBeforeWriting extends Rule[LogicalPlan] {
 
   def buildRepartition(
       attributes: Seq[Attribute],
+      sortCols: Seq[Attribute],
       query: LogicalPlan,
       numPartitions: Option[Int] = None): LogicalPlan = {
     query.collectFirst { case r: RepartitionOperation => r } match {
@@ -43,7 +44,11 @@ object RepartitionBeforeWriting extends Rule[LogicalPlan] {
       case _ =>
         val repartitionOper = new RepartitionByExpression(attributes, query, numPartitions, None)
         repartitionOper.setTagValue(LogicalPlan.REMOVABLE_PLAN_TAG, true)
-        repartitionOper
+        if (sortCols.isEmpty) {
+          repartitionOper
+        } else {
+          Sort(sortCols.map(SortOrder(_, Ascending)), global = false, repartitionOper)
+        }
     }
   }
 
@@ -72,7 +77,7 @@ object RepartitionBeforeWriting extends Rule[LogicalPlan] {
       numPartitions: Option[Int] = None): LogicalPlan = {
     val attributes =
       query.outputSet.filter(attr => parts.exists(part => query.conf.resolver(part, attr.name)))
-    buildRepartition(attributes.toSeq, query, numPartitions)
+    buildRepartition(attributes.toSeq, Nil, query, numPartitions)
   }
 
   private def buildRebalanceForV2(
@@ -100,7 +105,11 @@ object RepartitionBeforeWriting extends Rule[LogicalPlan] {
       if (bucketColumns.isEmpty) {
         i
       } else {
-        i.copy(query = buildRepartition(bucketColumns, query, Some(bucket.numBuckets)))
+        val sortCols =
+          bucket.sortColumnNames
+            .map { col => query.resolve(Seq(col), resolver).get.toAttribute }
+            .filterNot(_.foldable)
+        i.copy(query = buildRepartition(bucketColumns, sortCols, query, Some(bucket.numBuckets)))
       }
 
     case i @ InsertIntoHadoopFsRelationCommand(_, sp, _, pc, None, _, _, query, _, _, _, _)
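When sort columns are present, buildRepartition now returns Sort(sortCols.map(SortOrder(_, Ascending)), global = false, repartitionOper), i.e. a per-partition sort stacked on top of the injected shuffle rather than a global ordering, so no extra range exchange is introduced. A rough DataFrame analogue of that plan shape (a sketch only, with made-up column names, reusing the spark session from the sketch above; this is not the rule itself):

// Hash-partition by the bucket column, then order rows within each partition
// by the sort column -- the same shape the rule injects below a bucketed write.
import org.apache.spark.sql.functions.col

val input = spark.range(0, 1000)
  .selectExpr("id AS a", "id % 10 AS b", "id % 7 AS c")

val shaped = input
  .repartition(8, col("b"))          // ~ RepartitionByExpression on bucket columns
  .sortWithinPartitions(col("c"))    // ~ Sort(..., global = false, ...) on sort columns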

sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala

Lines changed: 29 additions & 1 deletion
@@ -29,7 +29,7 @@ import org.apache.spark.sql.{InsertOperationLockUtil, QueryTest, _}
 import org.apache.spark.sql.catalyst.expressions.{AttributeReference, NamedExpression}
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans.physical.HashPartitioning
-import org.apache.spark.sql.execution.CommandResultExec
+import org.apache.spark.sql.execution.{CommandResultExec, SortExec}
 import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper}
 import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
 import org.apache.spark.sql.hive.execution.HiveTempPath
@@ -962,6 +962,28 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
         assert(exprs.map(_.asInstanceOf[NamedExpression].name).mkString(", ") == "a")
       }
     }
+
+    withTable("part_tab") {
+      sql(s"CREATE TABLE part_tab(a int, b int) $dataSource PARTITIONED BY (b)")
+      val df = sql(
+        """
+          |INSERT INTO part_tab
+          |SELECT a,
+          |       b
+          |FROM src_tab distribute by b,
+          |                           cast(rand() * 10 AS int)
+          |""".stripMargin)
+
+      df.collect()
+      val adaptivePlan =
+        df.queryExecution.executedPlan.asInstanceOf[CommandResultExec]
+          .commandPhysicalPlan.asInstanceOf[AdaptiveSparkPlanExec].executedPlan
+      val shuffles = collect(adaptivePlan) { case s: ShuffleExchangeExec => s }
+      assert(shuffles.length == 1)
+      shuffles.foreach { s =>
+        assert(s.outputPartitioning.asInstanceOf[HashPartitioning].expressions.size === 2)
+      }
+    }
   }
 }
@@ -997,6 +1019,12 @@
       } else {
         assert(exprs.map(_.asInstanceOf[NamedExpression].name).mkString(", ") == "a")
       }
+
+      val sorts = collect(adaptivePlan) { case s: SortExec => s }
+      sorts.foreach { s =>
+        assert(s.sortOrder.size === 2)
+        assert(s.sortOrder.flatMap(_.references).map(_.name).mkString(", ") === "c, a")
+      }
     }
   }
 }
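For reference, a hypothetical end-to-end run of the bucketed-table path the suite exercises: with the conf enabled, an INSERT into a table declared CLUSTERED BY (b) SORTED BY (c, a) should plan a single shuffle on the bucket column plus a non-global sort on c and a, matching the "c, a" assertion above. Table and column names are illustrative, and the spark session comes from the earlier sketch:

// Hypothetical setup mirroring the new assertions: one ShuffleExchangeExec on
// the bucket column, and SortExec over the declared sort columns (c, a).
spark.conf.set("spark.sql.repartition.writing.enabled", "true")

spark.sql(
  """
    |CREATE TABLE bucketed_tab(a INT, b INT, c INT)
    |USING parquet
    |CLUSTERED BY (b) SORTED BY (c, a) INTO 8 BUCKETS
    |""".stripMargin)

spark.sql(
  """
    |INSERT INTO bucketed_tab
    |SELECT id AS a, id % 10 AS b, id % 7 AS c
    |FROM range(1000)
    |""".stripMargin)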
