From fb41b90c8914d8045691b9b752d0f86704538f42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cattilapiros=E2=80=9D?= Date: Tue, 2 Mar 2021 16:58:29 -0800 Subject: [PATCH 001/169] [SPARK-34361][K8S] In case of downscaling avoid killing of executors already known by the scheduler backend in the pod allocator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? This PR modifies the POD allocator to use the scheduler backend to get the known executors and remove those from the pending and newly created lists. This is different from the normal `ExecutorAllocationManager`-requested killing of executors, where `spark.dynamicAllocation.executorIdleTimeout` is used. In the problematic case the POD allocator kills executors even though it should only be responsible for terminating unsatisfied POD allocations (new requests for which no POD state has been received yet, and PODs still in the pending state). ### Why are the changes needed? Because there is a race between the executor POD allocator and the cluster scheduler backend. Running several experiments during downscaling, we experienced a lot of killed fresh executors which already had running tasks on them. The pattern in the log was the following (see executor 312 and TID 2079): ``` 21/02/01 15:12:03 INFO ExecutorMonitor: New executor 312 has registered (new total is 138) ... 21/02/01 15:12:03 INFO TaskSetManager: Starting task 247.0 in stage 4.0 (TID 2079, 100.100.18.138, executor 312, partition 247, PROCESS_LOCAL, 8777 bytes) 21/02/01 15:12:03 INFO ExecutorPodsAllocator: Deleting 3 excess pod requests (408,312,307). ... 21/02/01 15:12:04 ERROR TaskSchedulerImpl: Lost executor 312 on 100.100.18.138: The executor with id 312 was deleted by a user or the framework. 21/02/01 15:12:04 INFO TaskSetManager: Task 2079 failed because while it was being computed, its executor exited for a reason unrelated to the task. Not counting this failure towards the maximum number of failures for the task. ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? #### Manually With this change there was no executor lost with a running task on it. ##### With unit tests A new test is added and an existing test is modified to check these cases. Closes #31513 from attilapiros/SPARK-34361.
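For readers following the allocation logic outside the diff, here is a minimal, self-contained sketch of the selection rule this patch introduces (the `PendingRequest` type and helper names are illustrative only, not Spark's real classes): executor requests already known by the scheduler backend are excluded from the deletion candidates, and the remaining ones are removed only after the idle timeout has passed.

```scala
// Hedged sketch of the downscaling selection rule (illustrative names, not Spark's API).
object DownscaleSelectionSketch {
  final case class PendingRequest(execId: Long, createTimeMs: Long)

  def selectPodsToDelete(
      pending: Seq[PendingRequest],
      schedulerKnownExecIds: Set[Long],
      currentTimeMs: Long,
      idleTimeoutMs: Long,
      excess: Int): Seq[Long] = {
    pending
      .filterNot(p => schedulerKnownExecIds.contains(p.execId))    // backend-known execs are treated as running
      .filter(p => currentTimeMs - p.createTimeMs > idleTimeoutMs) // respect the idle timeout
      .take(excess)
      .map(_.execId)
  }

  def main(args: Array[String]): Unit = {
    val pending = Seq(PendingRequest(1L, 0L), PendingRequest(2L, 0L), PendingRequest(3L, 90000L))
    // Executor 1 has already registered with the scheduler backend, so it must survive downscaling.
    println(selectPodsToDelete(pending, Set(1L), currentTimeMs = 100000L, idleTimeoutMs = 60000L, excess = 2))
    // prints: List(2)
  }
}
```

In the log above, executor 312 is exactly the backend-known case that the new rule protects.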
Authored-by: “attilapiros” Signed-off-by: Holden Karau (cherry picked from commit 6c5322de6176726955b4bc941f92ecaa54a7f539) Signed-off-by: Holden Karau --- .../org/apache/spark/deploy/k8s/Config.scala | 3 +- .../cluster/k8s/ExecutorPodsAllocator.scala | 66 +++++--- .../KubernetesClusterSchedulerBackend.scala | 2 +- .../k8s/ExecutorPodsAllocatorSuite.scala | 155 +++++++++++++++++- ...bernetesClusterSchedulerBackendSuite.scala | 2 +- 5 files changed, 204 insertions(+), 24 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index 2eeffd840b7db..fe92fae36c36c 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -237,7 +237,8 @@ private[spark] object Config extends Logging { val KUBERNETES_ALLOCATION_EXECUTOR_TIMEOUT = ConfigBuilder("spark.kubernetes.allocation.executor.timeout") - .doc("Time to wait before considering a pending executor timedout.") + .doc("Time to wait before a newly created executor POD request, which does not reached " + + "the POD pending state yet, considered timedout and will be deleted.") .version("3.1.0") .timeConf(TimeUnit.MILLISECONDS) .checkValue(value => value > 0, "Allocation executor timeout must be a positive time value.") diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala index eb35de8759593..5fc81a6d84273 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala @@ -18,7 +18,7 @@ package org.apache.spark.scheduler.cluster.k8s import java.time.Instant import java.util.concurrent.ConcurrentHashMap -import java.util.concurrent.atomic.{AtomicBoolean, AtomicLong} +import java.util.concurrent.atomic.{AtomicInteger, AtomicLong} import scala.collection.JavaConverters._ import scala.collection.mutable @@ -82,9 +82,14 @@ private[spark] class ExecutorPodsAllocator( // snapshot yet. Mapped to the (ResourceProfile id, timestamp) when they were created. private val newlyCreatedExecutors = mutable.LinkedHashMap.empty[Long, (Int, Long)] + // Executor IDs that have been requested from Kubernetes but have not been detected in any POD + // snapshot yet but already known by the scheduler backend. Mapped to the ResourceProfile id. + private val schedulerKnownNewlyCreatedExecs = mutable.LinkedHashMap.empty[Long, Int] + private val dynamicAllocationEnabled = Utils.isDynamicAllocationEnabled(conf) - private val hasPendingPods = new AtomicBoolean() + // visible for tests + private[k8s] val numOutstandingPods = new AtomicInteger() private var lastSnapshot = ExecutorPodsSnapshot() @@ -93,9 +98,9 @@ private[spark] class ExecutorPodsAllocator( // if they happen to come up before the deletion takes effect. 
@volatile private var deletedExecutorIds = Set.empty[Long] - def start(applicationId: String): Unit = { + def start(applicationId: String, schedulerBackend: KubernetesClusterSchedulerBackend): Unit = { snapshotsStore.addSubscriber(podAllocationDelay) { - onNewSnapshots(applicationId, _) + onNewSnapshots(applicationId, schedulerBackend, _) } } @@ -105,7 +110,7 @@ private[spark] class ExecutorPodsAllocator( totalExpectedExecutorsPerResourceProfileId.put(rp.id, numExecs) } logDebug(s"Set total expected execs to $totalExpectedExecutorsPerResourceProfileId") - if (!hasPendingPods.get()) { + if (numOutstandingPods.get() == 0) { snapshotsStore.notifySubscribers() } } @@ -114,8 +119,19 @@ private[spark] class ExecutorPodsAllocator( private def onNewSnapshots( applicationId: String, + schedulerBackend: KubernetesClusterSchedulerBackend, snapshots: Seq[ExecutorPodsSnapshot]): Unit = { - newlyCreatedExecutors --= snapshots.flatMap(_.executorPods.keys) + val k8sKnownExecIds = snapshots.flatMap(_.executorPods.keys) + newlyCreatedExecutors --= k8sKnownExecIds + schedulerKnownNewlyCreatedExecs --= k8sKnownExecIds + + // transfer the scheduler backend known executor requests from the newlyCreatedExecutors + // to the schedulerKnownNewlyCreatedExecs + val schedulerKnownExecs = schedulerBackend.getExecutorIds().map(_.toLong).toSet + schedulerKnownNewlyCreatedExecs ++= + newlyCreatedExecutors.filterKeys(schedulerKnownExecs.contains(_)).mapValues(_._1) + newlyCreatedExecutors --= schedulerKnownNewlyCreatedExecs.keySet + // For all executors we've created against the API but have not seen in a snapshot // yet - check the current time. If the current time has exceeded some threshold, // assume that the pod was either never created (the API server never properly @@ -164,15 +180,16 @@ private[spark] class ExecutorPodsAllocator( _deletedExecutorIds = _deletedExecutorIds.filter(existingExecs.contains) } + val notDeletedPods = lastSnapshot.executorPods.filterKeys(!_deletedExecutorIds.contains(_)) // Map the pods into per ResourceProfile id so we can check per ResourceProfile, // add a fast path if not using other ResourceProfiles. val rpIdToExecsAndPodState = mutable.HashMap[Int, mutable.HashMap[Long, ExecutorPodState]]() if (totalExpectedExecutorsPerResourceProfileId.size <= 1) { rpIdToExecsAndPodState(ResourceProfile.DEFAULT_RESOURCE_PROFILE_ID) = - mutable.HashMap.empty ++= lastSnapshot.executorPods + mutable.HashMap.empty ++= notDeletedPods } else { - lastSnapshot.executorPods.foreach { case (execId, execPodState) => + notDeletedPods.foreach { case (execId, execPodState) => val rpId = execPodState.pod.getMetadata.getLabels.get(SPARK_RESOURCE_PROFILE_ID_LABEL).toInt val execPods = rpIdToExecsAndPodState.getOrElseUpdate(rpId, mutable.HashMap[Long, ExecutorPodState]()) @@ -190,24 +207,33 @@ private[spark] class ExecutorPodsAllocator( case _ => false } - val currentPendingExecutors = podsForRpId.filter { + val (schedulerKnownPendingExecsForRpId, currentPendingExecutorsForRpId) = podsForRpId.filter { case (_, PodPending(_)) => true case _ => false + }.partition { case (k, _) => + schedulerKnownExecs.contains(k) } // This variable is used later to print some debug logs. It's updated when cleaning up - // excess pod requests, since currentPendingExecutors is immutable. - var knownPendingCount = currentPendingExecutors.size + // excess pod requests, since currentPendingExecutorsForRpId is immutable. 
+ var knownPendingCount = currentPendingExecutorsForRpId.size val newlyCreatedExecutorsForRpId = newlyCreatedExecutors.filter { case (_, (waitingRpId, _)) => rpId == waitingRpId } + val schedulerKnownNewlyCreatedExecsForRpId = + schedulerKnownNewlyCreatedExecs.filter { case (_, waitingRpId) => + rpId == waitingRpId + } + if (podsForRpId.nonEmpty) { logDebug(s"ResourceProfile Id: $rpId " + s"pod allocation status: $currentRunningCount running, " + - s"${currentPendingExecutors.size} pending. " + - s"${newlyCreatedExecutorsForRpId.size} unacknowledged.") + s"${currentPendingExecutorsForRpId.size} unknown pending, " + + s"${schedulerKnownPendingExecsForRpId.size} scheduler backend known pending, " + + s"${newlyCreatedExecutorsForRpId.size} unknown newly created, " + + s"${schedulerKnownNewlyCreatedExecsForRpId.size} scheduler backend known newly created.") } // It's possible that we have outstanding pods that are outdated when dynamic allocation @@ -218,8 +244,9 @@ private[spark] class ExecutorPodsAllocator( // // TODO: with dynamic allocation off, handle edge cases if we end up with more running // executors than expected. - val knownPodCount = currentRunningCount + currentPendingExecutors.size + - newlyCreatedExecutorsForRpId.size + val knownPodCount = currentRunningCount + + currentPendingExecutorsForRpId.size + schedulerKnownPendingExecsForRpId.size + + newlyCreatedExecutorsForRpId.size + schedulerKnownNewlyCreatedExecsForRpId.size if (knownPodCount > targetNum) { val excess = knownPodCount - targetNum @@ -227,7 +254,7 @@ private[spark] class ExecutorPodsAllocator( .filter { case (_, (_, createTime)) => currentTime - createTime > executorIdleTimeout }.keys.take(excess).toList - val knownPendingToDelete = currentPendingExecutors + val knownPendingToDelete = currentPendingExecutorsForRpId .filter(x => isExecutorIdleTimedOut(x._2, currentTime)) .take(excess - newlyCreatedToDelete.size) .map { case (id, _) => id } @@ -245,7 +272,7 @@ private[spark] class ExecutorPodsAllocator( .withLabel(SPARK_ROLE_LABEL, SPARK_POD_EXECUTOR_ROLE) .withLabelIn(SPARK_EXECUTOR_ID_LABEL, toDelete.sorted.map(_.toString): _*) .delete() - newlyCreatedExecutors --= toDelete + newlyCreatedExecutors --= newlyCreatedToDelete knownPendingCount -= knownPendingToDelete.size } } @@ -276,8 +303,9 @@ private[spark] class ExecutorPodsAllocator( deletedExecutorIds = _deletedExecutorIds // Update the flag that helps the setTotalExpectedExecutors() callback avoid triggering this - // update method when not needed. - hasPendingPods.set(totalPendingCount + newlyCreatedExecutors.size > 0) + // update method when not needed. PODs known by the scheduler backend are not counted here as + // they considered running PODs and they should not block upscaling. 
+ numOutstandingPods.set(totalPendingCount + newlyCreatedExecutors.size) } private def requestNewExecutors( diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala index c35a434f83ec1..d58e38ab9794a 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala @@ -93,7 +93,7 @@ private[spark] class KubernetesClusterSchedulerBackend( val initExecs = Map(defaultProfile -> initialExecutors) podAllocator.setTotalExpectedExecutors(initExecs) lifecycleEventHandler.start(this) - podAllocator.start(applicationId()) + podAllocator.start(applicationId(), this) watchEvents.start(applicationId()) pollEvents.start(applicationId()) setUpExecutorConfigMap() diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala index 349bbcd6f7883..55be80ae29c7e 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala @@ -81,6 +81,9 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { @Mock private var executorBuilder: KubernetesExecutorBuilder = _ + @Mock + private var schedulerBackend: KubernetesClusterSchedulerBackend = _ + private var snapshotsStore: DeterministicExecutorPodsSnapshotsStore = _ private var podsAllocatorUnderTest: ExecutorPodsAllocator = _ @@ -96,12 +99,14 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { waitForExecutorPodsClock = new ManualClock(0L) podsAllocatorUnderTest = new ExecutorPodsAllocator( conf, secMgr, executorBuilder, kubernetesClient, snapshotsStore, waitForExecutorPodsClock) - podsAllocatorUnderTest.start(TEST_SPARK_APP_ID) + when(schedulerBackend.getExecutorIds).thenReturn(Seq.empty) + podsAllocatorUnderTest.start(TEST_SPARK_APP_ID, schedulerBackend) } test("Initially request executors in batches. Do not request another batch if the" + " first has not finished.") { podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> (podAllocationSize + 1))) + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5) for (nextId <- 1 to podAllocationSize) { verify(podOperations).create(podWithAttachedContainerForId(nextId)) } @@ -111,28 +116,34 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { test("Request executors in batches. 
Allow another batch to be requested if" + " all pending executors start running.") { podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> (podAllocationSize + 1))) + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5) for (execId <- 1 until podAllocationSize) { snapshotsStore.updatePod(runningExecutor(execId)) } snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 1) verify(podOperations, never()).create(podWithAttachedContainerForId(podAllocationSize + 1)) snapshotsStore.updatePod(runningExecutor(podAllocationSize)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 1) verify(podOperations).create(podWithAttachedContainerForId(podAllocationSize + 1)) snapshotsStore.updatePod(runningExecutor(podAllocationSize)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 1) verify(podOperations, times(podAllocationSize + 1)).create(any(classOf[Pod])) } test("When a current batch reaches error states immediately, re-request" + " them on the next batch.") { podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> podAllocationSize)) + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5) for (execId <- 1 until podAllocationSize) { snapshotsStore.updatePod(runningExecutor(execId)) } val failedPod = failedExecutorWithoutDeletion(podAllocationSize) snapshotsStore.updatePod(failedPod) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 1) verify(podOperations).create(podWithAttachedContainerForId(podAllocationSize + 1)) } @@ -148,9 +159,11 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { .withLabelIn(SPARK_EXECUTOR_ID_LABEL, "1")) .thenReturn(labeledPods) podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 1)) + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 1) verify(podOperations).create(podWithAttachedContainerForId(1)) waitForExecutorPodsClock.setTime(podCreationTimeout + 1) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 1) verify(labeledPods).delete() verify(podOperations).create(podWithAttachedContainerForId(2)) } @@ -174,17 +187,20 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { // Target 1 executor, make sure it's requested, even with an empty initial snapshot. podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 1)) + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 1) verify(podOperations).create(podWithAttachedContainerForId(1)) // Mark executor as running, verify that subsequent allocation cycle is a no-op. snapshotsStore.updatePod(runningExecutor(1)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 0) verify(podOperations, times(1)).create(any()) verify(podOperations, never()).delete() // Request 3 more executors, make sure all are requested. 
podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 4)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 3) verify(podOperations).create(podWithAttachedContainerForId(2)) verify(podOperations).create(podWithAttachedContainerForId(3)) verify(podOperations).create(podWithAttachedContainerForId(4)) @@ -193,6 +209,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { snapshotsStore.updatePod(runningExecutor(2)) snapshotsStore.updatePod(pendingExecutor(3)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 2) verify(podOperations, times(4)).create(any()) verify(podOperations, never()).delete() @@ -200,6 +217,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { waitForExecutorPodsClock.advance(executorIdleTimeout * 2) podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 1)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 0) verify(podOperations, times(4)).create(any()) verify(podOperations).withLabelIn(SPARK_EXECUTOR_ID_LABEL, "3", "4") verify(podOperations).delete() @@ -212,6 +230,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { snapshotsStore.updatePod(deletedExecutor(4)) snapshotsStore.removeDeletedExecutors() snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 0) assert(!podsAllocatorUnderTest.isDeleted("3")) assert(!podsAllocatorUnderTest.isDeleted("4")) } @@ -279,6 +298,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { waitForExecutorPodsClock.setTime(startTime) podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 5)) + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5) verify(podOperations).create(podWithAttachedContainerForId(1)) verify(podOperations).create(podWithAttachedContainerForId(2)) verify(podOperations).create(podWithAttachedContainerForId(3)) @@ -292,16 +312,139 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { // Newly created executors (both acknowledged and not) are protected by executorIdleTimeout podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 0)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5) verify(podOperations, never()).withLabelIn(SPARK_EXECUTOR_ID_LABEL, "1", "2", "3", "4", "5") verify(podOperations, never()).delete() // Newly created executors (both acknowledged and not) are cleaned up. 
waitForExecutorPodsClock.advance(executorIdleTimeout * 2) + when(schedulerBackend.getExecutorIds).thenReturn(Seq("1", "3", "4")) snapshotsStore.notifySubscribers() - verify(podOperations).withLabelIn(SPARK_EXECUTOR_ID_LABEL, "1", "2", "3", "4", "5") + // SPARK-34361: even as 1, 3 and 4 are not timed out as they are considered as known PODs so + // this is why they are not counted into the outstanding PODs and /they are not removed even + // though executor 1 is still in pending state and executor 3 and 4 are new request without + // any state reported by kubernetes and all the three are already timed out + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 0) + verify(podOperations).withLabelIn(SPARK_EXECUTOR_ID_LABEL, "2", "5") verify(podOperations).delete() } + /** + * This test covers some downscaling and upscaling of dynamic allocation on kubernetes + * along with multiple resource profiles (default and rp) when some executors + * already know by the scheduler backend. + * + * Legend: + * + * N-: newly created not known by the scheduler backend + * N+: newly created known by the scheduler backend + * P- / P+ : pending (not know / known) by the scheduler backend + * D: deleted + * | default || rp | expected + * | || | outstanding + * | 1 | 2 | 3 || 4 | 5 | 6 | 7 | PODs + * ========================================================================================== + * 0) setTotalExpectedExecs with | N- | N- | N- || N- | N- | N- | N- | + * default->3, ro->4 | | | || | | | | 7 + * ------------------------------------------------------------------------------------------ + * 1) make 1 from each rp | N+ | N- | N- || N+ | N- | N- | N- | + * known by backend | | | || | | | | 5 + * ------------------------------------------------------------------------------------------- + * 2) some more backend known + pending | N+ | P+ | P- || N+ | P+ | P- | N- | 3 + * ------------------------------------------------------------------------------------------- + * 3) advance time with idle timeout | | | || | | | | + * setTotalExpectedExecs with | N+ | P+ | D || N+ | P+ | D | D | 0 + * default->1, rp->1 | | | || | | | | + * ------------------------------------------------------------------------------------------- + * 4) setTotalExpectedExecs with | N+ | P+ | D || N+ | P+ | D | D | 0 and + * default->2, rp->2 | | | || | | | | no new POD req. 
+ * =========================================================================================== + * + * 5) setTotalExpectedExecs with default -> 3, rp -> 3 which will lead to creation of the new + * PODs: 8 and 9 + */ + test("SPARK-34361: scheduler backend known pods with multiple resource profiles at downscaling") { + when(podOperations + .withField("status.phase", "Pending")) + .thenReturn(podOperations) + when(podOperations + .withLabel(SPARK_APP_ID_LABEL, TEST_SPARK_APP_ID)) + .thenReturn(podOperations) + when(podOperations + .withLabel(SPARK_ROLE_LABEL, SPARK_POD_EXECUTOR_ROLE)) + .thenReturn(podOperations) + when(podOperations + .withLabelIn(meq(SPARK_EXECUTOR_ID_LABEL), any())) + .thenReturn(podOperations) + + val startTime = Instant.now.toEpochMilli + waitForExecutorPodsClock.setTime(startTime) + + val rpb = new ResourceProfileBuilder() + val ereq = new ExecutorResourceRequests() + val treq = new TaskResourceRequests() + ereq.cores(4).memory("2g") + treq.cpus(2) + rpb.require(ereq).require(treq) + val rp = rpb.build() + + // 0) request 3 PODs for the default and 4 PODs for the other resource profile + podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 3, rp -> 4)) + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 7) + verify(podOperations).create(podWithAttachedContainerForId(1, defaultProfile.id)) + verify(podOperations).create(podWithAttachedContainerForId(2, defaultProfile.id)) + verify(podOperations).create(podWithAttachedContainerForId(3, defaultProfile.id)) + verify(podOperations).create(podWithAttachedContainerForId(4, rp.id)) + verify(podOperations).create(podWithAttachedContainerForId(5, rp.id)) + verify(podOperations).create(podWithAttachedContainerForId(6, rp.id)) + verify(podOperations).create(podWithAttachedContainerForId(7, rp.id)) + + // 1) make 1 POD known by the scheduler backend for each resource profile + when(schedulerBackend.getExecutorIds).thenReturn(Seq("1", "4")) + snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5, + "scheduler backend known PODs are not outstanding") + verify(podOperations, times(7)).create(any()) + + // 2) make 1 extra POD known by the scheduler backend for each resource profile + // and make some to pending + when(schedulerBackend.getExecutorIds).thenReturn(Seq("1", "2", "4", "5")) + snapshotsStore.updatePod(pendingExecutor(2, defaultProfile.id)) + snapshotsStore.updatePod(pendingExecutor(3, defaultProfile.id)) + snapshotsStore.updatePod(pendingExecutor(5, rp.id)) + snapshotsStore.updatePod(pendingExecutor(6, rp.id)) + snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 3) + verify(podOperations, times(7)).create(any()) + + // 3) downscale to 1 POD for default and 1 POD for the other resource profile + waitForExecutorPodsClock.advance(executorIdleTimeout * 2) + podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 1, rp -> 1)) + snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 0) + verify(podOperations, times(7)).create(any()) + verify(podOperations, times(2)).delete() + assert(podsAllocatorUnderTest.isDeleted("3")) + assert(podsAllocatorUnderTest.isDeleted("6")) + assert(podsAllocatorUnderTest.isDeleted("7")) + + // 4) upscale to 2 PODs for default and 2 for the other resource profile but as there is still + // 2 PODs known by the scheduler backend there must be no new POD requested to be created + 
podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 2, rp -> 2)) + snapshotsStore.notifySubscribers() + verify(podOperations, times(7)).create(any()) + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 0) + verify(podOperations, times(7)).create(any()) + + // 5) requesting 1 more executor for each resource + podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 3, rp -> 3)) + snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 2) + verify(podOperations, times(9)).create(any()) + verify(podOperations).create(podWithAttachedContainerForId(8, defaultProfile.id)) + verify(podOperations).create(podWithAttachedContainerForId(9, rp.id)) + } + test("SPARK-33288: multiple resource profiles") { when(podOperations .withField("status.phase", "Pending")) @@ -330,6 +473,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { // Target 1 executor for default profile, 2 for other profile, // make sure it's requested, even with an empty initial snapshot. podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 1, rp -> 2)) + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 3) verify(podOperations).create(podWithAttachedContainerForId(1, defaultProfile.id)) verify(podOperations).create(podWithAttachedContainerForId(2, rp.id)) verify(podOperations).create(podWithAttachedContainerForId(3, rp.id)) @@ -339,6 +483,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { snapshotsStore.updatePod(runningExecutor(2, rp.id)) snapshotsStore.updatePod(runningExecutor(3, rp.id)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 0) verify(podOperations, times(3)).create(any()) verify(podOperations, never()).delete() @@ -346,6 +491,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { // make sure all are requested. 
podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 4, rp -> 3)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 4) verify(podOperations).create(podWithAttachedContainerForId(4, defaultProfile.id)) verify(podOperations).create(podWithAttachedContainerForId(5, defaultProfile.id)) verify(podOperations).create(podWithAttachedContainerForId(6, defaultProfile.id)) @@ -356,6 +502,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { snapshotsStore.updatePod(pendingExecutor(5, defaultProfile.id)) snapshotsStore.updatePod(pendingExecutor(7, rp.id)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 3) verify(podOperations, times(7)).create(any()) verify(podOperations, never()).delete() @@ -364,6 +511,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { waitForExecutorPodsClock.advance(executorIdleTimeout * 2) podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 1, rp -> 1)) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 0) verify(podOperations, times(7)).create(any()) verify(podOperations).withLabelIn(SPARK_EXECUTOR_ID_LABEL, "5", "6") verify(podOperations).withLabelIn(SPARK_EXECUTOR_ID_LABEL, "7") @@ -379,6 +527,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { snapshotsStore.updatePod(deletedExecutor(7)) snapshotsStore.removeDeletedExecutors() snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 0) assert(!podsAllocatorUnderTest.isDeleted("5")) assert(!podsAllocatorUnderTest.isDeleted("6")) assert(!podsAllocatorUnderTest.isDeleted("7")) @@ -399,6 +548,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { .thenReturn(podOperations) podsAllocatorUnderTest.setTotalExpectedExecutors(Map(defaultProfile -> 6)) + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 5) // Initial request of pods verify(podOperations).create(podWithAttachedContainerForId(1)) verify(podOperations).create(podWithAttachedContainerForId(2)) @@ -414,6 +564,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { // We move forward one allocation cycle waitForExecutorPodsClock.setTime(podAllocationDelay + 1) snapshotsStore.notifySubscribers() + assert(podsAllocatorUnderTest.numOutstandingPods.get() == 2) // We request pod 6 verify(podOperations).create(podWithAttachedContainerForId(6)) } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala index 861d41cc50ac5..e4a73e24c3921 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala @@ -127,7 +127,7 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn test("Start all components") { schedulerBackendUnderTest.start() verify(podAllocator).setTotalExpectedExecutors(Map(defaultProfile -> 3)) - verify(podAllocator).start(TEST_SPARK_APP_ID) + verify(podAllocator).start(TEST_SPARK_APP_ID, schedulerBackendUnderTest) 
verify(lifecycleEventHandler).start(schedulerBackendUnderTest) verify(watchEvents).start(TEST_SPARK_APP_ID) verify(pollEvents).start(TEST_SPARK_APP_ID) From b8b6f884d28af05be21cc65080a7ed7d45b103b5 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 3 Apr 2021 00:00:17 -0700 Subject: [PATCH 002/169] [SPARK-34948][K8S] Add ownerReference to executor configmap to fix leakages This PR aims to add `ownerReference` to the executor ConfigMap to fix leakage. SPARK-30985 maintains the executor config map explicitly inside Spark. However, this config map can be leaked when Spark drivers die accidentally or are killed by K8s. We need to add `ownerReference` so that K8s garbage-collects these ConfigMaps automatically. The number of ConfigMaps is limited by the resource quota, so the leaked ConfigMaps currently cause Spark job submission failures. No. Pass the CIs and check manually. K8s IT is tested manually. ``` KubernetesSuite: - Run SparkPi with no resources - Run SparkPi with a very long application name. - Use SparkLauncher.NO_RESOURCE - Run SparkPi with a master URL without a scheme. - Run SparkPi with an argument. - Run SparkPi with custom labels, annotations, and environment variables. - All pods have the same service account by default - Run extraJVMOptions check on driver - Run SparkRemoteFileTest using a remote data file - Verify logging configuration is picked from the provided SPARK_CONF_DIR/log4j.properties - Run SparkPi with env and mount secrets. - Run PySpark on simple pi.py example - Run PySpark to test a pyfiles example - Run PySpark with memory customization - Run in client mode. - Start pod creation from template - PVs with local storage - Launcher client dependencies - SPARK-33615: Launcher client archives - SPARK-33748: Launcher python client respecting PYSPARK_PYTHON - SPARK-33748: Launcher python client respecting spark.pyspark.python and spark.pyspark.driver.python - Launcher python client dependencies using a zip file - Test basic decommissioning - Test basic decommissioning with shuffle cleanup - Test decommissioning with dynamic allocation & shuffle cleanups - Test decommissioning timeouts - Run SparkR on simple dataframe.R example Run completed in 19 minutes, 2 seconds. Total number of tests run: 27 Suites: completed 2, aborted 0 Tests: succeeded 27, failed 0, canceled 0, ignored 0, pending 0 All tests passed. ``` **BEFORE** ``` $ k get cm spark-exec-450b417895b3b2c7-conf-map -oyaml | grep ownerReferences ``` **AFTER** ``` $ k get cm spark-exec-bb37a27895b1c26c-conf-map -oyaml | grep ownerReferences f:ownerReferences: ``` Closes #32042 from dongjoon-hyun/SPARK-34948.
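For illustration, a hedged sketch of what attaching the owner reference amounts to with the fabric8 client builders (an assumption about the fabric8 fluent API, not the exact `KubernetesUtils.addOwnerReference` helper used in the diff):

```scala
import io.fabric8.kubernetes.api.model.{ConfigMap, ConfigMapBuilder, OwnerReferenceBuilder, Pod}

// Sketch only: mark the driver pod as the controller/owner of the executor ConfigMap so that
// Kubernetes garbage-collects the ConfigMap when the driver pod is deleted.
object OwnerReferenceSketch {
  def withDriverOwner(configMap: ConfigMap, driverPod: Pod): ConfigMap = {
    val ownerRef = new OwnerReferenceBuilder()
      .withApiVersion(driverPod.getApiVersion)
      .withKind(driverPod.getKind)
      .withName(driverPod.getMetadata.getName)
      .withUid(driverPod.getMetadata.getUid)
      .withController(true)
      .build()
    new ConfigMapBuilder(configMap)
      .editOrNewMetadata()
        .addToOwnerReferences(ownerRef)
      .endMetadata()
      .build()
  }
}
```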
Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit a42dc93a2abf9490d68146b3586aec7fe2f9c102) Signed-off-by: Dongjoon Hyun --- .../scheduler/cluster/k8s/ExecutorPodsAllocator.scala | 2 +- .../cluster/k8s/KubernetesClusterSchedulerBackend.scala | 7 +++++-- .../k8s/KubernetesClusterSchedulerBackendSuite.scala | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala index 5fc81a6d84273..5ebd172f7dec6 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala @@ -70,7 +70,7 @@ private[spark] class ExecutorPodsAllocator( private val shouldDeleteExecutors = conf.get(KUBERNETES_DELETE_EXECUTORS) - private val driverPod = kubernetesDriverPodName + val driverPod = kubernetesDriverPodName .map(name => Option(kubernetesClient.pods() .withName(name) .get()) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala index d58e38ab9794a..887afca05cb1c 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackend.scala @@ -20,11 +20,13 @@ import java.util.concurrent.{ScheduledExecutorService, TimeUnit} import scala.concurrent.Future +import io.fabric8.kubernetes.api.model.Pod import io.fabric8.kubernetes.client.KubernetesClient import org.apache.spark.SparkContext import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ +import org.apache.spark.deploy.k8s.KubernetesUtils import org.apache.spark.deploy.k8s.submit.KubernetesClientUtils import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.config.SCHEDULER_MIN_REGISTERED_RESOURCES_RATIO @@ -67,13 +69,14 @@ private[spark] class KubernetesClusterSchedulerBackend( } } - private def setUpExecutorConfigMap(): Unit = { + private def setUpExecutorConfigMap(driverPod: Option[Pod]): Unit = { val configMapName = KubernetesClientUtils.configMapNameExecutor val confFilesMap = KubernetesClientUtils .buildSparkConfDirFilesMap(configMapName, conf, Map.empty) val labels = Map(SPARK_APP_ID_LABEL -> applicationId(), SPARK_ROLE_LABEL -> SPARK_POD_EXECUTOR_ROLE) val configMap = KubernetesClientUtils.buildConfigMap(configMapName, confFilesMap, labels) + KubernetesUtils.addOwnerReference(driverPod.orNull, Seq(configMap)) kubernetesClient.configMaps().create(configMap) } @@ -96,7 +99,7 @@ private[spark] class KubernetesClusterSchedulerBackend( podAllocator.start(applicationId(), this) watchEvents.start(applicationId()) pollEvents.start(applicationId()) - setUpExecutorConfigMap() + setUpExecutorConfigMap(podAllocator.driverPod) } override def stop(): Unit = { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala 
b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala index e4a73e24c3921..3573ffc07d2c8 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/KubernetesClusterSchedulerBackendSuite.scala @@ -112,6 +112,7 @@ class KubernetesClusterSchedulerBackendSuite extends SparkFunSuite with BeforeAn .thenReturn(driverEndpointRef) when(kubernetesClient.pods()).thenReturn(podOperations) when(kubernetesClient.configMaps()).thenReturn(configMapsOperations) + when(podAllocator.driverPod).thenReturn(None) schedulerBackendUnderTest = new KubernetesClusterSchedulerBackend( taskScheduler, sc, From e852a3c88d12ca96057f0f316940f26271ad1b2c Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Fri, 21 May 2021 08:27:49 -0700 Subject: [PATCH 003/169] [SPARK-35482][K8S] Use `spark.blockManager.port` not the wrong `spark.blockmanager.port` in BasicExecutorFeatureStep ### What changes were proposed in this pull request? Most Spark conf keys are case-sensitive, including `spark.blockManager.port`, so we cannot get the correct port number with `spark.blockmanager.port`. This PR changes the wrong key to `spark.blockManager.port` in `BasicExecutorFeatureStep`. This PR also ensures a fast fail when the port value is invalid for executor containers. When 0 is specified (it is valid as a random port, but invalid as a k8s request), it should not be put in the `containerPort` field of the executor pod description. We do not expect executor pods to continuously fail to create because of invalid requests. ### Why are the changes needed? bugfix ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? new tests Closes #32621 from yaooqinn/SPARK-35482.
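As a standalone illustration of the port rule this patch enforces (a sketch only; the real check lives in `BasicExecutorFeatureStep`): 0 means "let the OS pick a random port" and must not be emitted as a `containerPort`, while any explicit value has to be a valid non-privileged port.

```scala
// Minimal sketch of the block manager port rule (not Spark's actual classes).
object BlockManagerPortRule {
  def validate(port: Int): Unit =
    require(port == 0 || (1024 <= port && port < 65536),
      "port number must be 0 or in [1024, 65535]")

  // 0 is not a valid Kubernetes containerPort request, so it is simply left unmounted.
  def containerPorts(port: Int): Seq[Int] =
    if (port != 0) Seq(port) else Nil

  def main(args: Array[String]): Unit = {
    validate(0)
    validate(7079)
    println(containerPorts(0))    // List(), no containerPort is requested
    println(containerPorts(7079)) // List(7079)
  }
}
```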
Authored-by: Kent Yao Signed-off-by: Dongjoon Hyun (cherry picked from commit d957426351149dd1b4e1106d1230f395934f61d2) Signed-off-by: Dongjoon Hyun --- .../features/BasicExecutorFeatureStep.scala | 24 +++++++++------ .../BasicExecutorFeatureStepSuite.scala | 29 +++++++++++++++++++ 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala index 250dd8238d9ea..a0a17cecf9a8e 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStep.scala @@ -44,7 +44,10 @@ private[spark] class BasicExecutorFeatureStep( .getOrElse(throw new SparkException("Must specify the executor container image")) private val blockManagerPort = kubernetesConf .sparkConf - .getInt("spark.blockmanager.port", DEFAULT_BLOCKMANAGER_PORT) + .getInt(BLOCK_MANAGER_PORT.key, DEFAULT_BLOCKMANAGER_PORT) + + require(blockManagerPort == 0 || (1024 <= blockManagerPort && blockManagerPort < 65536), + "port number must be 0 or in [1024, 65535]") private val executorPodNamePrefix = kubernetesConf.resourceNamePrefix @@ -171,14 +174,17 @@ private[spark] class BasicExecutorFeatureStep( .replaceAll(ENV_EXECUTOR_ID, kubernetesConf.executorId)) } - val requiredPorts = Seq( - (BLOCK_MANAGER_PORT_NAME, blockManagerPort)) - .map { case (name, port) => - new ContainerPortBuilder() - .withName(name) - .withContainerPort(port) - .build() - } + // 0 is invalid as kubernetes containerPort request, we shall leave it unmounted + val requiredPorts = if (blockManagerPort != 0) { + Seq( + (BLOCK_MANAGER_PORT_NAME, blockManagerPort)) + .map { case (name, port) => + new ContainerPortBuilder() + .withName(name) + .withContainerPort(port) + .build() + } + } else Nil if (!isDefaultProfile) { if (pod.container != null && pod.container.getResources() != null) { diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala index 66ece81aca646..df4693f313917 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicExecutorFeatureStepSuite.scala @@ -330,6 +330,35 @@ class BasicExecutorFeatureStepSuite extends SparkFunSuite with BeforeAndAfter { SecretVolumeUtils.podHasVolume(podConfigured.pod, SPARK_CONF_VOLUME_EXEC) } + test("SPARK-35482: user correct block manager port for executor pods") { + try { + val initPod = SparkPod.initialPod() + val sm = new SecurityManager(baseConf) + val step1 = + new BasicExecutorFeatureStep(newExecutorConf(), sm, defaultProfile) + val containerPort1 = step1.configurePod(initPod).container.getPorts.get(0) + assert(containerPort1.getContainerPort === DEFAULT_BLOCKMANAGER_PORT, + s"should use port no. 
$DEFAULT_BLOCKMANAGER_PORT as default") + + baseConf.set(BLOCK_MANAGER_PORT, 12345) + val step2 = new BasicExecutorFeatureStep(newExecutorConf(), sm, defaultProfile) + val containerPort2 = step2.configurePod(initPod).container.getPorts.get(0) + assert(containerPort2.getContainerPort === 12345) + + baseConf.set(BLOCK_MANAGER_PORT, 1000) + val e = intercept[IllegalArgumentException] { + new BasicExecutorFeatureStep(newExecutorConf(), sm, defaultProfile) + } + assert(e.getMessage.contains("port number must be 0 or in [1024, 65535]")) + + baseConf.set(BLOCK_MANAGER_PORT, 0) + val step3 = new BasicExecutorFeatureStep(newExecutorConf(), sm, defaultProfile) + assert(step3.configurePod(initPod).container.getPorts.isEmpty, "random port") + } finally { + baseConf.remove(BLOCK_MANAGER_PORT) + } + } + // There is always exactly one controller reference, and it points to the driver pod. private def checkOwnerReferences(executor: Pod, driverPodUid: String): Unit = { assert(executor.getMetadata.getOwnerReferences.size() === 1) From 5625c45ec9a60da763da4fb7ebdbf953ecb9abac Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Sun, 23 May 2021 08:07:57 -0700 Subject: [PATCH 004/169] [SPARK-35493][K8S] make `spark.blockManager.port` fallback for `spark.driver.blockManager.port` as same as other cluster managers ### What changes were proposed in this pull request? `spark.blockManager.port` does not work for k8s driver pods now, we should make it work as other cluster managers. ### Why are the changes needed? `spark.blockManager.port` should be able to work for spark driver pod ### Does this PR introduce _any_ user-facing change? yes, `spark.blockManager.port` will be respect iff it is present && `spark.driver.blockManager.port` is absent ### How was this patch tested? new tests Closes #32639 from yaooqinn/SPARK-35493. 
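To make the intended resolution order concrete, here is a small hedged sketch (a plain `Map` stands in for `SparkConf`, and the default of 7079 is assumed from Spark's `DEFAULT_BLOCKMANAGER_PORT`): `spark.driver.blockManager.port` wins if present, otherwise `spark.blockManager.port`, otherwise the default.

```scala
// Sketch of the fallback chain for the driver's block manager port (illustrative only).
object DriverBlockManagerPortFallback {
  val DefaultBlockManagerPort = 7079 // assumed default

  def resolve(conf: Map[String, Int]): Int =
    conf.getOrElse("spark.driver.blockManager.port",
      conf.getOrElse("spark.blockManager.port", DefaultBlockManagerPort))

  def main(args: Array[String]): Unit = {
    println(resolve(Map("spark.blockManager.port" -> 1234))) // 1234, the fallback applies
    println(resolve(Map("spark.blockManager.port" -> 1234,
      "spark.driver.blockManager.port" -> 1235)))            // 1235, the driver-specific key wins
    println(resolve(Map.empty))                              // 7079
  }
}
```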
Authored-by: Kent Yao Signed-off-by: Dongjoon Hyun (cherry picked from commit 96b0548ab6d5fe36833812f7b6424c984f75c6dd) Signed-off-by: Dongjoon Hyun --- .../k8s/features/BasicDriverFeatureStep.scala | 2 +- .../BasicDriverFeatureStepSuite.scala | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala index cec8272beed57..7f34f30d59982 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStep.scala @@ -96,7 +96,7 @@ private[spark] class BasicDriverFeatureStep(conf: KubernetesDriverConf) val driverPort = conf.sparkConf.getInt(DRIVER_PORT.key, DEFAULT_DRIVER_PORT) val driverBlockManagerPort = conf.sparkConf.getInt( DRIVER_BLOCK_MANAGER_PORT.key, - DEFAULT_BLOCKMANAGER_PORT + conf.sparkConf.getInt(BLOCK_MANAGER_PORT.key, DEFAULT_BLOCKMANAGER_PORT) ) val driverUIPort = SparkUI.getUIPort(conf.sparkConf) val driverContainer = new ContainerBuilder(pod.container) diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala index 858b4f1494b8e..f0843225ea6e2 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/deploy/k8s/features/BasicDriverFeatureStepSuite.scala @@ -213,6 +213,25 @@ class BasicDriverFeatureStepSuite extends SparkFunSuite { } } + test("SPARK-35493: make spark.blockManager.port be able to be fallen back to in driver pod") { + val initPod = SparkPod.initialPod() + val sparkConf = new SparkConf() + .set(CONTAINER_IMAGE, "spark-driver:latest") + .set(BLOCK_MANAGER_PORT, 1234) + val driverConf1 = KubernetesTestConf.createDriverConf(sparkConf) + val pod1 = new BasicDriverFeatureStep(driverConf1).configurePod(initPod) + val portMap1 = + pod1.container.getPorts.asScala.map { cp => (cp.getName -> cp.getContainerPort) }.toMap + assert(portMap1(BLOCK_MANAGER_PORT_NAME) === 1234, s"fallback to $BLOCK_MANAGER_PORT.key") + + val driverConf2 = + KubernetesTestConf.createDriverConf(sparkConf.set(DRIVER_BLOCK_MANAGER_PORT, 1235)) + val pod2 = new BasicDriverFeatureStep(driverConf2).configurePod(initPod) + val portMap2 = + pod2.container.getPorts.asScala.map { cp => (cp.getName -> cp.getContainerPort) }.toMap + assert(portMap2(BLOCK_MANAGER_PORT_NAME) === 1235) + } + def containerPort(name: String, portNumber: Int): ContainerPort = new ContainerPortBuilder() .withName(name) From 7c3e41121a921d0ff54dab1e9422adccdac4b9aa Mon Sep 17 00:00:00 2001 From: Chris Wu Date: Fri, 4 Jun 2021 06:59:49 -0700 Subject: [PATCH 005/169] [SPARK-32975][K8S] Add config for driver readiness timeout before executors start ### What changes were proposed in this pull request? Add a new config that controls the timeout of waiting for driver pod's readiness before allocating executor pods. This wait only happens once on application start. ### Why are the changes needed? 
The driver's headless service can be resolved by DNS only after the driver pod is ready. If the executor tries to connect to the headless service before the driver pod is ready, it will hit an UnknownHostException and get into an error state but will not be restarted. **This case usually happens when the driver pod has sidecar containers that have not finished being created when the executors start.** So basically there is a race condition. This issue can be mitigated by tweaking this config. ### Does this PR introduce _any_ user-facing change? A new config `spark.kubernetes.allocation.driver.readinessTimeout` is added. ### How was this patch tested? Existing tests. Closes #32752 from cchriswu/SPARK-32975-fix. Lead-authored-by: Chris Wu Co-authored-by: Chris Wu Signed-off-by: Dongjoon Hyun (cherry picked from commit 497c80a1ad7fdd605b75c8a6601fce35c7449578) Signed-off-by: Dongjoon Hyun --- .../scala/org/apache/spark/deploy/k8s/Config.scala | 11 +++++++++++ .../scheduler/cluster/k8s/ExecutorPodsAllocator.scala | 11 +++++++++++ .../cluster/k8s/ExecutorPodsAllocatorSuite.scala | 1 + 3 files changed, 23 insertions(+) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala index fe92fae36c36c..de9d15850e1c8 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/Config.scala @@ -235,6 +235,17 @@ private[spark] object Config extends Logging { .checkValue(value => value > 0, "Allocation batch delay must be a positive time value.") .createWithDefaultString("1s") + val KUBERNETES_ALLOCATION_DRIVER_READINESS_TIMEOUT = + ConfigBuilder("spark.kubernetes.allocation.driver.readinessTimeout") + .doc("Time to wait for driver pod to get ready before creating executor pods. This wait " + + "only happens on application start.
If timeout happens, executor pods will still be " + + "created.") + .version("3.1.3") + .timeConf(TimeUnit.SECONDS) + .checkValue(value => value > 0, "Allocation driver readiness timeout must be a positive " + + "time value.") + .createWithDefaultString("1s") + val KUBERNETES_ALLOCATION_EXECUTOR_TIMEOUT = ConfigBuilder("spark.kubernetes.allocation.executor.timeout") .doc("Time to wait before a newly created executor POD request, which does not reached " + diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala index 5ebd172f7dec6..358058e27a049 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala @@ -18,6 +18,7 @@ package org.apache.spark.scheduler.cluster.k8s import java.time.Instant import java.util.concurrent.ConcurrentHashMap +import java.util.concurrent.TimeUnit import java.util.concurrent.atomic.{AtomicInteger, AtomicLong} import scala.collection.JavaConverters._ @@ -61,6 +62,8 @@ private[spark] class ExecutorPodsAllocator( podAllocationDelay * 5, conf.get(KUBERNETES_ALLOCATION_EXECUTOR_TIMEOUT)) + private val driverPodReadinessTimeout = conf.get(KUBERNETES_ALLOCATION_DRIVER_READINESS_TIMEOUT) + private val executorIdleTimeout = conf.get(DYN_ALLOCATION_EXECUTOR_IDLE_TIMEOUT) * 1000 private val namespace = conf.get(KUBERNETES_NAMESPACE) @@ -99,6 +102,14 @@ private[spark] class ExecutorPodsAllocator( @volatile private var deletedExecutorIds = Set.empty[Long] def start(applicationId: String, schedulerBackend: KubernetesClusterSchedulerBackend): Unit = { + // Wait until the driver pod is ready before starting executors, as the headless service won't + // be resolvable by DNS until the driver pod is ready. 
+ Utils.tryLogNonFatalError { + kubernetesClient + .pods() + .withName(kubernetesDriverPodName.get) + .waitUntilReady(driverPodReadinessTimeout, TimeUnit.SECONDS) + } snapshotsStore.addSubscriber(podAllocationDelay) { onNewSnapshots(applicationId, schedulerBackend, _) } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala index 55be80ae29c7e..ed6ca2a68f9a7 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala @@ -93,6 +93,7 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { when(kubernetesClient.pods()).thenReturn(podOperations) when(podOperations.withName(driverPodName)).thenReturn(driverPodOperations) when(driverPodOperations.get).thenReturn(driverPod) + when(driverPodOperations.waitUntilReady(any(), any())).thenReturn(driverPod) when(executorBuilder.buildFromFeatures(any(classOf[KubernetesExecutorConf]), meq(secMgr), meq(kubernetesClient), any(classOf[ResourceProfile]))).thenAnswer(executorPodAnswer()) snapshotsStore = new DeterministicExecutorPodsSnapshotsStore() From 38ec10632af272ef81e7c0359a5100e756aebcaf Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Thu, 10 Jun 2021 13:39:39 -0700 Subject: [PATCH 006/169] [SPARK-32975][K8S][FOLLOWUP] Avoid None.get exception ### What changes were proposed in this pull request? A follow-up for SPARK-32975 to avoid unexpected the `None.get` exception Run SparkPi with docker desktop, as podName is an option, we will got ```logtalk 21/06/09 01:09:12 ERROR Utils: Uncaught exception in thread main java.util.NoSuchElementException: None.get at scala.None$.get(Option.scala:529) at scala.None$.get(Option.scala:527) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.$anonfun$start$1(ExecutorPodsAllocator.scala:110) at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1417) at org.apache.spark.scheduler.cluster.k8s.ExecutorPodsAllocator.start(ExecutorPodsAllocator.scala:111) at org.apache.spark.scheduler.cluster.k8s.KubernetesClusterSchedulerBackend.start(KubernetesClusterSchedulerBackend.scala:99) at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:220) at org.apache.spark.SparkContext.(SparkContext.scala:581) at org.apache.spark.SparkContext$.getOrCreate(SparkContext.scala:2686) at org.apache.spark.sql.SparkSession$Builder.$anonfun$getOrCreate$2(SparkSession.scala:948) at scala.Option.getOrElse(Option.scala:189) at org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:942) at org.apache.spark.examples.SparkPi$.main(SparkPi.scala:30) at org.apache.spark.examples.SparkPi.main(SparkPi.scala) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52) at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:955) at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:180) at 
org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:203) at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:90) at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1043) at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1052) at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) ``` ### Why are the changes needed? fix a regression ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? Manual. Closes #32830 from yaooqinn/SPARK-32975. Authored-by: Kent Yao Signed-off-by: Dongjoon Hyun (cherry picked from commit b4b78ce26567ce7ab83d47ce3b6af87c866bcacb) Signed-off-by: Dongjoon Hyun --- .../cluster/k8s/ExecutorPodsAllocator.scala | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala index 358058e27a049..5429e36dda5ec 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala @@ -102,13 +102,15 @@ private[spark] class ExecutorPodsAllocator( @volatile private var deletedExecutorIds = Set.empty[Long] def start(applicationId: String, schedulerBackend: KubernetesClusterSchedulerBackend): Unit = { - // Wait until the driver pod is ready before starting executors, as the headless service won't - // be resolvable by DNS until the driver pod is ready. - Utils.tryLogNonFatalError { - kubernetesClient - .pods() - .withName(kubernetesDriverPodName.get) - .waitUntilReady(driverPodReadinessTimeout, TimeUnit.SECONDS) + driverPod.foreach { pod => + // Wait until the driver pod is ready before starting executors, as the headless service won't + // be resolvable by DNS until the driver pod is ready. + Utils.tryLogNonFatalError { + kubernetesClient + .pods() + .withName(pod.getMetadata.getName) + .waitUntilReady(driverPodReadinessTimeout, TimeUnit.SECONDS) + } } snapshotsStore.addSubscriber(podAllocationDelay) { onNewSnapshots(applicationId, schedulerBackend, _) From ee142c04df9452edb5b00d56475280c7e038dbf4 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Sun, 13 Jun 2021 09:11:14 -0700 Subject: [PATCH 007/169] [MINOR][K8S] Print the driver pod name instead of Some(name) if absent Print the driver pod name instead of Some(name) if absent fix error hint no new test Closes #32889 from yaooqinn/minork8s. 
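A tiny hedged illustration of the logging pitfall being fixed here (a standalone snippet, not the allocator code): interpolating an `Option` directly renders `Some(name)`, while unwrapping it first renders just the name.

```scala
// Interpolating an Option vs. the unwrapped value.
object OptionInterpolation {
  def main(args: Array[String]): Unit = {
    val driverPodName: Option[String] = Some("spark-pi-driver")
    println(s"No pod was found named $driverPodName") // ...named Some(spark-pi-driver)
    driverPodName.foreach(name => println(s"No pod was found named $name")) // ...named spark-pi-driver
  }
}
```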
Authored-by: Kent Yao Signed-off-by: Dongjoon Hyun (cherry picked from commit 1125afd4622e6d3f7f14fca1ebcfebdfba6d9529) Signed-off-by: Dongjoon Hyun --- .../cluster/k8s/ExecutorPodsAllocator.scala | 2 +- .../cluster/k8s/ExecutorPodsAllocatorSuite.scala | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala index 5429e36dda5ec..c83b8b854f298 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala @@ -78,7 +78,7 @@ private[spark] class ExecutorPodsAllocator( .withName(name) .get()) .getOrElse(throw new SparkException( - s"No pod was found named $kubernetesDriverPodName in the cluster in the " + + s"No pod was found named $name in the cluster in the " + s"namespace $namespace (this was supposed to be the driver pod.)."))) // Executor IDs that have been requested from Kubernetes but have not been detected in any diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala index ed6ca2a68f9a7..7ec17298d902e 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocatorSuite.scala @@ -28,7 +28,7 @@ import org.mockito.invocation.InvocationOnMock import org.mockito.stubbing.Answer import org.scalatest.BeforeAndAfter -import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} +import org.apache.spark.{SecurityManager, SparkConf, SparkException, SparkFunSuite} import org.apache.spark.deploy.k8s.{KubernetesExecutorConf, KubernetesExecutorSpec} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ @@ -570,6 +570,18 @@ class ExecutorPodsAllocatorSuite extends SparkFunSuite with BeforeAndAfter { verify(podOperations).create(podWithAttachedContainerForId(6)) } + test("print the pod name instead of Some(name) if pod is absent") { + val nonexistentPod = "i-do-not-exist" + val conf = new SparkConf().set(KUBERNETES_DRIVER_POD_NAME, nonexistentPod) + when(kubernetesClient.pods()).thenReturn(podOperations) + when(podOperations.withName(nonexistentPod)).thenReturn(driverPodOperations) + when(driverPodOperations.get()).thenReturn(null) + val e = intercept[SparkException](new ExecutorPodsAllocator( + conf, secMgr, executorBuilder, kubernetesClient, snapshotsStore, waitForExecutorPodsClock)) + assert(e.getMessage.contains("No pod was found named i-do-not-exist in the cluster in the" + + " namespace default")) + } + private def executorPodAnswer(): Answer[KubernetesExecutorSpec] = (invocation: InvocationOnMock) => { val k8sConf: KubernetesExecutorConf = invocation.getArgument(0) From fd14c3074d13352cc2b558bb9b48df9b9e9ef757 Mon Sep 17 00:00:00 2001 From: Weiwei Yang Date: Tue, 19 Oct 2021 22:42:06 -0700 Subject: [PATCH 008/169] [SPARK-37049][K8S] executorIdleTimeout should check `creationTimestamp` instead of `startTime` SPARK-33099 added the 
support to respect `spark.dynamicAllocation.executorIdleTimeout` in `ExecutorPodsAllocator`. However, when it checks if a pending executor pod is timed out, it checks against the pod's [startTime](https://github.com/kubernetes/api/blob/2a5dae08c42b1e8fdc1379432d8898efece65363/core/v1/types.go#L3664-L3667), see code [here](https://github.com/apache/spark/blob/c2ba498ff678ddda034cedf45cc17fbeefe922fd/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala#L459). A pending pod `startTime` is empty, and this causes the function `isExecutorIdleTimedOut()` always return true for pending pods. This can be reproduced locally, run the following job ``` ${SPARK_HOME}/bin/spark-submit --master k8s://http://localhost:8001 --deploy-mode cluster --name spark-group-example \ --master k8s://http://localhost:8001 --deploy-mode cluster \ --class org.apache.spark.examples.GroupByTest \ --conf spark.executor.instances=1 \ --conf spark.kubernetes.namespace=spark-test \ --conf spark.kubernetes.executor.request.cores=1 \ --conf spark.dynamicAllocation.enabled=true \ --conf spark.shuffle.service.enabled=true \ --conf spark.dynamicAllocation.shuffleTracking.enabled=true \ --conf spark.shuffle.service.enabled=false \ --conf spark.kubernetes.container.image=local/spark:3.3.0 \ --conf spark.kubernetes.authenticate.driver.serviceAccountName=spark \ local:///opt/spark/examples/jars/spark-examples_2.12-3.3.0-SNAPSHOT.jar \ 1000 1000 100 1000 ``` the local cluster doesn't have enough resources to run more than 4 executors, the rest of the executor pods will be pending. The job will have task backlogs and triggers to request more executors from K8s: ``` 21/10/19 22:51:45 INFO ExecutorPodsAllocator: Going to request 1 executors from Kubernetes for ResourceProfile Id: 0, target: 1 running: 0. 21/10/19 22:51:51 INFO ExecutorPodsAllocator: Going to request 1 executors from Kubernetes for ResourceProfile Id: 0, target: 2 running: 1. 21/10/19 22:51:52 INFO ExecutorPodsAllocator: Going to request 2 executors from Kubernetes for ResourceProfile Id: 0, target: 4 running: 2. 21/10/19 22:51:53 INFO ExecutorPodsAllocator: Going to request 4 executors from Kubernetes for ResourceProfile Id: 0, target: 8 running: 4. ... 21/10/19 22:52:14 INFO ExecutorPodsAllocator: Deleting 39 excess pod requests (23,59,32,41,50,68,35,44,17,8,53,62,26,71,11,56,29,38,47,20,65,5,14,46,64,73,55,49,40,67,58,13,22,31,7,16,52,70,43). 21/10/19 22:52:18 INFO ExecutorPodsAllocator: Deleting 28 excess pod requests (25,34,61,37,10,19,28,60,69,63,45,54,72,36,18,9,27,21,57,12,48,30,39,66,15,42,24,33). ``` At `22:51:45`, it starts to request executors; and at `22:52:14` it starts to delete excess executor pods. This is 29s but spark.dynamicAllocation.executorIdleTimeout is set to 60s. The config was not honored. ### What changes were proposed in this pull request? Change the check from using pod's `startTime` to `creationTimestamp`. [creationTimestamp](https://github.com/kubernetes/apimachinery/blob/e6c90c4366be1504309a6aafe0d816856450f36a/pkg/apis/meta/v1/types.go#L193-L201) is the timestamp when a pod gets created on K8s: ``` // CreationTimestamp is a timestamp representing the server time when this object was // created. It is not guaranteed to be set in happens-before order across separate operations. // Clients may not set this value. It is represented in RFC3339 form and is in UTC. 
``` [startTime](https://github.com/kubernetes/api/blob/2a5dae08c42b1e8fdc1379432d8898efece65363/core/v1/types.go#L3664-L3667) is the timestamp when pod gets started: ``` // RFC 3339 date and time at which the object was acknowledged by the Kubelet. // This is before the Kubelet pulled the container image(s) for the pod. // +optional ``` a pending pod's startTime is empty. Here is a example of a pending pod: ``` NAMESPACE NAME READY STATUS RESTARTS AGE default pending-pod-example 0/1 Pending 0 2s kubectl get pod pending-pod-example -o yaml | grep creationTimestamp ---> creationTimestamp: "2021-10-19T16:17:52Z" // pending pod has no startTime kubectl get pod pending-pod-example -o yaml | grep startTime ---> // empty // running pod has startTime set to the timestamp when the pod gets started kubectl get pod coredns-558bd4d5db-6qrtx -n kube-system -o yaml | grep startTime f:startTime: {} ---> startTime: "2021-08-04T23:44:44Z" ``` ### Why are the changes needed? This fixed the issue that `spark.dynamicAllocation.executorIdleTimeout` currently is not honored by pending executor pods. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? The PR includes the UT changes, that has the testing coverage for this issue. Closes #34319 from yangwwei/SPARK-37049. Authored-by: Weiwei Yang Signed-off-by: Dongjoon Hyun (cherry picked from commit 041cd5d7d15ec4184ae51a8a10a26bef05bd261f) Signed-off-by: Dongjoon Hyun --- .../scheduler/cluster/k8s/ExecutorPodsAllocator.scala | 8 ++++---- .../cluster/k8s/ExecutorLifecycleTestUtils.scala | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala index c83b8b854f298..9e1794fd3e8fc 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsAllocator.scala @@ -369,11 +369,11 @@ private[spark] class ExecutorPodsAllocator( private def isExecutorIdleTimedOut(state: ExecutorPodState, currentTime: Long): Boolean = { try { - val startTime = Instant.parse(state.pod.getStatus.getStartTime).toEpochMilli() - currentTime - startTime > executorIdleTimeout + val creationTime = Instant.parse(state.pod.getMetadata.getCreationTimestamp).toEpochMilli() + currentTime - creationTime > executorIdleTimeout } catch { - case _: Exception => - logDebug(s"Cannot get startTime of pod ${state.pod}") + case e: Exception => + logError(s"Cannot get the creationTimestamp of the pod: ${state.pod}", e) true } } diff --git a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorLifecycleTestUtils.scala b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorLifecycleTestUtils.scala index 41cba573d89c2..0b3ce6d7eb274 100644 --- a/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorLifecycleTestUtils.scala +++ b/resource-managers/kubernetes/core/src/test/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorLifecycleTestUtils.scala @@ -62,9 +62,11 @@ object ExecutorLifecycleTestUtils { def pendingExecutor(executorId: Long, rpId: Int = DEFAULT_RESOURCE_PROFILE_ID): Pod = { new PodBuilder(podWithAttachedContainerForId(executorId, 
rpId)) + .editOrNewMetadata() + .withCreationTimestamp(Instant.now.toString) + .endMetadata() .editOrNewStatus() .withPhase("pending") - .withStartTime(Instant.now.toString) .endStatus() .build() } From 8e57cfeb9681626ff1887ed5113133cc3108a692 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Tue, 30 Nov 2021 18:41:18 -0800 Subject: [PATCH 009/169] [SPARK-37497][K8S] Promote `ExecutorPods[PollingSnapshot|WatchSnapshot]Source` to DeveloperApi ### What changes were proposed in this pull request? This PR aims to promote `ExecutorPodsWatchSnapshotSource` and `ExecutorPodsPollingSnapshotSource` as **stable** `DeveloperApi` in order to maintain it officially in a backward compatible way at Apache Spark 3.3.0. ### Why are the changes needed? - Since SPARK-24248 at Apache Spark 2.4.0, `ExecutorPodsWatchSnapshotSource` and `ExecutorPodsPollingSnapshotSource` have been used to monitor executor pods without any interface changes for over 3 years. - Apache Spark 3.1.1 makes `Kubernetes` module GA and provides an extensible external cluster manager framework. New `ExternalClusterManager` for K8s environment need to depend on this to monitor pods. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual review. Closes #34751 from dongjoon-hyun/SPARK-37497. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit 2b044962cd6eff5a3a76f2808ee93b40bdf931df) Signed-off-by: Dongjoon Hyun --- .../k8s/ExecutorPodsPollingSnapshotSource.scala | 13 ++++++++++++- .../k8s/ExecutorPodsWatchSnapshotSource.scala | 14 +++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsPollingSnapshotSource.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsPollingSnapshotSource.scala index da7fe7cdda328..6fcb87655b56d 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsPollingSnapshotSource.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsPollingSnapshotSource.scala @@ -22,12 +22,21 @@ import io.fabric8.kubernetes.client.KubernetesClient import scala.collection.JavaConverters._ import org.apache.spark.SparkConf +import org.apache.spark.annotation.{DeveloperApi, Since, Stable} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.internal.Logging import org.apache.spark.util.{ThreadUtils, Utils} -private[spark] class ExecutorPodsPollingSnapshotSource( +/** + * :: DeveloperApi :: + * + * A class used for polling K8s executor pods by ExternalClusterManagers. 
+ * @since 3.1.3 + */ +@Stable +@DeveloperApi +class ExecutorPodsPollingSnapshotSource( conf: SparkConf, kubernetesClient: KubernetesClient, snapshotsStore: ExecutorPodsSnapshotsStore, @@ -37,6 +46,7 @@ private[spark] class ExecutorPodsPollingSnapshotSource( private var pollingFuture: Future[_] = _ + @Since("3.1.3") def start(applicationId: String): Unit = { require(pollingFuture == null, "Cannot start polling more than once.") logDebug(s"Starting to check for executor pod state every $pollingInterval ms.") @@ -44,6 +54,7 @@ private[spark] class ExecutorPodsPollingSnapshotSource( new PollRunnable(applicationId), pollingInterval, pollingInterval, TimeUnit.MILLISECONDS) } + @Since("3.1.3") def stop(): Unit = { if (pollingFuture != null) { pollingFuture.cancel(true) diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsWatchSnapshotSource.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsWatchSnapshotSource.scala index a6749a644e00c..7ac70b589c698 100644 --- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsWatchSnapshotSource.scala +++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/scheduler/cluster/k8s/ExecutorPodsWatchSnapshotSource.scala @@ -22,16 +22,27 @@ import io.fabric8.kubernetes.api.model.Pod import io.fabric8.kubernetes.client.{KubernetesClient, KubernetesClientException, Watcher} import io.fabric8.kubernetes.client.Watcher.Action +import org.apache.spark.annotation.{DeveloperApi, Since, Stable} import org.apache.spark.deploy.k8s.Constants._ import org.apache.spark.internal.Logging import org.apache.spark.util.Utils -private[spark] class ExecutorPodsWatchSnapshotSource( +/** + * :: DeveloperApi :: + * + * A class used for watching K8s executor pods by ExternalClusterManagers. + * + * @since 3.1.3 + */ +@Stable +@DeveloperApi +class ExecutorPodsWatchSnapshotSource( snapshotsStore: ExecutorPodsSnapshotsStore, kubernetesClient: KubernetesClient) extends Logging { private var watchConnection: Closeable = _ + @Since("3.1.3") def start(applicationId: String): Unit = { require(watchConnection == null, "Cannot start the watcher twice.") logDebug(s"Starting watch for pods with labels $SPARK_APP_ID_LABEL=$applicationId," + @@ -42,6 +53,7 @@ private[spark] class ExecutorPodsWatchSnapshotSource( .watch(new ExecutorPodsWatcher()) } + @Since("3.1.3") def stop(): Unit = { if (watchConnection != null) { Utils.tryLogNonFatalError { From 3c35c388fb929dffc234a6726757d9ae8e9e24df Mon Sep 17 00:00:00 2001 From: Sumeet Gajjar Date: Mon, 5 Apr 2021 17:32:43 -0500 Subject: [PATCH 010/169] [SPARK-34949][CORE] Prevent BlockManager reregister when Executor is shutting down ### What changes were proposed in this pull request? This PR prevents reregistering BlockManager when a Executor is shutting down. It is achieved by checking `executorShutdown` before calling `env.blockManager.reregister()`. ### Why are the changes needed? This change is required since Spark reports executors as active, even they are removed. I was testing Dynamic Allocation on K8s with about 300 executors. While doing so, when the executors were torn down due to `spark.dynamicAllocation.executorIdleTimeout`, I noticed all the executor pods being removed from K8s, however, under the "Executors" tab in SparkUI, I could see some executors listed as alive. 
[spark.sparkContext.statusTracker.getExecutorInfos.length](https://github.com/apache/spark/blob/65da9287bc5112564836a555cd2967fc6b05856f/core/src/main/scala/org/apache/spark/SparkStatusTracker.scala#L105) also returned a value greater than 1. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added a new test. ## Logs Following are the logs of the executor(Id:303) which re-registers `BlockManager` ``` 21/04/02 21:33:28 INFO CoarseGrainedExecutorBackend: Got assigned task 1076 21/04/02 21:33:28 INFO Executor: Running task 4.0 in stage 3.0 (TID 1076) 21/04/02 21:33:28 INFO MapOutputTrackerWorker: Updating epoch to 302 and clearing cache 21/04/02 21:33:28 INFO TorrentBroadcast: Started reading broadcast variable 3 21/04/02 21:33:28 INFO TransportClientFactory: Successfully created connection to /100.100.195.227:33703 after 76 ms (62 ms spent in bootstraps) 21/04/02 21:33:28 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 2.4 KB, free 168.0 MB) 21/04/02 21:33:28 INFO TorrentBroadcast: Reading broadcast variable 3 took 168 ms 21/04/02 21:33:28 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 3.9 KB, free 168.0 MB) 21/04/02 21:33:29 INFO MapOutputTrackerWorker: Don't have map outputs for shuffle 1, fetching them 21/04/02 21:33:29 INFO MapOutputTrackerWorker: Doing the fetch; tracker endpoint = NettyRpcEndpointRef(spark://MapOutputTrackerda-lite-test-4-7a57e478947d206d-driver-svc.dex-app-n5ttnbmg.svc:7078) 21/04/02 21:33:29 INFO MapOutputTrackerWorker: Got the output locations 21/04/02 21:33:29 INFO ShuffleBlockFetcherIterator: Getting 2 non-empty blocks including 1 local blocks and 1 remote blocks 21/04/02 21:33:30 INFO TransportClientFactory: Successfully created connection to /100.100.80.103:40971 after 660 ms (528 ms spent in bootstraps) 21/04/02 21:33:30 INFO ShuffleBlockFetcherIterator: Started 1 remote fetches in 1042 ms 21/04/02 21:33:31 INFO Executor: Finished task 4.0 in stage 3.0 (TID 1076). 1276 bytes result sent to driver . . . 21/04/02 21:34:16 INFO CoarseGrainedExecutorBackend: Driver commanded a shutdown 21/04/02 21:34:16 INFO Executor: Told to re-register on heartbeat 21/04/02 21:34:16 INFO BlockManager: BlockManager BlockManagerId(303, 100.100.122.34, 41265, None) re-registering with master 21/04/02 21:34:16 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(303, 100.100.122.34, 41265, None) 21/04/02 21:34:16 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(303, 100.100.122.34, 41265, None) 21/04/02 21:34:16 INFO BlockManager: Reporting 0 blocks to the master. 21/04/02 21:34:16 INFO MemoryStore: MemoryStore cleared 21/04/02 21:34:16 INFO BlockManager: BlockManager stopped 21/04/02 21:34:16 INFO FileDataSink: Closing sink with output file = /tmp/safari-events/.des_analysis/safari-events/hdp_spark_monitoring_random-container-037caf27-6c77-433f-820f-03cd9c7d9b6e-spark-8a492407d60b401bbf4309a14ea02ca2_events.tsv 21/04/02 21:34:16 INFO HonestProfilerBasedThreadSnapshotProvider: Stopping agent 21/04/02 21:34:16 INFO HonestProfilerHandler: Stopping honest profiler agent 21/04/02 21:34:17 INFO ShutdownHookManager: Shutdown hook called 21/04/02 21:34:17 INFO ShutdownHookManager: Deleting directory /var/data/spark-d886588c-2a7e-491d-bbcb-4f58b3e31001/spark-4aa337a0-60c0-45da-9562-8c50eaff3cea ``` Closes #32043 from sumeetgajjar/SPARK-34949. 
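The shape of the guard added by this patch can be sketched independently of Spark (the field and method names below are simplified stand-ins for the ones in `Executor.scala`, shown in the diff that follows):
```scala
// The heartbeat response is only acted on while the shutdown flag is still unset; once
// stop() flips the flag, a "please re-register" response is ignored, so a shutting-down
// executor no longer re-registers its BlockManager with the master.
import java.util.concurrent.atomic.AtomicBoolean

class HeartbeatGuardSketch {
  private val executorShutdown = new AtomicBoolean(false)

  def onHeartbeatResponse(reregisterBlockManager: Boolean)(reregister: () => Unit): Unit = {
    if (!executorShutdown.get && reregisterBlockManager) {
      reregister()
    }
  }

  def stop(): Unit = executorShutdown.set(true)
}
```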
Authored-by: Sumeet Gajjar Signed-off-by: Mridul Muralidharan gmail.com> (cherry picked from commit a9ca1978ae8ecc53e2ef9e14b4be70dc8f5d9341) Signed-off-by: Mridul Muralidharan --- .../org/apache/spark/executor/Executor.scala | 2 +- .../apache/spark/executor/ExecutorSuite.scala | 66 ++++++++++++++----- 2 files changed, 52 insertions(+), 16 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala index e7f1b8f3cf17a..4ead4397e9739 100644 --- a/core/src/main/scala/org/apache/spark/executor/Executor.scala +++ b/core/src/main/scala/org/apache/spark/executor/Executor.scala @@ -995,7 +995,7 @@ private[spark] class Executor( try { val response = heartbeatReceiverRef.askSync[HeartbeatResponse]( message, new RpcTimeout(HEARTBEAT_INTERVAL_MS.millis, EXECUTOR_HEARTBEAT_INTERVAL.key)) - if (response.reregisterBlockManager) { + if (!executorShutdown.get && response.reregisterBlockManager) { logInfo("Told to re-register on heartbeat") env.blockManager.reregister() } diff --git a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala index 97ffb36062dbc..a237447b0fa2d 100644 --- a/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala +++ b/core/src/test/scala/org/apache/spark/executor/ExecutorSuite.scala @@ -270,6 +270,17 @@ class ExecutorSuite extends SparkFunSuite heartbeatZeroAccumulatorUpdateTest(false) } + private def withMockHeartbeatReceiverRef(executor: Executor) + (func: RpcEndpointRef => Unit): Unit = { + val executorClass = classOf[Executor] + val mockReceiverRef = mock[RpcEndpointRef] + val receiverRef = executorClass.getDeclaredField("heartbeatReceiverRef") + receiverRef.setAccessible(true) + receiverRef.set(executor, mockReceiverRef) + + func(mockReceiverRef) + } + private def withHeartbeatExecutor(confs: (String, String)*) (f: (Executor, ArrayBuffer[Heartbeat]) => Unit): Unit = { val conf = new SparkConf @@ -277,22 +288,18 @@ class ExecutorSuite extends SparkFunSuite val serializer = new JavaSerializer(conf) val env = createMockEnv(conf, serializer) withExecutor("id", "localhost", SparkEnv.get) { executor => - val executorClass = classOf[Executor] - - // Save all heartbeats sent into an ArrayBuffer for verification - val heartbeats = ArrayBuffer[Heartbeat]() - val mockReceiver = mock[RpcEndpointRef] - when(mockReceiver.askSync(any[Heartbeat], any[RpcTimeout])(any)) - .thenAnswer((invocation: InvocationOnMock) => { - val args = invocation.getArguments() - heartbeats += args(0).asInstanceOf[Heartbeat] - HeartbeatResponse(false) - }) - val receiverRef = executorClass.getDeclaredField("heartbeatReceiverRef") - receiverRef.setAccessible(true) - receiverRef.set(executor, mockReceiver) + withMockHeartbeatReceiverRef(executor) { mockReceiverRef => + // Save all heartbeats sent into an ArrayBuffer for verification + val heartbeats = ArrayBuffer[Heartbeat]() + when(mockReceiverRef.askSync(any[Heartbeat], any[RpcTimeout])(any)) + .thenAnswer((invocation: InvocationOnMock) => { + val args = invocation.getArguments() + heartbeats += args(0).asInstanceOf[Heartbeat] + HeartbeatResponse(false) + }) - f(executor, heartbeats) + f(executor, heartbeats) + } } } @@ -416,6 +423,35 @@ class ExecutorSuite extends SparkFunSuite assert(taskMetrics.getMetricValue("JVMHeapMemory") > 0) } + test("SPARK-34949: do not re-register BlockManager when executor is shutting down") { + val reregisterInvoked = new AtomicBoolean(false) + val 
mockBlockManager = mock[BlockManager] + when(mockBlockManager.reregister()).thenAnswer { (_: InvocationOnMock) => + reregisterInvoked.getAndSet(true) + } + val conf = new SparkConf(false).setAppName("test").setMaster("local[2]") + val mockEnv = createMockEnv(conf, new JavaSerializer(conf)) + when(mockEnv.blockManager).thenReturn(mockBlockManager) + + withExecutor("id", "localhost", mockEnv) { executor => + withMockHeartbeatReceiverRef(executor) { mockReceiverRef => + when(mockReceiverRef.askSync(any[Heartbeat], any[RpcTimeout])(any)).thenAnswer { + (_: InvocationOnMock) => HeartbeatResponse(reregisterBlockManager = true) + } + val reportHeartbeat = PrivateMethod[Unit](Symbol("reportHeartBeat")) + executor.invokePrivate(reportHeartbeat()) + assert(reregisterInvoked.get(), "BlockManager.reregister should be invoked " + + "on HeartbeatResponse(reregisterBlockManager = true) when executor is not shutting down") + + reregisterInvoked.getAndSet(false) + executor.stop() + executor.invokePrivate(reportHeartbeat()) + assert(!reregisterInvoked.get(), + "BlockManager.reregister should not be invoked when executor is shutting down") + } + } + } + test("SPARK-33587: isFatalError") { def errorInThreadPool(e: => Throwable): Throwable = { intercept[Throwable] { From c3523986c7091627642f66641ff63680b88021bb Mon Sep 17 00:00:00 2001 From: skotlov Date: Wed, 21 Apr 2021 22:54:16 -0700 Subject: [PATCH 011/169] [SPARK-34674][CORE][K8S] Close SparkContext after the Main method has finished ### What changes were proposed in this pull request? Close SparkContext after the Main method has finished, to allow SparkApplication on K8S to complete. This is fixed version of [merged and reverted PR](https://github.com/apache/spark/pull/32081). ### Why are the changes needed? if I don't call the method sparkContext.stop() explicitly, then a Spark driver process doesn't terminate even after its Main method has been completed. This behaviour is different from spark on yarn, where the manual sparkContext stopping is not required. It looks like, the problem is in using non-daemon threads, which prevent the driver jvm process from terminating. So I have inserted code that closes sparkContext automatically. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Manually on the production AWS EKS environment in my company. Closes #32283 from kotlovs/close-spark-context-on-exit-2. 
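The control flow added around the user's main method can be sketched as follows (placeholder names; in Spark the logic lives in `SparkSubmit.runMain` and stops `SparkContext.getActive`, as the diff below shows):
```scala
// Whatever the user's main does, any still-active context is closed afterwards, so the
// driver JVM is not kept alive by non-daemon threads once the application has finished.
object RunMainSketch {
  def runUserMain(invokeMain: () => Unit, activeContext: Option[AutoCloseable]): Unit = {
    try {
      invokeMain()
    } finally {
      try {
        activeContext.foreach(_.close())   // Spark calls stop() on the active SparkContext here
      } catch {
        case e: Throwable => Console.err.println(s"Failed to close SparkContext: $e")
      }
    }
  }

  def main(args: Array[String]): Unit = {
    runUserMain(() => println("user main finished"), activeContext = None)
  }
}
```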
Authored-by: skotlov Signed-off-by: Dongjoon Hyun (cherry picked from commit b17a0e6931cac98cc839c047b1b5d4ea6d052009) Signed-off-by: Dongjoon Hyun --- .../main/scala/org/apache/spark/deploy/SparkSubmit.scala | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index acdddbcb89401..bc4d677c640af 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -956,6 +956,15 @@ private[spark] class SparkSubmit extends Logging { } catch { case t: Throwable => throw findCause(t) + } finally { + if (!isShell(args.primaryResource) && !isSqlShell(args.mainClass) && + !isThriftServer(args.mainClass)) { + try { + SparkContext.getActive.foreach(_.stop()) + } catch { + case e: Throwable => logError(s"Failed to close SparkContext: $e") + } + } } } From a004fb601aa45ca08d0dc31045193cde9aad8689 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 18 Jul 2021 22:26:23 -0700 Subject: [PATCH 012/169] [SPARK-36193][CORE] Recover SparkSubmit.runMain not to stop SparkContext in non-K8s env ### What changes were proposed in this pull request? According to the discussion on https://github.com/apache/spark/pull/32283 , this PR aims to limit the feature of SPARK-34674 to K8s environment only. ### Why are the changes needed? To reduce the behavior change in non-K8s environment. ### Does this PR introduce _any_ user-facing change? The change behavior is consistent with 3.1.1 and older Spark releases. ### How was this patch tested? N/A Closes #33403 from dongjoon-hyun/SPARK-36193. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun (cherry picked from commit fd3e9ce0b9ee09c7dce9f2e029fe96eac51eab96) Signed-off-by: Dongjoon Hyun --- core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index bc4d677c640af..5a9c550fc2068 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -957,8 +957,8 @@ private[spark] class SparkSubmit extends Logging { case t: Throwable => throw findCause(t) } finally { - if (!isShell(args.primaryResource) && !isSqlShell(args.mainClass) && - !isThriftServer(args.mainClass)) { + if (args.master.startsWith("k8s") && !isShell(args.primaryResource) && + !isSqlShell(args.mainClass) && !isThriftServer(args.mainClass)) { try { SparkContext.getActive.foreach(_.stop()) } catch { From a92ed12f1e5144ed0e080160b7fb46375e71a9fb Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Sat, 26 Jun 2021 12:48:24 +0800 Subject: [PATCH 013/169] [SPARK-35879][CORE][SHUFFLE] Fix performance regression caused by collectFetchRequests ### What changes were proposed in this pull request? This PR fixes perf regression at the executor side when creating fetch requests with large initial partitions ![image](https://user-images.githubusercontent.com/8326978/123270865-dd21e800-d532-11eb-8447-ad80e47b034f.png) In NetEase, we had an online job that took `45min` to "fetch" about 100MB of shuffle data, which actually turned out that it was just collecting fetch requests slowly. Normally, such a task should finish in seconds. 
See the `DEBUG` log ``` 21/06/22 11:52:26 DEBUG BlockManagerStorageEndpoint: Sent response: 0 to kyuubi.163.org: 21/06/22 11:53:05 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 3941440 at BlockManagerId(12, .., 43559, None) with 19 blocks 21/06/22 11:53:44 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 3716400 at BlockManagerId(20, .., 38287, None) with 18 blocks 21/06/22 11:54:41 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 4559280 at BlockManagerId(6, .., 39689, None) with 22 blocks 21/06/22 11:55:08 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 3120160 at BlockManagerId(33, .., 39449, None) with 15 blocks ``` I also create a test case locally with my local laptop docker env to give some reproducible cases. ``` bin/spark-sql --conf spark.kubernetes.file.upload.path=./ --master k8s://https://kubernetes.docker.internal:6443 --conf spark.kubernetes.container.image=yaooqinn/spark:v20210624-5 -c spark.kubernetes.context=docker-for-desktop_1 --num-executors 5 --driver-memory 5g --conf spark.kubernetes.executor.podNamePrefix=sparksql ``` ```sql SET spark.sql.adaptive.enabled=true; SET spark.sql.shuffle.partitions=3000; SELECT /*+ REPARTITION */ 1 as pid, id from range(1, 1000000, 1, 500); SELECT /*+ REPARTITION(pid, id) */ 1 as pid, id from range(1, 1000000, 1, 500); ``` ### Why are the changes needed? fix perf regression which was introduced by SPARK-29292 (3ad4863673fc46080dda963be3055a3e554cfbc7) in v3.1.0. 3ad4863673fc46080dda963be3055a3e554cfbc7 is for support compilation with scala 2.13 but the performance losses is huge. We need to consider backporting this PR to branch 3.1. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? Mannully, #### before ```log 21/06/23 13:54:22 DEBUG ShuffleBlockFetcherIterator: maxBytesInFlight: 50331648, targetRemoteRequestSize: 10066329, maxBlocksInFlightPerAddress: 2147483647 21/06/23 13:54:38 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 2314708 at BlockManagerId(2, 10.1.3.114, 36423, None) with 86 blocks 21/06/23 13:54:59 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 2636612 at BlockManagerId(3, 10.1.3.115, 34293, None) with 87 blocks 21/06/23 13:55:18 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 2508706 at BlockManagerId(4, 10.1.3.116, 41869, None) with 90 blocks 21/06/23 13:55:34 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 2350854 at BlockManagerId(5, 10.1.3.117, 45787, None) with 85 blocks 21/06/23 13:55:34 INFO ShuffleBlockFetcherIterator: Getting 438 (11.8 MiB) non-empty blocks including 90 (2.5 MiB) local and 0 (0.0 B) host-local and 348 (9.4 MiB) remote blocks 21/06/23 13:55:34 DEBUG ShuffleBlockFetcherIterator: Sending request for 87 blocks (2.5 MiB) from 10.1.3.115:34293 21/06/23 13:55:34 INFO TransportClientFactory: Successfully created connection to /10.1.3.115:34293 after 1 ms (0 ms spent in bootstraps) 21/06/23 13:55:34 DEBUG ShuffleBlockFetcherIterator: Sending request for 90 blocks (2.4 MiB) from 10.1.3.116:41869 21/06/23 13:55:34 INFO TransportClientFactory: Successfully created connection to /10.1.3.116:41869 after 2 ms (0 ms spent in bootstraps) 21/06/23 13:55:34 DEBUG ShuffleBlockFetcherIterator: Sending request for 85 blocks (2.2 MiB) from 10.1.3.117:45787 ``` ```log 21/06/23 14:00:45 INFO MapOutputTracker: Broadcast outputstatuses size = 411, actual size = 828997 21/06/23 14:00:45 INFO MapOutputTrackerWorker: Got the map output locations 21/06/23 14:00:45 DEBUG 
ShuffleBlockFetcherIterator: maxBytesInFlight: 50331648, targetRemoteRequestSize: 10066329, maxBlocksInFlightPerAddress: 2147483647 21/06/23 14:00:55 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 1894389 at BlockManagerId(2, 10.1.3.114, 36423, None) with 99 blocks 21/06/23 14:01:04 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 1919993 at BlockManagerId(3, 10.1.3.115, 34293, None) with 100 blocks 21/06/23 14:01:14 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 1977186 at BlockManagerId(5, 10.1.3.117, 45787, None) with 103 blocks 21/06/23 14:01:23 DEBUG ShuffleBlockFetcherIterator: Creating fetch request of 1938336 at BlockManagerId(4, 10.1.3.116, 41869, None) with 101 blocks 21/06/23 14:01:23 INFO ShuffleBlockFetcherIterator: Getting 500 (9.1 MiB) non-empty blocks including 97 (1820.3 KiB) local and 0 (0.0 B) host-local and 403 (7.4 MiB) remote blocks 21/06/23 14:01:23 DEBUG ShuffleBlockFetcherIterator: Sending request for 101 blocks (1892.9 KiB) from 10.1.3.116:41869 21/06/23 14:01:23 DEBUG ShuffleBlockFetcherIterator: Sending request for 103 blocks (1930.8 KiB) from 10.1.3.117:45787 21/06/23 14:01:23 DEBUG ShuffleBlockFetcherIterator: Sending request for 99 blocks (1850.0 KiB) from 10.1.3.114:36423 21/06/23 14:01:23 DEBUG ShuffleBlockFetcherIterator: Sending request for 100 blocks (1875.0 KiB) from 10.1.3.115:34293 21/06/23 14:01:23 INFO ShuffleBlockFetcherIterator: Started 4 remote fetches in 37889 ms ``` #### After ```log 21/06/24 13:01:16 DEBUG ShuffleBlockFetcherIterator: maxBytesInFlight: 50331648, targetRemoteRequestSize: 10066329, maxBlocksInFlightPerAddress: 2147483647 21/06/24 13:01:16 INFO ShuffleBlockFetcherIterator: ==> Call blockInfos.map(_._2).sum: 40 ms 21/06/24 13:01:16 INFO ShuffleBlockFetcherIterator: ==> Call mergeFetchBlockInfo for shuffle_0_9_2990_2997/9: 0 ms 21/06/24 13:01:16 INFO ShuffleBlockFetcherIterator: ==> Call mergeFetchBlockInfo for shuffle_0_15_2395_2997/15: 0 ms ``` Closes #33063 from yaooqinn/SPARK-35879. 
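The asymptotic difference behind this regression can be seen with a small standalone benchmark (illustrative only, not Spark code): appending to an immutable `Seq` copies the prefix on every append, while `ArrayBuffer +=` is an amortized constant-time operation.
```scala
import scala.collection.mutable.ArrayBuffer

object AppendCostSketch {
  def timeIt[T](label: String)(body: => T): T = {
    val start = System.nanoTime()
    val result = body
    println(f"$label%-20s ${(System.nanoTime() - start) / 1e6}%.1f ms")
    result
  }

  def main(args: Array[String]): Unit = {
    val n = 20000

    timeIt("Seq ++ per element") {
      var blocks = Seq.empty[Int]
      (1 to n).foreach(i => blocks = blocks ++ Seq(i))   // old pattern: copies `blocks` each time
      blocks.size
    }

    timeIt("ArrayBuffer +=") {
      val blocks = new ArrayBuffer[Int]()
      (1 to n).foreach(i => blocks += i)                 // new pattern: amortized O(1) append
      blocks.size
    }
  }
}
```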
Authored-by: Kent Yao Signed-off-by: Kent Yao (cherry picked from commit 14d4decf736297e2bf4d824ccbd604c9da49ccf4) Signed-off-by: Kent Yao --- .../storage/ShuffleBlockFetcherIterator.scala | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala index fa4e46590aa5e..0c37a5b21a870 100644 --- a/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala +++ b/core/src/main/scala/org/apache/spark/storage/ShuffleBlockFetcherIterator.scala @@ -317,7 +317,10 @@ final class ShuffleBlockFetcherIterator( hostLocalBlockBytes += mergedBlockInfos.map(_.size).sum } else { remoteBlockBytes += blockInfos.map(_._2).sum - collectFetchRequests(address, blockInfos, collectedRemoteRequests) + val (_, timeCost) = Utils.timeTakenMs[Unit] { + collectFetchRequests(address, blockInfos, collectedRemoteRequests) + } + logDebug(s"Collected remote fetch requests for $address in $timeCost ms") } } val numRemoteBlocks = collectedRemoteRequests.map(_.blocks.size).sum @@ -345,10 +348,10 @@ final class ShuffleBlockFetcherIterator( curBlocks: Seq[FetchBlockInfo], address: BlockManagerId, isLast: Boolean, - collectedRemoteRequests: ArrayBuffer[FetchRequest]): Seq[FetchBlockInfo] = { + collectedRemoteRequests: ArrayBuffer[FetchRequest]): ArrayBuffer[FetchBlockInfo] = { val mergedBlocks = mergeContinuousShuffleBlockIdsIfNeeded(curBlocks, doBatchFetch) numBlocksToFetch += mergedBlocks.size - var retBlocks = Seq.empty[FetchBlockInfo] + val retBlocks = new ArrayBuffer[FetchBlockInfo] if (mergedBlocks.length <= maxBlocksInFlightPerAddress) { collectedRemoteRequests += createFetchRequest(mergedBlocks, address) } else { @@ -358,7 +361,7 @@ final class ShuffleBlockFetcherIterator( } else { // The last group does not exceed `maxBlocksInFlightPerAddress`. Put it back // to `curBlocks`. - retBlocks = blocks + retBlocks ++= blocks numBlocksToFetch -= blocks.size } } @@ -372,26 +375,24 @@ final class ShuffleBlockFetcherIterator( collectedRemoteRequests: ArrayBuffer[FetchRequest]): Unit = { val iterator = blockInfos.iterator var curRequestSize = 0L - var curBlocks = Seq.empty[FetchBlockInfo] + var curBlocks = new ArrayBuffer[FetchBlockInfo]() while (iterator.hasNext) { val (blockId, size, mapIndex) = iterator.next() assertPositiveBlockSize(blockId, size) - curBlocks = curBlocks ++ Seq(FetchBlockInfo(blockId, size, mapIndex)) + curBlocks += FetchBlockInfo(blockId, size, mapIndex) curRequestSize += size // For batch fetch, the actual block in flight should count for merged block. 
val mayExceedsMaxBlocks = !doBatchFetch && curBlocks.size >= maxBlocksInFlightPerAddress if (curRequestSize >= targetRemoteRequestSize || mayExceedsMaxBlocks) { - curBlocks = createFetchRequests(curBlocks, address, isLast = false, + curBlocks = createFetchRequests(curBlocks.toSeq, address, isLast = false, collectedRemoteRequests) curRequestSize = curBlocks.map(_.size).sum } } // Add in the final request if (curBlocks.nonEmpty) { - curBlocks = createFetchRequests(curBlocks, address, isLast = true, - collectedRemoteRequests) - curRequestSize = curBlocks.map(_.size).sum + createFetchRequests(curBlocks.toSeq, address, isLast = true, collectedRemoteRequests) } } @@ -889,7 +890,7 @@ object ShuffleBlockFetcherIterator { blocks: Seq[FetchBlockInfo], doBatchFetch: Boolean): Seq[FetchBlockInfo] = { val result = if (doBatchFetch) { - var curBlocks = new ArrayBuffer[FetchBlockInfo] + val curBlocks = new ArrayBuffer[FetchBlockInfo] val mergedBlockInfo = new ArrayBuffer[FetchBlockInfo] def mergeFetchBlockInfo(toBeMerged: ArrayBuffer[FetchBlockInfo]): FetchBlockInfo = { From d6b5c3931a6173cc1eb66af022c02242bee2985f Mon Sep 17 00:00:00 2001 From: Wenchen Fan Date: Mon, 22 Feb 2021 21:11:21 +0800 Subject: [PATCH 014/169] [SPARK-34473][SQL] Avoid NPE in DataFrameReader.schema(StructType) ### What changes were proposed in this pull request? This fixes a regression in `DataFrameReader.schema(StructType)`, to avoid NPE if the given `StructType` is null. Note that, passing null to Spark public APIs leads to undefined behavior. There is no document mentioning the null behavior, and it's just an accident that `DataFrameReader.schema(StructType)` worked before. So I think this is not a 3.1 blocker. ### Why are the changes needed? It fixes a 3.1 regression ### Does this PR introduce _any_ user-facing change? yea, now `df.read.schema(null: StructType)` is a noop as before, while in the current branch-3.1 it throws NPE. ### How was this patch tested? It's undefined behavior and is very obvious, so I didn't add a test. We should add tests when we clearly define and fix the null behavior for all public APIs. Closes #31593 from cloud-fan/minor. 
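A hedged usage sketch of the behavior this restores (the session setup is illustrative): passing a `null` schema is treated as "no user-specified schema" instead of throwing an NPE, while still remaining undefined behavior as noted above.
```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

object NullSchemaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("null-schema").getOrCreate()

    // With this fix the call below is a no-op, as in earlier releases; unpatched 3.1 threw an NPE.
    val reader = spark.read.schema(null: StructType)
    // reader.json(...) / reader.parquet(...) would then infer the schema as usual.

    spark.stop()
  }
}
```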
Authored-by: Wenchen Fan Signed-off-by: Wenchen Fan (cherry picked from commit 02c784ca686fc675b63ce37f03215bc6c2fec869) Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/sql/DataFrameReader.scala | 11 +++++------ .../apache/spark/sql/streaming/DataStreamReader.scala | 11 +++++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index b94c42a2c9544..e4da076035171 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -73,8 +73,10 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * @since 1.4.0 */ def schema(schema: StructType): DataFrameReader = { - val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] - this.userSpecifiedSchema = Option(replaced) + if (schema != null) { + val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] + this.userSpecifiedSchema = Option(replaced) + } this } @@ -90,10 +92,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * @since 2.3.0 */ def schema(schemaString: String): DataFrameReader = { - val rawSchema = StructType.fromDDL(schemaString) - val schema = CharVarcharUtils.failIfHasCharVarchar(rawSchema).asInstanceOf[StructType] - this.userSpecifiedSchema = Option(schema) - this + schema(StructType.fromDDL(schemaString)) } /** diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index d82fa9e88592f..06c75791ad5c8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -64,8 +64,10 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * @since 2.0.0 */ def schema(schema: StructType): DataStreamReader = { - val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] - this.userSpecifiedSchema = Option(replaced) + if (schema != null) { + val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] + this.userSpecifiedSchema = Option(replaced) + } this } @@ -77,10 +79,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * @since 2.3.0 */ def schema(schemaString: String): DataStreamReader = { - val rawSchema = StructType.fromDDL(schemaString) - val schema = CharVarcharUtils.failIfHasCharVarchar(rawSchema).asInstanceOf[StructType] - this.userSpecifiedSchema = Option(schema) - this + schema(StructType.fromDDL(schemaString)) } /** From 866e6839f53c5e8b7243e187111932180e0377e9 Mon Sep 17 00:00:00 2001 From: Linhong Liu Date: Tue, 23 Feb 2021 15:51:02 +0800 Subject: [PATCH 015/169] [SPARK-34490][SQL] Analysis should fail if the view refers a dropped table When resolving a view, we use the captured view name in `AnalysisContext` to distinguish whether a relation name is a view or a table. But if the resolution failed, other rules (e.g. `ResolveTables`) will try to resolve the relation again but without `AnalysisContext`. So, in this case, the resolution may be incorrect. For example, if the view refers to a dropped table while a view with the same name exists, the dropped table will be resolved as a view rather than an unresolved exception. 
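The scenario can be reproduced with a few SQL statements, mirroring the regression test added in this patch (the session setup below is illustrative):
```scala
import org.apache.spark.sql.SparkSession

object DroppedTableViewSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("view-dropped-table").getOrCreate()
    import spark.sql

    sql("CREATE TABLE t USING parquet AS SELECT 1 AS c1")
    sql("CREATE VIEW v AS SELECT * FROM t")        // the view captures table `t`
    sql("CREATE TEMP VIEW t AS SELECT 2 AS c1")    // a temp view with the same name
    sql("DROP TABLE IF EXISTS default.t")

    // With the fix this fails with "Table or view not found: t"; without it, the dropped
    // table was silently resolved to the temp view instead.
    sql("SELECT * FROM v").show()

    spark.stop()
  }
}
```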
bugfix no newly added test cases Closes #31606 from linhongliu-db/fix-temp-view-master. Lead-authored-by: Linhong Liu Co-authored-by: Linhong Liu <67896261+linhongliu-db@users.noreply.github.com> Signed-off-by: Wenchen Fan (cherry picked from commit be675a052c38a36ce5e33ba56bdc69cc8972b3e8) Signed-off-by: Wenchen Fan --- .../sql/catalyst/analysis/Analyzer.scala | 27 +++++++++++++------ .../analysis/TableLookupCacheSuite.scala | 13 ++++++--- .../sql/execution/SQLViewTestSuite.scala | 20 ++++++++++++++ 3 files changed, 49 insertions(+), 11 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index e9e8ba842a36f..bf8003105ada6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -872,16 +872,16 @@ class Analyzer(override val catalogManager: CatalogManager) object ResolveTempViews extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperatorsUp { case u @ UnresolvedRelation(ident, _, isStreaming) => - lookupTempView(ident, isStreaming).getOrElse(u) + lookupTempView(ident, isStreaming, performCheck = true).getOrElse(u) case i @ InsertIntoStatement(UnresolvedRelation(ident, _, false), _, _, _, _, _) => - lookupTempView(ident) + lookupTempView(ident, performCheck = true) .map(view => i.copy(table = view)) .getOrElse(i) // TODO (SPARK-27484): handle streaming write commands when we have them. case write: V2WriteCommand => write.table match { case UnresolvedRelation(ident, _, false) => - lookupTempView(ident).map(EliminateSubqueryAliases(_)).map { + lookupTempView(ident, performCheck = true).map(EliminateSubqueryAliases(_)).map { case r: DataSourceV2Relation => write.withNewTable(r) case _ => throw new AnalysisException("Cannot write into temp view " + s"${ident.quoted} as it's not a data source v2 relation.") @@ -906,7 +906,9 @@ class Analyzer(override val catalogManager: CatalogManager) } def lookupTempView( - identifier: Seq[String], isStreaming: Boolean = false): Option[LogicalPlan] = { + identifier: Seq[String], + isStreaming: Boolean = false, + performCheck: Boolean = false): Option[LogicalPlan] = { // Permanent View can't refer to temp views, no need to lookup at all. if (isResolvingView && !referredTempViewNames.contains(identifier)) return None @@ -920,7 +922,7 @@ class Analyzer(override val catalogManager: CatalogManager) throw new AnalysisException(s"${identifier.quoted} is not a temp view of streaming " + s"logical plan, please use batch API such as `DataFrameReader.table` to read it.") } - tmpView.map(ResolveRelations.resolveViews) + tmpView.map(ResolveRelations.resolveViews(_, performCheck)) } } @@ -1074,7 +1076,7 @@ class Analyzer(override val catalogManager: CatalogManager) // look at `AnalysisContext.catalogAndNamespace` when resolving relations with single-part name. // If `AnalysisContext.catalogAndNamespace` is non-empty, analyzer will expand single-part names // with it, instead of current catalog and namespace. - def resolveViews(plan: LogicalPlan): LogicalPlan = plan match { + def resolveViews(plan: LogicalPlan, performCheck: Boolean = false): LogicalPlan = plan match { // The view's child should be a logical plan parsed from the `desc.viewText`, the variable // `viewText` should be defined, or else we throw an error on the generation of the View // operator. 
@@ -1093,9 +1095,18 @@ class Analyzer(override val catalogManager: CatalogManager) executeSameContext(child) } } + // Fail the analysis eagerly because outside AnalysisContext, the unresolved operators + // inside a view maybe resolved incorrectly. + // But for commands like `DropViewCommand`, resolving view is unnecessary even though + // there is unresolved node. So use the `performCheck` flag to skip the analysis check + // for these commands. + // TODO(SPARK-34504): avoid unnecessary view resolving and remove the `performCheck` flag + if (performCheck) { + checkAnalysis(newChild) + } view.copy(child = newChild) case p @ SubqueryAlias(_, view: View) => - p.copy(child = resolveViews(view)) + p.copy(child = resolveViews(view, performCheck)) case _ => plan } @@ -1133,7 +1144,7 @@ class Analyzer(override val catalogManager: CatalogManager) case u: UnresolvedRelation => lookupRelation(u.multipartIdentifier, u.options, u.isStreaming) - .map(resolveViews).getOrElse(u) + .map(resolveViews(_, performCheck = true)).getOrElse(u) case u @ UnresolvedTable(identifier, cmd) => lookupTableOrView(identifier).map { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TableLookupCacheSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TableLookupCacheSuite.scala index 3e9a8b71a8fb6..ec9480514ba2d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TableLookupCacheSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/TableLookupCacheSuite.scala @@ -19,6 +19,8 @@ package org.apache.spark.sql.catalyst.analysis import java.io.File +import scala.collection.JavaConverters._ + import org.mockito.ArgumentMatchers.any import org.mockito.Mockito._ import org.mockito.invocation.InvocationOnMock @@ -27,8 +29,8 @@ import org.scalatest.matchers.must.Matchers import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, CatalogStorageFormat, CatalogTable, CatalogTableType, ExternalCatalog, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.dsl.plans._ -import org.apache.spark.sql.connector.InMemoryTableCatalog -import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogNotFoundException, Identifier, Table, V1Table} +import org.apache.spark.sql.connector.{InMemoryTable, InMemoryTableCatalog} +import org.apache.spark.sql.connector.catalog.{CatalogManager, CatalogNotFoundException, Identifier, Table} import org.apache.spark.sql.types._ class TableLookupCacheSuite extends AnalysisTest with Matchers { @@ -46,7 +48,12 @@ class TableLookupCacheSuite extends AnalysisTest with Matchers { ignoreIfExists = false) val v2Catalog = new InMemoryTableCatalog { override def loadTable(ident: Identifier): Table = { - V1Table(externalCatalog.getTable("default", ident.name)) + val catalogTable = externalCatalog.getTable("default", ident.name) + new InMemoryTable( + catalogTable.identifier.table, + catalogTable.schema, + Array.empty, + Map.empty[String, String].asJava) } override def name: String = CatalogManager.SESSION_CATALOG_NAME } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala index 68e1a682562ac..84a20bb16ad86 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewTestSuite.scala @@ -258,6 +258,26 @@ abstract class 
SQLViewTestSuite extends QueryTest with SQLTestUtils { checkViewOutput(viewName, Seq(Row(2))) } } + + test("SPARK-34490 - query should fail if the view refers a dropped table") { + withTable("t") { + Seq(2, 3, 1).toDF("c1").write.format("parquet").saveAsTable("t") + val viewName = createView("testView", "SELECT * FROM t") + withView(viewName) { + // Always create a temp view in this case, not use `createView` on purpose + sql("CREATE TEMP VIEW t AS SELECT 1 AS c1") + withTempView("t") { + checkViewOutput(viewName, Seq(Row(2), Row(3), Row(1))) + // Manually drop table `t` to see if the query will fail + sql("DROP TABLE IF EXISTS default.t") + val e = intercept[AnalysisException] { + sql(s"SELECT * FROM $viewName").collect() + }.getMessage + assert(e.contains("Table or view not found: t")) + } + } + } + } } class LocalTempViewTestSuite extends SQLViewTestSuite with SharedSparkSession { From 6003f7c015444a70293e9127c0e7929040f32d4a Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Wed, 24 Feb 2021 21:32:19 +0800 Subject: [PATCH 016/169] [SPARK-34515][SQL] Fix NPE if InSet contains null value during getPartitionsByFilter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? Skip null value during rewrite `InSet` to `>= and <=` at getPartitionsByFilter. ### Why are the changes needed? Spark will convert `InSet` to `>= and <=` if it's values size over `spark.sql.hive.metastorePartitionPruningInSetThreshold` during pruning partition . At this case, if values contain a null, we will get such exception    ``` java.lang.NullPointerException at org.apache.spark.unsafe.types.UTF8String.compareTo(UTF8String.java:1389) at org.apache.spark.unsafe.types.UTF8String.compareTo(UTF8String.java:50) at scala.math.LowPriorityOrderingImplicits$$anon$3.compare(Ordering.scala:153) at java.util.TimSort.countRunAndMakeAscending(TimSort.java:355) at java.util.TimSort.sort(TimSort.java:220) at java.util.Arrays.sort(Arrays.java:1438) at scala.collection.SeqLike.sorted(SeqLike.scala:659) at scala.collection.SeqLike.sorted$(SeqLike.scala:647) at scala.collection.AbstractSeq.sorted(Seq.scala:45) at org.apache.spark.sql.hive.client.Shim_v0_13.convert$1(HiveShim.scala:772) at org.apache.spark.sql.hive.client.Shim_v0_13.$anonfun$convertFilters$4(HiveShim.scala:826) at scala.collection.immutable.Stream.flatMap(Stream.scala:489) at org.apache.spark.sql.hive.client.Shim_v0_13.convertFilters(HiveShim.scala:826) at org.apache.spark.sql.hive.client.Shim_v0_13.getPartitionsByFilter(HiveShim.scala:848) at org.apache.spark.sql.hive.client.HiveClientImpl.$anonfun$getPartitionsByFilter$1(HiveClientImpl.scala:750) ``` ### Does this PR introduce _any_ user-facing change? Yes, bug fix. ### How was this patch tested? Add test. Closes #31632 from ulysses-you/SPARK-34515. 
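The crash and the fix can be sketched without Hive (the ordering below is a simplified stand-in for `TypeUtils.getInterpretedOrdering`):
```scala
object InSetNullSketch {
  def main(args: Array[String]): Unit = {
    val values: Set[Any] = Set(null, "2", "1")

    // Pre-fix shape: sorting values that still contain null makes the comparator
    // dereference null and throw a NullPointerException.
    // values.toSeq.map(_.asInstanceOf[String]).sorted

    // Post-fix shape: drop nulls first (safe here, as in ExtractableLiterals), then take
    // the bounds for the ">= and <=" rewrite pushed to the metastore.
    val sorted = values.filter(_ != null).toSeq.map(_.asInstanceOf[String]).sorted
    println(s"(p >= ${sorted.head} and p <= ${sorted.last})")   // (p >= 1 and p <= 2)
  }
}
```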
Authored-by: ulysses-you Signed-off-by: Wenchen Fan (cherry picked from commit 999d3b89b6df14a5ccb94ffc2ffadb82964e9f7d) Signed-off-by: Wenchen Fan --- .../scala/org/apache/spark/sql/hive/client/HiveShim.scala | 4 +++- .../org/apache/spark/sql/hive/client/FiltersSuite.scala | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index ed088648bc20a..8ccb17ce35925 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -769,7 +769,9 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { case InSet(child, values) if useAdvanced && values.size > inSetThreshold => val dataType = child.dataType - val sortedValues = values.toSeq.sorted(TypeUtils.getInterpretedOrdering(dataType)) + // Skip null here is safe, more details could see at ExtractableLiterals. + val sortedValues = values.filter(_ != null).toSeq + .sorted(TypeUtils.getInterpretedOrdering(dataType)) convert(And(GreaterThanOrEqual(child, Literal(sortedValues.head, dataType)), LessThanOrEqual(child, Literal(sortedValues.last, dataType)))) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 12ed0e5305299..6962f9dd6b186 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -179,5 +179,13 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { } } + test("SPARK-34515: Fix NPE if InSet contains null value during getPartitionsByFilter") { + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD.key -> "2") { + val filter = InSet(a("p", IntegerType), Set(null, 1, 2)) + val converted = shim.convertFilters(testTable, Seq(filter), conf.sessionLocalTimeZone) + assert(converted == "(p >= 1 and p <= 2)") + } + } + private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)() } From eee08d71fbf76c661f992b52c43917bc61564739 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Thu, 25 Feb 2021 18:07:39 +0800 Subject: [PATCH 017/169] [SPARK-34436][SQL] DPP support LIKE ANY/ALL expression ### What changes were proposed in this pull request? This pr make DPP support LIKE ANY/ALL expression: ```sql SELECT date_id, product_id FROM fact_sk f JOIN dim_store s ON f.store_id = s.store_id WHERE s.country LIKE ANY ('%D%E%', '%A%B%') ``` ### Why are the changes needed? Improve query performance. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Unit test. Closes #31563 from wangyum/SPARK-34436. 
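The change itself is a one-line extension of the pattern match that decides whether a filter is selective enough for DPP; a sketch of that check (the method name is a stand-in, and the import touches Catalyst internals) is:
```scala
import org.apache.spark.sql.catalyst.expressions._

object SelectiveFilterSketch {
  // An expression whose type is listed here is considered selective enough to justify
  // inserting a dynamic partition pruning subquery on the other side of the join.
  def isLikelySelective(e: Expression): Boolean = e match {
    case _: BinaryComparison => true
    case _: In | _: InSet => true
    case _: StringPredicate => true
    case _: MultiLikeBase => true   // newly accepted: LIKE ANY / LIKE ALL and their negations
    case _ => false
  }
}
```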
Lead-authored-by: Yuming Wang Co-authored-by: Yuming Wang Signed-off-by: Wenchen Fan (cherry picked from commit 4a3200b08ac3e7733b5a3dc7271d35e6872c5967) Signed-off-by: Wenchen Fan --- .../dynamicpruning/PartitionPruning.scala | 1 + .../sql/DynamicPartitionPruningSuite.scala | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PartitionPruning.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PartitionPruning.scala index e30f9b65a2c2c..7fac91a337adc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PartitionPruning.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/dynamicpruning/PartitionPruning.scala @@ -163,6 +163,7 @@ object PartitionPruning extends Rule[LogicalPlan] with PredicateHelper { case _: BinaryComparison => true case _: In | _: InSet => true case _: StringPredicate => true + case _: MultiLikeBase => true case _ => false } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala index 55437aaa47298..db7b0dd4b67e8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DynamicPartitionPruningSuite.scala @@ -1361,6 +1361,26 @@ abstract class DynamicPartitionPruningSuiteBase checkAnswer(df, Nil) } } + + test("SPARK-34436: DPP support LIKE ANY/ALL expression") { + withSQLConf(SQLConf.DYNAMIC_PARTITION_PRUNING_ENABLED.key -> "true") { + val df = sql( + """ + |SELECT date_id, product_id FROM fact_sk f + |JOIN dim_store s + |ON f.store_id = s.store_id WHERE s.country LIKE ANY ('%D%E%', '%A%B%') + """.stripMargin) + + checkPartitionPruningPredicate(df, false, true) + + checkAnswer(df, + Row(1030, 2) :: + Row(1040, 2) :: + Row(1050, 2) :: + Row(1060, 2) :: Nil + ) + } + } } class DynamicPartitionPruningSuiteAEOff extends DynamicPartitionPruningSuiteBase { From 91f2a9e86a38245dde3aa8b37ee2619907545f3f Mon Sep 17 00:00:00 2001 From: ulysses-you Date: Fri, 26 Feb 2021 21:29:14 +0900 Subject: [PATCH 018/169] [SPARK-34550][SQL] Skip InSet null value during push filter to Hive metastore ### What changes were proposed in this pull request? Skip `InSet` null value during push filter to Hive metastore. ### Why are the changes needed? If `InSet` contains a null value, we should skip it and push other values to metastore. To keep same behavior with `In`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Add test. Closes #31659 from ulysses-you/SPARK-34550. 
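The effect of the extractor change can be sketched standalone (`valueToLiteralString` is stood in by a simple partial function): before the fix a single null made the whole `InSet` unconvertible, so nothing was pushed to the metastore; skipping nulls keeps the remaining values, matching `In`.
```scala
object InSetPushdownSketch {
  def main(args: Array[String]): Unit = {
    val valueToLiteralString: PartialFunction[Any, String] = {
      case value: Int => value.toString
    }
    val values: Set[Any] = Set(null, 1, 2)

    def extract(vs: Set[Any]): Option[Seq[String]] = {
      val extractables = vs.toSeq.map(valueToLiteralString.lift)
      if (extractables.nonEmpty && extractables.forall(_.isDefined)) {
        Some(extractables.map(_.get))
      } else {
        None
      }
    }

    println(extract(values))                    // None: the null element blocks the conversion
    println(extract(values.filter(_ != null)))  // Some(List(1, 2)) -> pushed as "(p = 1 or p = 2)"
  }
}
```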
Authored-by: ulysses-you Signed-off-by: HyukjinKwon (cherry picked from commit 82267acfe8c78a70d56a6ae6ab9a1135c0dc0836) Signed-off-by: HyukjinKwon --- .../apache/spark/sql/hive/client/HiveShim.scala | 4 ++-- .../spark/sql/hive/client/FiltersSuite.scala | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 8ccb17ce35925..db67480ceb77a 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -700,7 +700,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { } def unapply(values: Set[Any]): Option[Seq[String]] = { - val extractables = values.toSeq.map(valueToLiteralString.lift) + val extractables = values.filter(_ != null).toSeq.map(valueToLiteralString.lift) if (extractables.nonEmpty && extractables.forall(_.isDefined)) { Some(extractables.map(_.get)) } else { @@ -715,7 +715,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { } def unapply(values: Set[Any]): Option[Seq[String]] = { - val extractables = values.toSeq.map(valueToLiteralString.lift) + val extractables = values.filter(_ != null).toSeq.map(valueToLiteralString.lift) if (extractables.nonEmpty && extractables.forall(_.isDefined)) { Some(extractables.map(_.get)) } else { diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 6962f9dd6b186..79b34bd141de3 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -187,5 +187,20 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { } } + test("SPARK-34538: Skip InSet null value during push filter to Hive metastore") { + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD.key -> "3") { + val intFilter = InSet(a("p", IntegerType), Set(null, 1, 2)) + val intConverted = shim.convertFilters(testTable, Seq(intFilter), conf.sessionLocalTimeZone) + assert(intConverted == "(p = 1 or p = 2)") + } + + withSQLConf(SQLConf.HIVE_METASTORE_PARTITION_PRUNING_INSET_THRESHOLD.key -> "3") { + val dateFilter = InSet(a("p", DateType), Set(null, + Literal(Date.valueOf("2020-01-01")).eval(), Literal(Date.valueOf("2021-01-01")).eval())) + val dateConverted = shim.convertFilters(testTable, Seq(dateFilter), conf.sessionLocalTimeZone) + assert(dateConverted == "(p = 2020-01-01 or p = 2021-01-01)") + } + } + private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)() } From d78ae65f01094f3e2dd8a2b14b7da3c7ff5905d4 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Mon, 1 Mar 2021 13:55:35 +0900 Subject: [PATCH 019/169] [SPARK-34556][SQL] Checking duplicate static partition columns should respect case sensitive conf ### What changes were proposed in this pull request? This PR makes partition spec parsing respect case sensitive conf. ### Why are the changes needed? When parsing the partition spec, Spark will call `org.apache.spark.sql.catalyst.parser.ParserUtils.checkDuplicateKeys` to check if there are duplicate partition column names in the list. But this method is always case sensitive and doesn't detect duplicate partition column names when using different cases. ### Does this PR introduce _any_ user-facing change? Yep. 
This prevents users from writing incorrect queries such as `INSERT OVERWRITE t PARTITION (c='2', C='3') VALUES (1)` when they don't enable case sensitive conf. ### How was this patch tested? The new added test will fail without this change. Closes #31669 from zsxwing/SPARK-34556. Authored-by: Shixiong Zhu Signed-off-by: HyukjinKwon (cherry picked from commit 62737e140c7b04805726a33c392c297335db7b45) Signed-off-by: HyukjinKwon --- .../sql/catalyst/parser/AstBuilder.scala | 6 ++++- .../apache/spark/sql/SQLInsertTestSuite.scala | 22 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala index c3d593c47f74e..63906e11cebc7 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala @@ -491,7 +491,11 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with SQLConfHelper with Logg // Before calling `toMap`, we check duplicated keys to avoid silently ignore partition values // in partition spec like PARTITION(a='1', b='2', a='3'). The real semantical check for // partition columns will be done in analyzer. - checkDuplicateKeys(parts.toSeq, ctx) + if (conf.caseSensitiveAnalysis) { + checkDuplicateKeys(parts.toSeq, ctx) + } else { + checkDuplicateKeys(parts.map(kv => kv._1.toLowerCase(Locale.ROOT) -> kv._2).toSeq, ctx) + } parts.toMap } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala index c7446c7a9f443..67c5f12dc71dd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala @@ -208,6 +208,28 @@ trait SQLInsertTestSuite extends QueryTest with SQLTestUtils { checkAnswer(spark.table("t"), Row("1", null)) } } + + test("SPARK-34556: " + + "checking duplicate static partition columns should respect case sensitive conf") { + withTable("t") { + sql(s"CREATE TABLE t(i STRING, c string) USING PARQUET PARTITIONED BY (c)") + val e = intercept[AnalysisException] { + sql("INSERT OVERWRITE t PARTITION (c='2', C='3') VALUES (1)") + } + assert(e.getMessage.contains("Found duplicate keys 'c'")) + } + // The following code is skipped for Hive because columns stored in Hive Metastore is always + // case insensitive and we cannot create such table in Hive Metastore. + if (!format.startsWith("hive")) { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + withTable("t") { + sql(s"CREATE TABLE t(i int, c string, C string) USING PARQUET PARTITIONED BY (c, C)") + sql("INSERT OVERWRITE t PARTITION (c='2', C='3') VALUES (1)") + checkAnswer(spark.table("t"), Row(1, "2", "3")) + } + } + } + } } class FileSourceSQLInsertTestSuite extends SQLInsertTestSuite with SharedSparkSession { From 53997008faeb5abe73e7d9cc3274cdc40bfbee3a Mon Sep 17 00:00:00 2001 From: Amandeep Sharma Date: Tue, 2 Mar 2021 17:14:15 +0800 Subject: [PATCH 020/169] [SPARK-34417][SQL] org.apache.spark.sql.DataFrameNaFunctions.fillMap fails for column name having a dot **What changes were proposed in this pull request?** This PR fixes dataframe.na.fillMap() for column having a dot in the name as mentioned in [SPARK-34417](https://issues.apache.org/jira/browse/SPARK-34417). Use resolved attributes of a column for replacing null values. 
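A minimal usage sketch, mirroring the unit tests added below (column names and the fill value are taken from those tests):

```scala
import org.apache.spark.sql.SparkSession

// Quoting the dotted name with backticks now resolves the top-level column instead of
// being rejected as an unresolvable nested field.
val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val df = Seq(("abc", 23L), ("def", 44L), (null, 0L)).toDF("ColWith.Dot", "Col")
df.na.fill(Map("`ColWith.Dot`" -> "n/a")).show()
// the null in ColWith.Dot becomes "n/a"; Col is left untouched
```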
**Why are the changes needed?** dataframe.na.fillMap() does not work for column having a dot in the name **Does this PR introduce any user-facing change?** None **How was this patch tested?** Added unit test for the same Closes #31545 from amandeep-sharma/master. Lead-authored-by: Amandeep Sharma Co-authored-by: Amandeep Sharma Signed-off-by: Wenchen Fan (cherry picked from commit 4bda3c0f0225817456c4e423d4c85cc6b796f0c9) Signed-off-by: Wenchen Fan --- .../spark/sql/DataFrameNaFunctions.scala | 45 ++++++++++--------- .../spark/sql/DataFrameNaFunctionsSuite.scala | 25 +++++++++++ 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala index bbf0ac1dd85e9..308bb96502b19 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala @@ -395,10 +395,13 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { private def fillMap(values: Seq[(String, Any)]): DataFrame = { // Error handling - values.foreach { case (colName, replaceValue) => + val attrToValue = AttributeMap(values.map { case (colName, replaceValue) => // Check column name exists - df.resolve(colName) - + val attr = df.resolve(colName) match { + case a: Attribute => a + case _ => throw new UnsupportedOperationException( + s"Nested field ${colName} is not supported.") + } // Check data type replaceValue match { case _: jl.Double | _: jl.Float | _: jl.Integer | _: jl.Long | _: jl.Boolean | _: String => @@ -406,31 +409,29 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { case _ => throw new IllegalArgumentException( s"Unsupported value type ${replaceValue.getClass.getName} ($replaceValue).") } - } - - val columnEquals = df.sparkSession.sessionState.analyzer.resolver - val projections = df.schema.fields.map { f => - values.find { case (k, _) => columnEquals(k, f.name) }.map { case (_, v) => - v match { - case v: jl.Float => fillCol[Float](f, v) - case v: jl.Double => fillCol[Double](f, v) - case v: jl.Long => fillCol[Long](f, v) - case v: jl.Integer => fillCol[Integer](f, v) - case v: jl.Boolean => fillCol[Boolean](f, v.booleanValue()) - case v: String => fillCol[String](f, v) - } - }.getOrElse(df.col(f.name)) + attr -> replaceValue + }) + + val output = df.queryExecution.analyzed.output + val projections = output.map { + attr => attrToValue.get(attr).map { + case v: jl.Float => fillCol[Float](attr, v) + case v: jl.Double => fillCol[Double](attr, v) + case v: jl.Long => fillCol[Long](attr, v) + case v: jl.Integer => fillCol[Integer](attr, v) + case v: jl.Boolean => fillCol[Boolean](attr, v.booleanValue()) + case v: String => fillCol[String](attr, v) + }.getOrElse(Column(attr)) } df.select(projections : _*) } /** - * Returns a [[Column]] expression that replaces null value in `col` with `replacement`. - * It selects a column based on its name. + * Returns a [[Column]] expression that replaces null value in column defined by `attr` + * with `replacement`. 
*/ - private def fillCol[T](col: StructField, replacement: T): Column = { - val quotedColName = "`" + col.name + "`" - fillCol(col.dataType, col.name, df.col(quotedColName), replacement) + private def fillCol[T](attr: Attribute, replacement: T): Column = { + fillCol(attr.dataType, attr.name, Column(attr), replacement) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala index 091877f7cac37..23c2349f89574 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionsSuite.scala @@ -460,4 +460,29 @@ class DataFrameNaFunctionsSuite extends QueryTest with SharedSparkSession { Row(0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) :: Row(0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) :: Nil) } + + test("SPARK-34417 - test fillMap() for column with a dot in the name") { + val na = "n/a" + checkAnswer( + Seq(("abc", 23L), ("def", 44L), (null, 0L)).toDF("ColWith.Dot", "Col") + .na.fill(Map("`ColWith.Dot`" -> na)), + Row("abc", 23) :: Row("def", 44L) :: Row(na, 0L) :: Nil) + } + + test("SPARK-34417 - test fillMap() for qualified-column with a dot in the name") { + val na = "n/a" + checkAnswer( + Seq(("abc", 23L), ("def", 44L), (null, 0L)).toDF("ColWith.Dot", "Col").as("testDF") + .na.fill(Map("testDF.`ColWith.Dot`" -> na)), + Row("abc", 23) :: Row("def", 44L) :: Row(na, 0L) :: Nil) + } + + test("SPARK-34417 - test fillMap() for column without a dot in the name" + + " and dataframe with another column having a dot in the name") { + val na = "n/a" + checkAnswer( + Seq(("abc", 23L), ("def", 44L), (null, 0L)).toDF("Col", "ColWith.Dot") + .na.fill(Map("Col" -> na)), + Row("abc", 23) :: Row("def", 44L) :: Row(na, 0L) :: Nil) + } } From 59bd127e953a1a04f112dce92cd503d2a98f3568 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Tue, 2 Mar 2021 17:27:13 +0800 Subject: [PATCH 021/169] [SPARK-34547][SQL] Only use metadata columns for resolution as last resort ### What changes were proposed in this pull request? Today, child expressions may be resolved based on "real" or metadata output attributes. We should prefer the real attribute during resolution if one exists. ### Why are the changes needed? Today, attempting to resolve an expression when there is a "real" output attribute and a metadata attribute with the same name results in resolution failure. This is likely unexpected, as the user may not know about the metadata attribute. ### Does this PR introduce _any_ user-facing change? Yes. Previously, the user would see an error message when resolving a column with the same name as a "real" output attribute and a metadata attribute as below: ``` org.apache.spark.sql.AnalysisException: Reference 'index' is ambiguous, could be: testcat.ns1.ns2.tableTwo.index, testcat.ns1.ns2.tableOne.index.; line 1 pos 71 at org.apache.spark.sql.catalyst.expressions.package$AttributeSeq.resolve(package.scala:363) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveChildren(LogicalPlan.scala:107) ``` Now, resolution succeeds and provides the "real" output attribute. ### How was this patch tested? Added a unit test. Closes #31654 from karenfeng/fallback-resolve-metadata. 
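The added test boils down to the following kind of query; this is a hedged sketch that assumes an active SparkSession `spark`, a DSv2 table `tableOne` whose source exposes a metadata column `index`, and a temp view `t2` with a real column `index`, as set up in the new unit test:

```scala
// The bare `index` now resolves to t2's real column instead of failing with
// "Reference 'index' is ambiguous"; qualified references still work as before.
val resolved = spark.sql(
  """
    |SELECT tableOne.id, t2.id, data, index, tableOne.index, t2.index
    |FROM tableOne JOIN t2 WHERE tableOne.id = t2.id
  """.stripMargin)
resolved.show()
// tableOne.index still reaches the metadata column via explicit qualification
```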
Authored-by: Karen Feng Signed-off-by: Wenchen Fan (cherry picked from commit 2e54d68eb94cf39b59166f2b1bbb8f6c317760b8) Signed-off-by: Wenchen Fan --- .../catalyst/plans/logical/LogicalPlan.scala | 6 ++-- .../sql/connector/DataSourceV2SQLSuite.scala | 31 +++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index ad5c3fd74e9b5..781e4c21c3058 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -89,8 +89,9 @@ abstract class LogicalPlan } } - private[this] lazy val childAttributes = - AttributeSeq(children.flatMap(c => c.output ++ c.metadataOutput)) + private[this] lazy val childAttributes = AttributeSeq(children.flatMap(_.output)) + + private[this] lazy val childMetadataAttributes = AttributeSeq(children.flatMap(_.metadataOutput)) private[this] lazy val outputAttributes = AttributeSeq(output) @@ -103,6 +104,7 @@ abstract class LogicalPlan nameParts: Seq[String], resolver: Resolver): Option[NamedExpression] = childAttributes.resolve(nameParts, resolver) + .orElse(childMetadataAttributes.resolve(nameParts, resolver)) /** * Optionally resolves the given strings to a [[NamedExpression]] based on the output of this diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 0e12eba84eb03..38888c3fdd80b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -2649,6 +2649,37 @@ class DataSourceV2SQLSuite } } + test("SPARK-34547: metadata columns are resolved last") { + val t1 = s"${catalogAndNamespace}tableOne" + val t2 = "t2" + withTable(t1) { + sql(s"CREATE TABLE $t1 (id bigint, data string) USING $v2Format " + + "PARTITIONED BY (bucket(4, id), id)") + sql(s"INSERT INTO $t1 VALUES (1, 'a'), (2, 'b'), (3, 'c')") + withTempView(t2) { + sql(s"CREATE TEMPORARY VIEW $t2 AS SELECT * FROM " + + s"VALUES (1, -1), (2, -2), (3, -3) AS $t2(id, index)") + + val sqlQuery = spark.sql(s"SELECT $t1.id, $t2.id, data, index, $t1.index, $t2.index FROM " + + s"$t1 JOIN $t2 WHERE $t1.id = $t2.id") + val t1Table = spark.table(t1) + val t2Table = spark.table(t2) + val dfQuery = t1Table.join(t2Table, t1Table.col("id") === t2Table.col("id")) + .select(s"$t1.id", s"$t2.id", "data", "index", s"$t1.index", s"$t2.index") + + Seq(sqlQuery, dfQuery).foreach { query => + checkAnswer(query, + Seq( + Row(1, 1, "a", -1, 0, -1), + Row(2, 2, "b", -2, 0, -2), + Row(3, 3, "c", -3, 0, -3) + ) + ) + } + } + } + } + test("SPARK-33505: insert into partitioned table") { val t = "testpart.ns1.ns2.tbl" withTable(t) { From bcf662e78054470a821abc0dde6bd9afb36b2e4a Mon Sep 17 00:00:00 2001 From: Kris Mok Date: Wed, 3 Mar 2021 12:22:51 +0900 Subject: [PATCH 022/169] [SPARK-34596][SQL] Use Utils.getSimpleName to avoid hitting Malformed class name in NewInstance.doGenCode ### What changes were proposed in this pull request? Use `Utils.getSimpleName` to avoid hitting `Malformed class name` error in `NewInstance.doGenCode`. ### Why are the changes needed? On older JDK versions (e.g. 
JDK8u), nested Scala classes may trigger `java.lang.Class.getSimpleName` to throw an `java.lang.InternalError: Malformed class name` error. In this particular case, creating an `ExpressionEncoder` on such a nested Scala class would create a `NewInstance` expression under the hood, which will trigger the problem during codegen. Similar to https://github.com/apache/spark/pull/29050, we should use Spark's `Utils.getSimpleName` utility function in place of `Class.getSimpleName` to avoid hitting the issue. There are two other occurrences of `java.lang.Class.getSimpleName` in the same file, but they're safe because they're only guaranteed to be only used on Java classes, which don't have this problem, e.g.: ```scala // Make a copy of the data if it's unsafe-backed def makeCopyIfInstanceOf(clazz: Class[_ <: Any], value: String) = s"$value instanceof ${clazz.getSimpleName}? ${value}.copy() : $value" val genFunctionValue: String = lambdaFunction.dataType match { case StructType(_) => makeCopyIfInstanceOf(classOf[UnsafeRow], genFunction.value) case ArrayType(_, _) => makeCopyIfInstanceOf(classOf[UnsafeArrayData], genFunction.value) case MapType(_, _, _) => makeCopyIfInstanceOf(classOf[UnsafeMapData], genFunction.value) case _ => genFunction.value } ``` The Unsafe-* family of types are all Java types, so they're okay. ### Does this PR introduce _any_ user-facing change? Fixes a bug that throws an error when using `ExpressionEncoder` on some nested Scala types, otherwise no changes. ### How was this patch tested? Added a test case to `org.apache.spark.sql.catalyst.encoders.ExpressionEncoderSuite`. It'll fail on JDK8u before the fix, and pass after the fix. Closes #31709 from rednaxelafx/spark-34596-master. Authored-by: Kris Mok Signed-off-by: HyukjinKwon (cherry picked from commit ecf4811764f1ef91954c865a864e0bf6691f99a6) Signed-off-by: HyukjinKwon --- .../sql/catalyst/expressions/objects/objects.scala | 2 +- .../catalyst/encoders/ExpressionEncoderSuite.scala | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index f391b3128cf41..8801c7d3a9271 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -489,7 +489,7 @@ case class NewInstance( // that might be defined on the companion object. 
case 0 => s"$className$$.MODULE$$.apply($argString)" case _ => outer.map { gen => - s"${gen.value}.new ${cls.getSimpleName}($argString)" + s"${gen.value}.new ${Utils.getSimpleName(cls)}($argString)" }.getOrElse { s"new $className($argString)" } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala index f2598a925e08e..26352648638c7 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoderSuite.scala @@ -205,6 +205,18 @@ class ExpressionEncoderSuite extends CodegenInterpretedPlanTest with AnalysisTes encodeDecodeTest(Array(Option(InnerClass(1))), "array of optional inner class") + // holder class to trigger Class.getSimpleName issue + object MalformedClassObject extends Serializable { + case class MalformedNameExample(x: Int) + } + + { + OuterScopes.addOuterScope(MalformedClassObject) + encodeDecodeTest( + MalformedClassObject.MalformedNameExample(42), + "nested Scala class should work") + } + productTest(PrimitiveData(1, 1, 1, 1, 1, 1, true)) productTest( From 0ba776739fbb6930d6dbf55c9970b91d67cc723e Mon Sep 17 00:00:00 2001 From: yuhaiyang Date: Wed, 3 Mar 2021 12:25:38 +0800 Subject: [PATCH 023/169] [SPARK-34534] Fix blockIds order when use FetchShuffleBlocks to fetch blocks ### What changes were proposed in this pull request? Fix a problems which can lead to data correctness after part blocks retry in `OneForOneBlockFetcher` when use `FetchShuffleBlocks` . ### Why are the changes needed? This is a data correctness bug, It's is no problems when use old protocol to send `OpenBlocks` before fetch chunks in `OneForOneBlockFetcher`; In latest branch, `OpenBlocks` has been replaced to `FetchShuffleBlocks`. Howerver, `FetchShuffleBlocks` read shuffle blocks order is not the same as `blockIds` in `OneForOneBlockFetcher`; the `blockIds` is used to match blockId with shuffle data with index, now it is out of order; It will lead to read wrong block chunk when some blocks fetch failed in `OneForOneBlockFetcher`, it will retry the rest of the blocks in `blockIds` based on the `blockIds`'s order. ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? Closes #31643 from seayoun/yuhaiyang_fix_use_FetchShuffleBlocks_order. 
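A standalone Scala sketch of the ordering hazard being fixed (block ids are made up; the real code lives in `OneForOneBlockFetcher`): when block ids are regrouped by map id, the flattened order must match the order the fetcher later uses to pair returned chunks with block ids, including on retries.

```scala
import scala.collection.mutable

// Illustrative only; block id format is shuffle_<shuffleId>_<mapId>_<reduceId>.
// An insertion-ordered map keeps the rebuilt id list aligned with the request order,
// which is what the fix relies on.
val requested = Seq("shuffle_0_0_0", "shuffle_0_2_1", "shuffle_0_10_2")

val byMapId = mutable.LinkedHashMap.empty[Long, mutable.ArrayBuffer[String]]
requested.foreach { id =>
  val mapId = id.split("_")(2).toLong
  byMapId.getOrElseUpdate(mapId, mutable.ArrayBuffer.empty[String]) += id
}

val rebuilt = byMapId.values.flatten.toSeq
assert(rebuilt == requested) // an unordered HashMap would not guarantee this equality
```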
Lead-authored-by: yuhaiyang Co-authored-by: yuhaiyang Signed-off-by: Wenchen Fan (cherry picked from commit 4e438196114eff2e1fc4dd726fdc1bda1af267da) Signed-off-by: Wenchen Fan --- .../shuffle/OneForOneBlockFetcher.java | 53 ++++++++++++++----- .../shuffle/OneForOneBlockFetcherSuite.java | 42 +++++++++++++++ 2 files changed, 81 insertions(+), 14 deletions(-) diff --git a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java index ec2e3dce661d9..0b7eaa6225a41 100644 --- a/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java +++ b/common/network-shuffle/src/main/java/org/apache/spark/network/shuffle/OneForOneBlockFetcher.java @@ -21,7 +21,7 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; -import java.util.HashMap; +import java.util.LinkedHashMap; import com.google.common.primitives.Ints; import com.google.common.primitives.Longs; @@ -81,7 +81,6 @@ public OneForOneBlockFetcher( TransportConf transportConf, DownloadFileManager downloadFileManager) { this.client = client; - this.blockIds = blockIds; this.listener = listener; this.chunkCallback = new ChunkCallback(); this.transportConf = transportConf; @@ -90,8 +89,10 @@ public OneForOneBlockFetcher( throw new IllegalArgumentException("Zero-sized blockIds array"); } if (!transportConf.useOldFetchProtocol() && isShuffleBlocks(blockIds)) { - this.message = createFetchShuffleBlocksMsg(appId, execId, blockIds); + this.blockIds = new String[blockIds.length]; + this.message = createFetchShuffleBlocksMsgAndBuildBlockIds(appId, execId, blockIds); } else { + this.blockIds = blockIds; this.message = new OpenBlocks(appId, execId, blockIds); } } @@ -106,17 +107,16 @@ private boolean isShuffleBlocks(String[] blockIds) { } /** - * Analyze the pass in blockIds and create FetchShuffleBlocks message. - * The blockIds has been sorted by mapId and reduceId. It's produced in - * org.apache.spark.MapOutputTracker.convertMapStatuses. + * Create FetchShuffleBlocks message and rebuild internal blockIds by + * analyzing the pass in blockIds. 
*/ - private FetchShuffleBlocks createFetchShuffleBlocksMsg( + private FetchShuffleBlocks createFetchShuffleBlocksMsgAndBuildBlockIds( String appId, String execId, String[] blockIds) { String[] firstBlock = splitBlockId(blockIds[0]); int shuffleId = Integer.parseInt(firstBlock[1]); boolean batchFetchEnabled = firstBlock.length == 5; - HashMap> mapIdToReduceIds = new HashMap<>(); + LinkedHashMap mapIdToBlocksInfo = new LinkedHashMap<>(); for (String blockId : blockIds) { String[] blockIdParts = splitBlockId(blockId); if (Integer.parseInt(blockIdParts[1]) != shuffleId) { @@ -124,23 +124,36 @@ private FetchShuffleBlocks createFetchShuffleBlocksMsg( ", got:" + blockId); } long mapId = Long.parseLong(blockIdParts[2]); - if (!mapIdToReduceIds.containsKey(mapId)) { - mapIdToReduceIds.put(mapId, new ArrayList<>()); + if (!mapIdToBlocksInfo.containsKey(mapId)) { + mapIdToBlocksInfo.put(mapId, new BlocksInfo()); } - mapIdToReduceIds.get(mapId).add(Integer.parseInt(blockIdParts[3])); + BlocksInfo blocksInfoByMapId = mapIdToBlocksInfo.get(mapId); + blocksInfoByMapId.blockIds.add(blockId); + blocksInfoByMapId.reduceIds.add(Integer.parseInt(blockIdParts[3])); if (batchFetchEnabled) { // When we read continuous shuffle blocks in batch, we will reuse reduceIds in // FetchShuffleBlocks to store the start and end reduce id for range // [startReduceId, endReduceId). assert(blockIdParts.length == 5); - mapIdToReduceIds.get(mapId).add(Integer.parseInt(blockIdParts[4])); + blocksInfoByMapId.reduceIds.add(Integer.parseInt(blockIdParts[4])); } } - long[] mapIds = Longs.toArray(mapIdToReduceIds.keySet()); + long[] mapIds = Longs.toArray(mapIdToBlocksInfo.keySet()); int[][] reduceIdArr = new int[mapIds.length][]; + int blockIdIndex = 0; for (int i = 0; i < mapIds.length; i++) { - reduceIdArr[i] = Ints.toArray(mapIdToReduceIds.get(mapIds[i])); + BlocksInfo blocksInfoByMapId = mapIdToBlocksInfo.get(mapIds[i]); + reduceIdArr[i] = Ints.toArray(blocksInfoByMapId.reduceIds); + + // The `blockIds`'s order must be same with the read order specified in in FetchShuffleBlocks + // because the shuffle data's return order should match the `blockIds`'s order to ensure + // blockId and data match. + for (int j = 0; j < blocksInfoByMapId.blockIds.size(); j++) { + this.blockIds[blockIdIndex++] = blocksInfoByMapId.blockIds.get(j); + } } + assert(blockIdIndex == this.blockIds.length); + return new FetchShuffleBlocks( appId, execId, shuffleId, mapIds, reduceIdArr, batchFetchEnabled); } @@ -157,6 +170,18 @@ private String[] splitBlockId(String blockId) { return blockIdParts; } + /** The reduceIds and blocks in a single mapId */ + private class BlocksInfo { + + final ArrayList reduceIds; + final ArrayList blockIds; + + BlocksInfo() { + this.reduceIds = new ArrayList<>(); + this.blockIds = new ArrayList<>(); + } + } + /** Callback invoked on receipt of each chunk. We equate a single chunk to a single block. 
*/ private class ChunkCallback implements ChunkReceivedCallback { @Override diff --git a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java index 285eedb39c65c..a7eb59d366966 100644 --- a/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java +++ b/common/network-shuffle/src/test/java/org/apache/spark/network/shuffle/OneForOneBlockFetcherSuite.java @@ -201,6 +201,48 @@ public void testEmptyBlockFetch() { } } + @Test + public void testFetchShuffleBlocksOrder() { + LinkedHashMap blocks = Maps.newLinkedHashMap(); + blocks.put("shuffle_0_0_0", new NioManagedBuffer(ByteBuffer.wrap(new byte[1]))); + blocks.put("shuffle_0_2_1", new NioManagedBuffer(ByteBuffer.wrap(new byte[2]))); + blocks.put("shuffle_0_10_2", new NettyManagedBuffer(Unpooled.wrappedBuffer(new byte[3]))); + String[] blockIds = blocks.keySet().toArray(new String[blocks.size()]); + + BlockFetchingListener listener = fetchBlocks( + blocks, + blockIds, + new FetchShuffleBlocks("app-id", "exec-id", 0, + new long[]{0, 2, 10}, new int[][]{{0}, {1}, {2}}, false), + conf); + + for (int chunkIndex = 0; chunkIndex < blockIds.length; chunkIndex++) { + String blockId = blockIds[chunkIndex]; + verify(listener).onBlockFetchSuccess(blockId, blocks.get(blockId)); + } + } + + @Test + public void testBatchFetchShuffleBlocksOrder() { + LinkedHashMap blocks = Maps.newLinkedHashMap(); + blocks.put("shuffle_0_0_1_2", new NioManagedBuffer(ByteBuffer.wrap(new byte[1]))); + blocks.put("shuffle_0_2_2_3", new NioManagedBuffer(ByteBuffer.wrap(new byte[2]))); + blocks.put("shuffle_0_10_3_4", new NettyManagedBuffer(Unpooled.wrappedBuffer(new byte[3]))); + String[] blockIds = blocks.keySet().toArray(new String[blocks.size()]); + + BlockFetchingListener listener = fetchBlocks( + blocks, + blockIds, + new FetchShuffleBlocks("app-id", "exec-id", 0, + new long[]{0, 2, 10}, new int[][]{{1, 2}, {2, 3}, {3, 4}}, true), + conf); + + for (int chunkIndex = 0; chunkIndex < blockIds.length; chunkIndex++) { + String blockId = blockIds[chunkIndex]; + verify(listener).onBlockFetchSuccess(blockId, blocks.get(blockId)); + } + } + /** * Begins a fetch on the given set of blocks by mocking out the server side of the RPC which * simply returns the given (BlockId, Block) pairs. From b3500ce4f0e089eddedfae1fb382a37a40d60280 Mon Sep 17 00:00:00 2001 From: Karen Feng Date: Wed, 3 Mar 2021 22:07:41 +0800 Subject: [PATCH 024/169] [SPARK-34555][SQL] Resolve metadata output from DataFrame Add metadataOutput as a fallback to resolution. Builds off https://github.com/apache/spark/pull/31654. The metadata columns could not be resolved via `df.col("metadataColName")` from the DataFrame API. Yes, the metadata columns can now be resolved as described above. Scala unit test. Closes #31668 from karenfeng/spark-34555. 
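Usage sketch, following the added test; it assumes an active SparkSession `spark` and a DSv2 table (the name is illustrative) whose source exposes the `index` and `_partition` metadata columns:

```scala
// With this change the metadata columns can be reached through DataFrame column
// resolution as well, not only through SQL.
val table = spark.table("testcat.ns1.ns2.table")
table.select(
  table.col("id"),
  table.col("data"),
  table.col("index"),       // metadata column
  table.col("_partition")   // metadata column
).show()
```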
Authored-by: Karen Feng Signed-off-by: Wenchen Fan (cherry picked from commit b01dd12805f7b40318f183ee48bc0012bb4e847f) Signed-off-by: Wenchen Fan --- .../catalyst/plans/logical/LogicalPlan.scala | 5 +++- .../sql/connector/DataSourceV2SQLSuite.scala | 23 ++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala index 781e4c21c3058..bdf37d040eb79 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala @@ -95,6 +95,8 @@ abstract class LogicalPlan private[this] lazy val outputAttributes = AttributeSeq(output) + private[this] lazy val outputMetadataAttributes = AttributeSeq(metadataOutput) + /** * Optionally resolves the given strings to a [[NamedExpression]] using the input from all child * nodes of this LogicalPlan. The attribute is expressed as @@ -115,6 +117,7 @@ abstract class LogicalPlan nameParts: Seq[String], resolver: Resolver): Option[NamedExpression] = outputAttributes.resolve(nameParts, resolver) + .orElse(outputMetadataAttributes.resolve(nameParts, resolver)) /** * Given an attribute name, split it to name parts by dot, but @@ -124,7 +127,7 @@ abstract class LogicalPlan def resolveQuoted( name: String, resolver: Resolver): Option[NamedExpression] = { - outputAttributes.resolve(UnresolvedAttribute.parseAttributeName(name), resolver) + resolve(UnresolvedAttribute.parseAttributeName(name), resolver) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala index 38888c3fdd80b..3e066c977c15a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/DataSourceV2SQLSuite.scala @@ -35,7 +35,7 @@ import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} import org.apache.spark.sql.internal.SQLConf.{PARTITION_OVERWRITE_MODE, PartitionOverwriteMode, V2_SESSION_CATALOG_IMPLEMENTATION} import org.apache.spark.sql.internal.connector.SimpleTableProvider import org.apache.spark.sql.sources.SimpleScanSource -import org.apache.spark.sql.types.{BooleanType, LongType, StringType, StructField, StructType} +import org.apache.spark.sql.types.{BooleanType, LongType, MetadataBuilder, StringType, StructField, StructType} import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.storage.StorageLevel import org.apache.spark.unsafe.types.UTF8String @@ -2699,6 +2699,27 @@ class DataSourceV2SQLSuite } } + test("SPARK-34555: Resolve DataFrame metadata column") { + val tbl = s"${catalogAndNamespace}table" + withTable(tbl) { + sql(s"CREATE TABLE $tbl (id bigint, data string) USING $v2Format " + + "PARTITIONED BY (bucket(4, id), id)") + sql(s"INSERT INTO $tbl VALUES (1, 'a'), (2, 'b'), (3, 'c')") + val table = spark.table(tbl) + val dfQuery = table.select( + table.col("id"), + table.col("data"), + table.col("index"), + table.col("_partition") + ) + + checkAnswer( + dfQuery, + Seq(Row(1, "a", 0, "3/1"), Row(2, "b", 0, "0/2"), Row(3, "c", 0, "1/3")) + ) + } + } + private def testNotSupportedV2Command( sqlCommand: String, sqlParams: String, From c41b543773cc0c9b1199652fc66d84b797d68c35 Mon Sep 17 00:00:00 2001 From: Wenchen 
Fan Date: Thu, 4 Mar 2021 11:29:34 +0900 Subject: [PATCH 025/169] [SPARK-34584][SQL] Static partition should also follow StoreAssignmentPolicy when insert into v2 tables ### What changes were proposed in this pull request? This is a followup of https://github.com/apache/spark/pull/27597 and simply apply the fix in the v2 table insertion code path. ### Why are the changes needed? bug fix ### Does this PR introduce _any_ user-facing change? yes, now v2 table insertion with static partitions also follow StoreAssignmentPolicy. ### How was this patch tested? moved the test from https://github.com/apache/spark/pull/27597 to the general test suite `SQLInsertTestSuite`, which covers DS v2, file source, and hive tables. Closes #31726 from cloud-fan/insert. Authored-by: Wenchen Fan Signed-off-by: HyukjinKwon (cherry picked from commit 8f1eec4d138da604b890111d0a6daaef86d44ef2) Signed-off-by: HyukjinKwon --- .../sql/catalyst/analysis/Analyzer.scala | 6 +++- .../apache/spark/sql/SQLInsertTestSuite.scala | 29 ++++++++++++++++++- .../spark/sql/sources/InsertSuite.scala | 21 -------------- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index bf8003105ada6..f71139fafcc04 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1304,7 +1304,11 @@ class Analyzer(override val catalogManager: CatalogManager) relation.output.flatMap { col => outputNameToStaticName.get(col.name).flatMap(staticPartitions.get) match { case Some(staticValue) => - Some(Alias(Cast(Literal(staticValue), col.dataType), col.name)()) + // SPARK-30844: try our best to follow StoreAssignmentPolicy for static partition + // values but not completely follow because we can't do static type checking due to + // the reason that the parser has erased the type info of static partition values + // and converted them to string. + Some(Alias(AnsiCast(Literal(staticValue), col.dataType), col.name)()) case _ if queryColumns.hasNext => Some(queryColumns.next) case _ => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala index 67c5f12dc71dd..a0943437bc8b9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLInsertTestSuite.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils} /** - * The base trait for DML - insert syntax + * The base trait for SQL INSERT. 
*/ trait SQLInsertTestSuite extends QueryTest with SQLTestUtils { @@ -230,6 +230,33 @@ trait SQLInsertTestSuite extends QueryTest with SQLTestUtils { } } } + + test("SPARK-30844: static partition should also follow StoreAssignmentPolicy") { + val testingPolicies = if (format == "foo") { + // DS v2 doesn't support the legacy policy + Seq(SQLConf.StoreAssignmentPolicy.ANSI, SQLConf.StoreAssignmentPolicy.STRICT) + } else { + SQLConf.StoreAssignmentPolicy.values + } + testingPolicies.foreach { policy => + withSQLConf( + SQLConf.STORE_ASSIGNMENT_POLICY.key -> policy.toString) { + withTable("t") { + sql("create table t(a int, b string) using parquet partitioned by (a)") + policy match { + case SQLConf.StoreAssignmentPolicy.ANSI | SQLConf.StoreAssignmentPolicy.STRICT => + val errorMsg = intercept[NumberFormatException] { + sql("insert into t partition(a='ansi') values('ansi')") + }.getMessage + assert(errorMsg.contains("invalid input syntax for type numeric: ansi")) + case SQLConf.StoreAssignmentPolicy.LEGACY => + sql("insert into t partition(a='ansi') values('ansi')") + checkAnswer(sql("select * from t"), Row("ansi", null) :: Nil) + } + } + } + } + } } class FileSourceSQLInsertTestSuite extends SQLInsertTestSuite with SharedSparkSession { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala index aaf8765c04425..bce55ac34419f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/sources/InsertSuite.scala @@ -797,27 +797,6 @@ class InsertSuite extends DataSourceTest with SharedSparkSession { } } - test("SPARK-30844: static partition should also follow StoreAssignmentPolicy") { - SQLConf.StoreAssignmentPolicy.values.foreach { policy => - withSQLConf( - SQLConf.STORE_ASSIGNMENT_POLICY.key -> policy.toString) { - withTable("t") { - sql("create table t(a int, b string) using parquet partitioned by (a)") - policy match { - case SQLConf.StoreAssignmentPolicy.ANSI | SQLConf.StoreAssignmentPolicy.STRICT => - val errorMsg = intercept[NumberFormatException] { - sql("insert into t partition(a='ansi') values('ansi')") - }.getMessage - assert(errorMsg.contains("invalid input syntax for type numeric: ansi")) - case SQLConf.StoreAssignmentPolicy.LEGACY => - sql("insert into t partition(a='ansi') values('ansi')") - checkAnswer(sql("select * from t"), Row("ansi", null) :: Nil) - } - } - } - } - } - test("SPARK-24860: dynamic partition overwrite specified per source without catalog table") { withTempPath { path => Seq((1, 1), (2, 2)).toDF("i", "part") From 463e130d855b95254bb40422c4cf4da8ff2cc766 Mon Sep 17 00:00:00 2001 From: Shixiong Zhu Date: Thu, 4 Mar 2021 15:12:53 +0800 Subject: [PATCH 026/169] [SPARK-34599][SQL] Fix the issue that INSERT INTO OVERWRITE doesn't support partition columns containing dot for DSv2 ### What changes were proposed in this pull request? `ResolveInsertInto.staticDeleteExpression` should use `UnresolvedAttribute.quoted` to create the delete expression so that we will treat the entire `attr.name` as a column name. ### Why are the changes needed? When users use `dot` in a partition column name, queries like ```INSERT OVERWRITE $t1 PARTITION (`a.b` = 'a') (`c.d`) VALUES('b')``` is not working. ### Does this PR introduce _any_ user-facing change? 
Without this test, the above query will throw ``` [info] org.apache.spark.sql.AnalysisException: cannot resolve '`a.b`' given input columns: [a.b, c.d]; [info] 'OverwriteByExpression RelationV2[a.b#17, c.d#18] default.tbl, ('a.b <=> cast(a as string)), false [info] +- Project [a.b#19, ansi_cast(col1#16 as string) AS c.d#20] [info] +- Project [cast(a as string) AS a.b#19, col1#16] [info] +- LocalRelation [col1#16] ``` With the fix, the query will run correctly. ### How was this patch tested? The new added test. Closes #31713 from zsxwing/SPARK-34599. Authored-by: Shixiong Zhu Signed-off-by: Wenchen Fan (cherry picked from commit 53e4dba7c489ac5c0ad61f0121c4e247de5b485c) Signed-off-by: Wenchen Fan --- .../apache/spark/sql/catalyst/analysis/Analyzer.scala | 4 +++- .../apache/spark/sql/connector/InsertIntoTests.scala | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala index f71139fafcc04..771b817955854 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala @@ -1336,7 +1336,9 @@ class Analyzer(override val catalogManager: CatalogManager) // ResolveOutputRelation runs, using the query's column names that will match the // table names at that point. because resolution happens after a future rule, create // an UnresolvedAttribute. - EqualNullSafe(UnresolvedAttribute(attr.name), Cast(Literal(value), attr.dataType)) + EqualNullSafe( + UnresolvedAttribute.quoted(attr.name), + Cast(Literal(value), attr.dataType)) case None => throw QueryCompilationErrors.unknownStaticPartitionColError(name) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala b/sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala index 2cc7a1f994645..ad730376b2e3a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/connector/InsertIntoTests.scala @@ -477,5 +477,15 @@ trait InsertIntoSQLOnlyTests verifyTable(t1, spark.table(view)) } } + + test("SPARK-34599: InsertInto: overwrite - dot in the partition column name - static mode") { + import testImplicits._ + val t1 = "tbl" + withTable(t1) { + sql(s"CREATE TABLE $t1 (`a.b` string, `c.d` string) USING $v2Format PARTITIONED BY (`a.b`)") + sql(s"INSERT OVERWRITE $t1 PARTITION (`a.b` = 'a') (`c.d`) VALUES('b')") + verifyTable(t1, Seq("a" -> "b").toDF("id", "data")) + } + } } } From f3fdc8d68f7b7ff7e2d76db374f69c49219c3a67 Mon Sep 17 00:00:00 2001 From: Angerszhuuuu Date: Thu, 4 Mar 2021 20:42:47 +0800 Subject: [PATCH 027/169] [SPARK-34567][SQL] CreateTableAsSelect should update metrics too ### What changes were proposed in this pull request? For command `CreateTableAsSelect` we use `InsertIntoHiveTable`, `InsertIntoHadoopFsRelationCommand` to insert data. We will update metrics of `InsertIntoHiveTable`, `InsertIntoHadoopFsRelationCommand` in `FileFormatWriter.write()`, but we only show CreateTableAsSelectCommand in WebUI SQL Tab. We need to update `CreateTableAsSelectCommand`'s metrics too. 
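A minimal way to reproduce what the screenshots below illustrate; the table name and data are made up, and an active SparkSession `spark` is assumed:

```scala
// Any CTAS that writes through InsertIntoHadoopFsRelationCommand / InsertIntoHiveTable
// will do: after this change the write metrics collected in FileFormatWriter.write()
// also show up on the CreateTableAsSelect node in the SQL tab.
spark.sql(
  """
    |CREATE TABLE ctas_metrics_demo USING parquet
    |AS SELECT id, id % 10 AS bucket FROM range(100000)
  """.stripMargin)
```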
Before this PR:

![image](https://user-images.githubusercontent.com/46485123/109411226-81f44480-79db-11eb-99cb-b9686b15bf61.png)

After this PR:

![image](https://user-images.githubusercontent.com/46485123/109411232-8ae51600-79db-11eb-9111-3bea0bc2d475.png)

![image](https://user-images.githubusercontent.com/46485123/109905192-62aa2f80-7cd9-11eb-91f9-04b16c9238ae.png)

### Why are the changes needed?

Complete SQL Metrics

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?