
Commit e9e5f39

wangyum authored and GitHub Enterprise committed
[CARMEL-4174] Backport Hash join PR / Merge pull request #1054 from carmel/hash_join
Backport Hash join PR
2 parents 97b0ece + 590dc6b commit e9e5f39

12 files changed (+307 lines, -77 lines)


sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/BroadcastJoinOuterJoinStreamSide.scala

Lines changed: 2 additions & 1 deletion
@@ -36,7 +36,8 @@ object BroadcastJoinOuterJoinStreamSide extends Rule[LogicalPlan] with JoinSelec
        LeftOuter | LeftSemi | LeftAnti, _, _) =>
       j
     case j @ ExtractEquiJoinKeys(LeftOuter | LeftSemi | LeftAnti,
-      leftKeys, _, None, left, right, hint) if leftKeys.nonEmpty && muchSmaller(left, right) &&
+      leftKeys, _, None, left, right, hint)
+      if leftKeys.nonEmpty && muchSmaller(left, right, conf) &&
       !(hintToBroadcastRight(hint) || canBroadcastBySize(right, conf)) &&
       (hintToBroadcastLeft(hint) || canBroadcastBySize(left, conf)) =>
       logInfo("BroadcastJoinOuterJoinStreamSide detected.")

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/joins.scala

Lines changed: 30 additions & 5 deletions
@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.util.Utils

 /**
  * Reorder the joins and push all the conditions into join, so that the bottom ones have at least
@@ -270,12 +271,18 @@ trait JoinSelectionHelper {
     val buildLeft = if (hintOnly) {
       hintToShuffleHashJoinLeft(hint)
     } else {
-      canBuildLocalHashMapBySize(left, conf) && muchSmaller(left, right)
+      hintToPreferShuffleHashJoinLeft(hint) ||
+        (!conf.preferSortMergeJoin && canBuildLocalHashMapBySize(left, conf) &&
+          muchSmaller(left, right, conf)) ||
+        forceApplyShuffledHashJoin(conf)
     }
     val buildRight = if (hintOnly) {
       hintToShuffleHashJoinRight(hint)
     } else {
-      canBuildLocalHashMapBySize(right, conf) && muchSmaller(right, left)
+      hintToPreferShuffleHashJoinRight(hint) ||
+        (!conf.preferSortMergeJoin && canBuildLocalHashMapBySize(right, conf) &&
+          muchSmaller(right, left, conf)) ||
+        forceApplyShuffledHashJoin(conf)
     }
     getBuildSide(
       canBuildShuffledHashJoinLeft(joinType) && buildLeft,
@@ -366,6 +373,14 @@ trait JoinSelectionHelper {
     hint.rightHint.exists(_.strategy.contains(SHUFFLE_HASH))
   }

+  def hintToPreferShuffleHashJoinLeft(hint: JoinHint): Boolean = {
+    hint.leftHint.exists(_.strategy.contains(PREFER_SHUFFLE_HASH))
+  }
+
+  def hintToPreferShuffleHashJoinRight(hint: JoinHint): Boolean = {
+    hint.rightHint.exists(_.strategy.contains(PREFER_SHUFFLE_HASH))
+  }
+
   def hintToSortMergeJoin(hint: JoinHint): Boolean = {
     hint.leftHint.exists(_.strategy.contains(SHUFFLE_MERGE)) ||
       hint.rightHint.exists(_.strategy.contains(SHUFFLE_MERGE))
@@ -405,14 +420,15 @@ trait JoinSelectionHelper {
   }

   /**
-   * Returns whether plan a is much smaller (3X) than plan b.
+   * Returns true if the data size of plan a multiplied by SHUFFLE_HASH_JOIN_FACTOR
+   * is smaller than plan b.
    *
    * The cost to build hash map is higher than sorting, we should only build hash map on a table
    * that is much smaller than other one. Since we does not have the statistic for number of rows,
    * use the size of bytes here as estimation.
    */
-  def muchSmaller(a: LogicalPlan, b: LogicalPlan): Boolean = {
-    a.stats.sizeInBytes * 3 <= b.stats.sizeInBytes
+  def muchSmaller(a: LogicalPlan, b: LogicalPlan, conf: SQLConf): Boolean = {
+    a.stats.sizeInBytes * conf.getConf(SQLConf.SHUFFLE_HASH_JOIN_FACTOR) <= b.stats.sizeInBytes
   }

   def canBroadcastTokenTree(left: LogicalPlan,
@@ -436,5 +452,14 @@ trait JoinSelectionHelper {
     right.stats.sizeInBytes <= conf.containsJoinThreshold &&
       !hintToNotBroadcastRight(hint)
   }
+
+  /**
+   * Returns whether a shuffled hash join should be force applied.
+   * The config key is hard-coded because it's testing only and should not be exposed.
+   */
+  private def forceApplyShuffledHashJoin(conf: SQLConf): Boolean = {
+    Utils.isTesting &&
+      conf.getConfString("spark.sql.join.forceApplyShuffledHashJoin", "false") == "true"
+  }
 }
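For intuition, the build-side test above now reduces to a single size comparison: the candidate build side, multiplied by spark.sql.shuffledHashJoinFactor, must not exceed the other side. A minimal standalone sketch (not from the patch; plain Longs stand in for the BigInt plan statistics and the conf lookup):

// Sketch of the factor-based "much smaller" check; sizeA/sizeB mirror
// a.stats.sizeInBytes / b.stats.sizeInBytes, factor mirrors SHUFFLE_HASH_JOIN_FACTOR.
def muchSmallerBySize(sizeA: Long, sizeB: Long, factor: Long): Boolean =
  sizeA * factor <= sizeB

val mb = 1024L * 1024L
assert(muchSmallerBySize(100 * mb, 400 * mb, factor = 3))   // 300 MB <= 400 MB: eligible
assert(!muchSmallerBySize(100 * mb, 400 * mb, factor = 5))  // 500 MB > 400 MB: not eligible

With the default factor of 3 this reproduces the old hard-coded 3X rule; raising the factor makes shuffled hash join selection more conservative.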

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala

Lines changed: 8 additions & 0 deletions
@@ -197,6 +197,14 @@ case object NO_BROADCAST_HASH extends JoinStrategyHint {
   override def hintAliases: Set[String] = Set.empty
 }

+/**
+ * An internal hint to encourage shuffle hash join, used by adaptive query execution.
+ */
+case object PREFER_SHUFFLE_HASH extends JoinStrategyHint {
+  override def displayName: String = "prefer_shuffle_hash"
+  override def hintAliases: Set[String] = Set.empty
+}
+
 /**
  * The callback for implementing customized strategies of handling hint errors.
  */

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 19 additions & 0 deletions
@@ -498,6 +498,14 @@ object SQLConf {
     .intConf
     .createWithDefault(100000)

+  val SHUFFLE_HASH_JOIN_FACTOR = buildConf("spark.sql.shuffledHashJoinFactor")
+    .doc("The shuffle hash join can be selected if the data size of small" +
+      " side multiplied by this factor is still smaller than the large side.")
+    .version("3.3.0")
+    .intConf
+    .checkValue(_ >= 1, "The shuffle hash join factor cannot be negative.")
+    .createWithDefault(3)
+
   val LIMIT_SCALE_UP_FACTOR = buildConf("spark.sql.limit.scaleUpFactor")
     .internal()
     .doc("Minimal increase rate in number of partitions between attempts when executing a take " +
@@ -696,6 +704,17 @@ object SQLConf {
     .bytesConf(ByteUnit.BYTE)
     .createOptional

+  val ADAPTIVE_MAX_SHUFFLE_HASH_JOIN_LOCAL_MAP_THRESHOLD =
+    buildConf("spark.sql.adaptive.maxShuffledHashJoinLocalMapThreshold")
+      .doc("Configures the maximum size in bytes per partition that can be allowed to build " +
+        "local hash map. If this value is not smaller than " +
+        s"${ADVISORY_PARTITION_SIZE_IN_BYTES.key} and all the partition size are not larger " +
+        "than this config, join selection prefer to use shuffled hash join instead of " +
+        s"sort merge join regardless of the value of ${PREFER_SORTMERGEJOIN.key}.")
+      .version("3.2.0")
+      .bytesConf(ByteUnit.BYTE)
+      .createWithDefault(0L)
+
   val SUBEXPRESSION_ELIMINATION_ENABLED =
     buildConf("spark.sql.subexpressionElimination.enabled")
       .internal()
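A hedged example of exercising the two new knobs at session level (the values are illustrative, not recommendations; `spark` is an existing SparkSession). Per the DynamicJoinSelection rule below, the AQE threshold only takes effect when it is at least spark.sql.adaptive.advisoryPartitionSizeInBytes:

// Illustrative session-level settings only.
spark.conf.set("spark.sql.shuffledHashJoinFactor", "5")
spark.conf.set("spark.sql.adaptive.advisoryPartitionSizeInBytes", "64MB")
spark.conf.set("spark.sql.adaptive.maxShuffledHashJoinLocalMapThreshold", "64MB")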

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkStrategies.scala

Lines changed: 1 addition & 7 deletions
@@ -226,13 +226,7 @@ abstract class SparkStrategies extends QueryPlanner[SparkPlan] {

       def createJoinWithoutHint() = {
         createBroadcastHashJoin(false)
-          .orElse {
-            if (!conf.preferSortMergeJoin) {
-              createShuffleHashJoin(false)
-            } else {
-              None
-            }
-          }
+          .orElse(createShuffleHashJoin(false))
           .orElse(createSortMergeJoin())
           .orElse(createCartesianProduct())
           .getOrElse {
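The preferSortMergeJoin guard is not lost here: it moved into getShuffleHashJoinBuildSide (see joins.scala above), so createShuffleHashJoin(false) simply returns None when no build side is eligible. A tiny sketch of the resulting orElse chain semantics, with strings standing in for the real planner helpers:

// Each candidate is an Option; orElse evaluates the next candidate only if the previous is empty.
val chosen: Option[String] =
  Option.empty[String]                    // broadcast hash join not applicable
    .orElse(Some("shuffled hash join"))   // always consulted now; gated inside the helper
    .orElse(Some("sort merge join"))      // not evaluated, the previous step already succeeded
assert(chosen.contains("shuffled hash join"))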

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEOptimizer.scala

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ class AQEOptimizer(sparkSession: SparkSession) extends RuleExecutor[LogicalPlan]
   private val conf = sparkSession.sessionState.conf

   private val defaultBatches = Seq(
-    Batch("Demote BroadcastHashJoin", Once, DemoteBroadcastHashJoin),
+    Batch("Dynamic Join Selection", Once, DynamicJoinSelection),
     Batch("Adaptive Bloom Filter Join", Once, AdaptiveBloomFilterJoin(sparkSession)),
     Batch("Eliminate Join to Empty Relation", Once, EliminateJoinToEmptyRelation),
     Batch("Optimize bloom filter Join", Once, OptimizeBloomFilterJoin)

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DemoteBroadcastHashJoin.scala

Lines changed: 0 additions & 61 deletions
This file was deleted.
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/DynamicJoinSelection.scala

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.adaptive
+
+import org.apache.spark.MapOutputStatistics
+import org.apache.spark.sql.catalyst.optimizer.JoinSelectionHelper
+import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
+import org.apache.spark.sql.catalyst.plans.{LeftAnti, LeftOuter, RightOuter}
+import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, Join, JoinStrategyHint, LogicalPlan, NO_BROADCAST_HASH, PREFER_SHUFFLE_HASH, SHUFFLE_HASH}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.internal.SQLConf
+
+/**
+ * This optimization rule includes three join selection:
+ *   1. detects a join child that has a high ratio of empty partitions and adds a
+ *      NO_BROADCAST_HASH hint to avoid it being broadcast, as shuffle join is faster in this case:
+ *      many tasks complete immediately since one join side is empty.
+ *   2. detects a join child that every partition size is less than local map threshold and adds a
+ *      PREFER_SHUFFLE_HASH hint to encourage being shuffle hash join instead of sort merge join.
+ *   3. if a join satisfies both NO_BROADCAST_HASH and PREFER_SHUFFLE_HASH,
+ *      then add a SHUFFLE_HASH hint.
+ */
+object DynamicJoinSelection extends Rule[LogicalPlan] with JoinSelectionHelper {
+
+  private def hasManyEmptyPartitions(mapStats: MapOutputStatistics): Boolean = {
+    val partitionCnt = mapStats.bytesByPartitionId.length
+    val nonZeroCnt = mapStats.bytesByPartitionId.count(_ > 0)
+    partitionCnt > 0 && nonZeroCnt > 0 &&
+      (nonZeroCnt * 1.0 / partitionCnt) < conf.nonEmptyPartitionRatioForBroadcastJoin
+  }
+
+  private def preferShuffledHashJoin(mapStats: MapOutputStatistics): Boolean = {
+    val maxShuffledHashJoinLocalMapThreshold =
+      conf.getConf(SQLConf.ADAPTIVE_MAX_SHUFFLE_HASH_JOIN_LOCAL_MAP_THRESHOLD)
+    val advisoryPartitionSize = conf.getConf(SQLConf.ADVISORY_PARTITION_SIZE_IN_BYTES)
+    if (advisoryPartitionSize <= maxShuffledHashJoinLocalMapThreshold) {
+      mapStats.bytesByPartitionId.forall(_ <= maxShuffledHashJoinLocalMapThreshold)
+    } else {
+      false
+    }
+  }
+
+  private def selectJoinStrategy(
+      join: Join,
+      isLeft: Boolean): Option[JoinStrategyHint] = {
+    val plan = if (isLeft) join.left else join.right
+    plan match {
+      case LogicalQueryStage(_, stage: ShuffleQueryStageExec) if stage.isMaterialized
+        && stage.mapStats.isDefined =>
+
+        val manyEmptyInPlan = hasManyEmptyPartitions(stage.mapStats.get)
+        val canBroadcastPlan = (isLeft && canBuildBroadcastLeft(join.joinType)) ||
+          (!isLeft && canBuildBroadcastRight(join.joinType))
+        val manyEmptyInOther = (if (isLeft) join.right else join.left) match {
+          case LogicalQueryStage(_, stage: ShuffleQueryStageExec) if stage.isMaterialized
+            && stage.mapStats.isDefined => hasManyEmptyPartitions(stage.mapStats.get)
+          case _ => false
+        }
+
+        val demoteBroadcastHash = if (manyEmptyInPlan && canBroadcastPlan) {
+          join.joinType match {
+            // don't demote BHJ since you cannot short circuit local join if inner (null-filled)
+            // side is empty
+            case LeftOuter | RightOuter | LeftAnti => false
+            case _ => true
+          }
+        } else if (manyEmptyInOther && canBroadcastPlan) {
+          // for example, LOJ, !isLeft but it's the LHS that has many empty partitions if we
+          // proceed with shuffle. But if we proceed with BHJ, the OptimizeShuffleWithLocalRead
+          // will assemble partitions as they were before the shuffle and that may no longer have
+          // many empty partitions and thus cannot short-circuit local join
+          join.joinType match {
+            case LeftOuter | RightOuter | LeftAnti => true
+            case _ => false
+          }
+        } else {
+          false
+        }
+        val rowNumberExceeded =
+          stage.computeStats().exists(_.rowCount.exists(_.toLong >= conf.broadcastMaxRowNum))
+        val adjustDemoteBroadcastHash = rowNumberExceeded || demoteBroadcastHash
+
+        val preferShuffleHash = preferShuffledHashJoin(stage.mapStats.get)
+        if (adjustDemoteBroadcastHash && preferShuffleHash) {
+          Some(SHUFFLE_HASH)
+        } else if (adjustDemoteBroadcastHash) {
+          Some(NO_BROADCAST_HASH)
+        } else if (preferShuffleHash) {
+          Some(PREFER_SHUFFLE_HASH)
+        } else {
+          None
+        }
+
+      case _ => None
+    }
+  }
+
+  def apply(plan: LogicalPlan): LogicalPlan = plan.transformDown {
+    case j @ ExtractEquiJoinKeys(_, _, _, _, _, _, hint) =>
+      var newHint = hint
+      if (!hint.leftHint.exists(_.strategy.isDefined)) {
+        selectJoinStrategy(j, true).foreach { strategy =>
+          newHint = newHint.copy(leftHint =
+            Some(hint.leftHint.getOrElse(HintInfo()).copy(strategy = Some(strategy))))
+        }
+      }
+      if (!hint.rightHint.exists(_.strategy.isDefined)) {
+        selectJoinStrategy(j, false).foreach { strategy =>
+          newHint = newHint.copy(rightHint =
+            Some(hint.rightHint.getOrElse(HintInfo()).copy(strategy = Some(strategy))))
+        }
+      }
+      if (newHint.ne(hint)) {
+        j.copy(hint = newHint)
+      } else {
+        j
+      }
+  }
+}
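The final if/else in selectJoinStrategy combines two independent signals. A standalone sketch of that decision table (illustrative only, with strings standing in for the JoinStrategyHint objects):

// How the demote-broadcast and prefer-shuffle-hash signals combine into a hint.
def pickHint(demoteBroadcast: Boolean, preferShuffleHash: Boolean): Option[String] =
  (demoteBroadcast, preferShuffleHash) match {
    case (true, true)   => Some("SHUFFLE_HASH")        // both: pin shuffled hash join
    case (true, false)  => Some("NO_BROADCAST_HASH")   // only block broadcast
    case (false, true)  => Some("PREFER_SHUFFLE_HASH") // only encourage shuffled hash join
    case (false, false) => None                        // leave the join untouched
  }

assert(pickHint(demoteBroadcast = true, preferShuffleHash = false).contains("NO_BROADCAST_HASH"))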

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/QueryStageExec.scala

Lines changed: 1 addition & 0 deletions
@@ -108,6 +108,7 @@ abstract class QueryStageExec extends LeafExecNode {
   protected var _resultOption = new AtomicReference[Option[Any]](None)

   private[adaptive] def resultOption: AtomicReference[Option[Any]] = _resultOption
+  def isMaterialized: Boolean = resultOption.get().isDefined

   override def output: Seq[Attribute] = plan.output
   override def outputPartitioning: Partitioning = plan.outputPartitioning

sql/core/src/test/resources/sql-tests/inputs/postgreSQL/join.sql

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@

 --CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=10485760
 --CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=true
---CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.preferSortMergeJoin=false
+--CONFIG_DIM1 spark.sql.autoBroadcastJoinThreshold=-1,spark.sql.join.forceApplyShuffledHashJoin=true

 --CONFIG_DIM2 spark.sql.codegen.wholeStage=true
 --CONFIG_DIM2 spark.sql.codegen.wholeStage=false,spark.sql.codegen.factoryMode=CODEGEN_ONLY
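Beyond the golden-file config dimensions, the testing-only switch could also be exercised from a Scala suite. A hypothetical sketch (not part of this commit), assuming a SharedSparkSession-style suite with Spark's withSQLConf test helper; the flag is honored only when Utils.isTesting is true (see joins.scala above):

// Hypothetical test: with broadcast disabled and the force flag on,
// the planner should pick a shuffled hash join for a simple equi-join.
test("equi-join uses shuffled hash join when forced") {
  withSQLConf(
      "spark.sql.autoBroadcastJoinThreshold" -> "-1",
      "spark.sql.join.forceApplyShuffledHashJoin" -> "true") {
    val df = spark.range(100).join(spark.range(100), "id")
    assert(df.queryExecution.sparkPlan.toString.contains("ShuffledHashJoin"))
  }
}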
