
Commit 25e4ec2

wangyum authored and committed
[CARMEL-7442] Implement runtime/bloom filter (apache#311)
1 parent 89ad2d6 commit 25e4ec2

File tree: 19 files changed, +1982 -45 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/DynamicPruning.scala

Lines changed: 5 additions & 1 deletion
@@ -20,8 +20,8 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
 import org.apache.spark.sql.catalyst.plans.logical.{HintInfo, LogicalPlan}
+import org.apache.spark.sql.catalyst.trees.{TreeNodeTag, UnaryLike}
 import org.apache.spark.sql.catalyst.trees.TreePattern._
-import org.apache.spark.sql.catalyst.trees.UnaryLike

 trait DynamicPruning extends Predicate

@@ -89,6 +89,10 @@ case class DynamicPruningSubquery(
     copy(pruningKey = newChild)
   }

+object DynamicPruningSubquery {
+  private[spark] val IS_PRUNING_DATA_TAG = TreeNodeTag[Boolean]("is_pruning_data")
+}
+
 /**
  * Marker for a planned [[DynamicPruning]] expression.
  * The expression is created during planning, and it defers to its child for evaluation.
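For context, a minimal sketch of how a TreeNodeTag such as IS_PRUNING_DATA_TAG is typically set and read on an expression node. The helper names and the expr parameter are illustrative, not part of this commit; setTagValue/getTagValue come from TreeNode, and a missing tag is treated as false here.

import org.apache.spark.sql.catalyst.expressions.{DynamicPruningSubquery, Expression}

// Hypothetical helpers: mark an expression as "pruning data" and read the flag back.
def markAsPruningData(expr: Expression): Unit =
  expr.setTagValue(DynamicPruningSubquery.IS_PRUNING_DATA_TAG, true)

def isPruningData(expr: Expression): Boolean =
  expr.getTagValue(DynamicPruningSubquery.IS_PRUNING_DATA_TAG).getOrElse(false)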

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala

Lines changed: 2 additions & 0 deletions
@@ -658,6 +658,8 @@ object Murmur3HashFunction extends InterpretedHashFunction {
 case class XxHash64(children: Seq[Expression], seed: Long) extends HashExpression[Long] {
   def this(arguments: Seq[Expression]) = this(arguments, 42L)

+  def this(argument: Expression) = this(Seq(argument))
+
   override def dataType: DataType = LongType

   override def prettyName: String = "xxhash64"
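The new single-argument constructor is what later lets the pruning rule wrap one join-key expression directly, as in new XxHash64(value). A minimal sketch; the hashKey helper is illustrative:

import org.apache.spark.sql.catalyst.expressions.{Expression, XxHash64}

// Hash a single join-key expression.
// new XxHash64(key) expands to new XxHash64(Seq(key)), which in turn uses the default seed 42L.
def hashKey(key: Expression): XxHash64 = new XxHash64(key)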

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/hints.scala

Lines changed: 11 additions & 1 deletion
@@ -127,7 +127,8 @@ object JoinStrategyHint {
     BROADCAST,
     SHUFFLE_MERGE,
     SHUFFLE_HASH,
-    SHUFFLE_REPLICATE_NL)
+    SHUFFLE_REPLICATE_NL,
+    BLOOM_FILTER_JOIN)
 }

 /**
@@ -197,6 +198,15 @@ case object NO_BROADCAST_AND_REPLICATION extends JoinStrategyHint {
   override def hintAliases: Set[String] = Set.empty
 }

+/**
+ * The hint for bloom filter join.
+ */
+case object BLOOM_FILTER_JOIN extends JoinStrategyHint {
+  override def displayName: String = "bloom_filter_join"
+  override def hintAliases: Set[String] = Set(
+    "BLOOM_FILTER_JOIN")
+}
+
 /**
  * The callback for implementing customized strategies of handling hint errors.
  */
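Assuming BLOOM_FILTER_JOIN is picked up by the hint resolver the same way as the existing join strategy hints, it could be requested like any other join hint. A sketch; spark, fact and dim (a registered view/DataFrame pair) are illustrative:

// SQL hint syntax; join strategy hint names are matched case-insensitively.
spark.sql(
  """SELECT /*+ BLOOM_FILTER_JOIN(fact) */ *
    |FROM fact JOIN dim ON fact.k = dim.k""".stripMargin)

// Dataset API equivalent.
fact.hint("bloom_filter_join").join(dim, "k")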

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 58 additions & 0 deletions
@@ -460,6 +460,51 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)

+  val RUNTIME_FILTER_PRUNING_ENABLED =
+    buildConf("spark.sql.optimizer.runtimeFilterPruning.enabled")
+      .doc("When true, we will generate a pruning predicate when a column is used as a join key.")
+      .version("3.5.0")
+      .booleanConf
+      .createWithDefault(false)
+
+  val RUNTIME_FILTER_PRUNING_FILTERING_ROW_COUNT =
+    buildConf("spark.sql.optimizer.runtimeFilterPruning.filteringSideThreshold")
+      .internal()
+      .doc("We assume it has a partition pruning filter if it has no selective predicate and " +
+        "the maximum number of rows is less than this threshold.")
+      .version("3.5.0")
+      .intConf
+      .checkValue(threshold => threshold >= 0, "The maximum row count must be non-negative.")
+      .createWithDefault(0)
+
+  val RUNTIME_FILTER_PRUNING_MAX_BLOOM_FILTER_ENTRIES =
+    buildConf("spark.sql.optimizer.runtimeFilterPruning.maxBloomFilterEntries")
+      .doc("The maximum number of bloom filter entries allowed when building dynamic bloom " +
+        "filter join pruning.")
+      .version("3.5.0")
+      .longConf
+      .checkValue(_ > 0, "The value of max bloom filter entries must be greater than 0.")
+      .createWithDefault(100000000L)
+
+  val RUNTIME_FILTER_PRUNING_PRUNING_SIDE_EXTRA_FILTER_RATIO =
+    buildConf("spark.sql.optimizer.runtimeFilterPruning.pruningSideExtraFilterRatio")
+      .internal()
+      .doc("When the filtering side doesn't support broadcast by join type, doing DPP means " +
+        "running an extra query that may have significant overhead. This config will be used " +
+        "as the extra filter ratio for computing the data size of the pruning side after DPP, " +
+        "in order to evaluate if it is worth adding an extra subquery as the pruning filter.")
+      .version("3.5.0")
+      .doubleConf
+      .checkValue(ratio => ratio > 0.0 && ratio <= 1.0, "The ratio value must be in (0.0, 1.0].")
+      .createWithDefault(0.04)
+
+  val DYNAMIC_PRUNING_MAX_INSET_NUM =
+    buildConf("spark.sql.optimizer.dynamicPruning.maxInsetNum")
+      .doc("We will fall back to true if InSet's size exceeds this value when pruning the data.")
+      .version("3.0.0")
+      .intConf
+      .createWithDefault(1000000)
+
   val PLANNED_WRITE_ENABLED = buildConf("spark.sql.optimizer.plannedWrite.enabled")
     .internal()
     .doc("When set to true, Spark optimizer will add logical sort operators to V1 write commands " +
@@ -5227,9 +5272,22 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
   def runtimeFilterCreationSideThreshold: Long =
     getConf(RUNTIME_BLOOM_FILTER_CREATION_SIDE_THRESHOLD)

+  def runtimeFilterApplicationSideThreshold: Long =
+    getConf(RUNTIME_BLOOM_FILTER_APPLICATION_SIDE_SCAN_SIZE_THRESHOLD)
+
   def runtimeRowLevelOperationGroupFilterEnabled: Boolean =
     getConf(RUNTIME_ROW_LEVEL_OPERATION_GROUP_FILTER_ENABLED)

+  def runtimeFilterPruningEnabled: Boolean = getConf(RUNTIME_FILTER_PRUNING_ENABLED)
+
+  def runtimeFilterPruningPruningSideExtraFilterRatio: Double =
+    getConf(RUNTIME_FILTER_PRUNING_PRUNING_SIDE_EXTRA_FILTER_RATIO)
+
+  def runtimeFilterPruningMaxBloomFilterEntries: Long =
+    getConf(RUNTIME_FILTER_PRUNING_MAX_BLOOM_FILTER_ENTRIES)
+
+  def dynamicPruningMaxInsetNum: Int = getConf(DYNAMIC_PRUNING_MAX_INSET_NUM)
+
   def stateStoreProviderClass: String = getConf(STATE_STORE_PROVIDER_CLASS)

   def isStateSchemaCheckEnabled: Boolean = getConf(STATE_SCHEMA_CHECK_ENABLED)
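The new behavior is opt-in. A sketch of enabling it on a session; the numeric values simply restate the defaults added above and are not tuning advice:

spark.conf.set("spark.sql.optimizer.runtimeFilterPruning.enabled", "true")
// Cap the bloom filter size and the InSet fallback used when pruning the data.
spark.conf.set("spark.sql.optimizer.runtimeFilterPruning.maxBloomFilterEntries", "100000000")
spark.conf.set("spark.sql.optimizer.dynamicPruning.maxInsetNum", "1000000")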

sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala

Lines changed: 12 additions & 0 deletions
@@ -583,6 +583,18 @@ object QueryExecution {
     prepareForExecution(preparationRules, sparkPlan.clone())
   }

+  /**
+   * Prepare the [[SparkPlan]] for execution using an existing adaptive execution context.
+   * This method is only called by [[PlanAdaptiveDynamicPruningFilters]].
+   */
+  def prepareExecutedPlan(
+      session: SparkSession,
+      sparkPlan: SparkPlan,
+      context: AdaptiveExecutionContext): SparkPlan = {
+    val preparationRules = preparations(session, Option(InsertAdaptiveSparkPlan(context)), true)
+    prepareForExecution(preparationRules, sparkPlan.clone())
+  }
+
   /**
    * Converts asserts, null pointer exceptions to internal errors.
    */
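A sketch of the intended call pattern, mirroring how PlanAdaptiveDynamicPruningFilters uses this overload later in this commit (bloomFilterPlan and the surrounding variables are illustrative). Passing the existing AdaptiveExecutionContext lets the prepared subplan share the same stage and exchange reuse state as the main query:

// Prepare the bloom-filter subplan inside the caller's adaptive execution context.
val executedPlan: SparkPlan =
  QueryExecution.prepareExecutedPlan(session, bloomFilterPlan, adaptivePlan.context)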

sql/core/src/main/scala/org/apache/spark/sql/execution/SparkOptimizer.scala

Lines changed: 19 additions & 11 deletions
@@ -25,7 +25,7 @@ import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.connector.catalog.CatalogManager
 import org.apache.spark.sql.execution.datasources.{CleanupLazyFileIndex, PruneFileSourcePartitions, SchemaPruning, V1Writes}
 import org.apache.spark.sql.execution.datasources.v2.{GroupBasedRowLevelOperationScanPlanning, OptimizeMetadataOnlyDeleteFromTable, V2ScanPartitioningAndOrdering, V2ScanRelationPushDown, V2Writes}
-import org.apache.spark.sql.execution.dynamicpruning.{CleanupDynamicPruningFilters, PartitionPruning, RowLevelOperationRuntimeGroupFiltering}
+import org.apache.spark.sql.execution.dynamicpruning.{CleanupDynamicPruningFilters, PartitionPruning, RowLevelOperationRuntimeGroupFiltering, RuntimeFilterPruning}
 import org.apache.spark.sql.execution.python.{ExtractGroupingPythonUDFFromAggregate, ExtractPythonUDFFromAggregate, ExtractPythonUDFs, ExtractPythonUDTFs}

 class SparkOptimizer(
@@ -47,16 +47,24 @@ class SparkOptimizer(
   override def preCBORules: Seq[Rule[LogicalPlan]] =
     OptimizeMetadataOnlyDeleteFromTable :: Nil

-  override def defaultBatches: Seq[Batch] = (preOptimizationBatches ++ super.defaultBatches :+
-    Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog)) :+
-    Batch("PartitionPruning", Once,
-      PartitionPruning,
-      // We can't run `OptimizeSubqueries` in this batch, as it will optimize the subqueries
-      // twice which may break some optimizer rules that can only be applied once. The rule below
-      // only invokes `OptimizeSubqueries` to optimize newly added subqueries.
-      new RowLevelOperationRuntimeGroupFiltering(OptimizeSubqueries)) :+
-    Batch("InjectRuntimeFilter", FixedPoint(1),
-      InjectRuntimeFilter) :+
+  override def defaultBatches: Seq[Batch] = ((preOptimizationBatches ++ super.defaultBatches :+
+    Batch("Optimize Metadata Only Query", Once, OptimizeMetadataOnlyQuery(catalog))) ++
+    {
+      if (conf.runtimeFilterPruningEnabled) {
+        Seq.empty[Batch] :+ Batch("Runtime Filter Pruning", Once,
+          RuntimeFilterPruning,
+          new RowLevelOperationRuntimeGroupFiltering(OptimizeSubqueries))
+      } else {
+        Seq.empty[Batch] :+ Batch("PartitionPruning", Once,
+          PartitionPruning,
+          // We can't run `OptimizeSubqueries` in this batch, as it will optimize the subqueries
+          // twice which may break some optimizer rules that can only be applied once. The rule
+          // below only invokes `OptimizeSubqueries` to optimize newly added subqueries.
+          new RowLevelOperationRuntimeGroupFiltering(OptimizeSubqueries)) :+
+          Batch("InjectRuntimeFilter", FixedPoint(1),
+            InjectRuntimeFilter)
+      }
+    } :+
   Batch("MergeScalarSubqueries", Once,
     MergeScalarSubqueries,
     RewriteDistinctAggregates) :+

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AQEOptimizer.scala

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ class AQEOptimizer(conf: SQLConf, extendedRuntimeOptimizerRules: Seq[Rule[Logica
       ConvertToLocalRelation,
       UpdateAttributeNullability),
     Batch("Dynamic Join Selection", Once, DynamicJoinSelection),
+    Batch("Optimize Bloom Filter Join", Once, OptimizeBuildBloomFilter),
     Batch("Eliminate Limits", fixedPoint, EliminateLimits),
     Batch("Optimize One Row Plan", fixedPoint, OptimizeOneRowPlan)) :+
     Batch("User Provided Runtime Optimizers", fixedPoint, extendedRuntimeOptimizerRules: _*)
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/OptimizeBuildBloomFilter.scala

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.adaptive
+
+import org.apache.spark.sql.catalyst.expressions.{Alias, Literal, Multiply}
+import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, BloomFilterAggregate}
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Repartition}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.CoalesceExec
+import org.apache.spark.sql.execution.aggregate.ObjectHashAggregateExec
+import org.apache.spark.sql.execution.dynamicpruning.DynamicPruningHelper
+import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.util.sketch.BloomFilter
+
+/**
+ * This optimization rule finds the build bloom filter expression and
+ * sets expectedNumItems from the LogicalQueryStage, which is more accurate.
+ */
+object OptimizeBuildBloomFilter extends Rule[LogicalPlan] with DynamicPruningHelper {
+  def apply(plan: LogicalPlan): LogicalPlan = {
+    if (!conf.runtimeFilterBloomFilterEnabled) {
+      return plan
+    }
+
+    lazy val defaultNumItems =
+      Literal(conf.getConf(SQLConf.RUNTIME_BLOOM_FILTER_EXPECTED_NUM_ITEMS))
+    lazy val defaultNumBits =
+      Literal(conf.getConf(SQLConf.RUNTIME_BLOOM_FILTER_NUM_BITS))
+
+    plan match {
+      case p @ LogicalQueryStage(a @ Aggregate(Nil, Seq(Alias(AggregateExpression(
+          bf: BloomFilterAggregate, _, _, _, _), _)), child: Repartition),
+          ha @ ObjectHashAggregateExec(_, _, _, _, _, _, _, _,
+            ShuffleExchangeExec(_, ObjectHashAggregateExec(_, _, _, _, _, _, _, _,
+              CoalesceExec(_, s: ShuffleQueryStageExec)), _, _)))
+          if s.isMaterialized && s.getRuntimeStatistics.rowCount.nonEmpty &&
+            (bf.numBitsExpression.isInstanceOf[Multiply] ||
+              (bf.numBitsExpression.semanticEquals(defaultNumBits) &&
+                bf.estimatedNumItemsExpression.semanticEquals(defaultNumItems))) =>
+        val expectedNumItems = math.max(s.getRuntimeStatistics.rowCount.get.longValue(), 1L)
+        val fpp = expectedNumItems / 3000000000L.toDouble
+        val numBits = BloomFilter.optimalNumOfBits(expectedNumItems, fpp)
+        val newBuildBloomFilter = bf.copy(
+          estimatedNumItemsExpression = Literal(expectedNumItems),
+          numBitsExpression = Literal(numBits))
+
+        val newLogicalPlan = a.transformExpressions {
+          case e: AggregateExpression if e.aggregateFunction.semanticEquals(bf) =>
+            e.copy(aggregateFunction = newBuildBloomFilter)
+        }
+
+        val newPhysicalPlan = ha.transformDown {
+          case a: ObjectHashAggregateExec =>
+            a.transformExpressions {
+              case e: AggregateExpression if e.aggregateFunction.semanticEquals(bf) =>
+                e.copy(aggregateFunction = newBuildBloomFilter)
+            }
+          case c: CoalesceExec =>
+            c.copy(numPartitions = coalesceBuildBloomFilterNum(child))
+        }
+
+        p.copy(logicalPlan = newLogicalPlan, physicalPlan = newPhysicalPlan)
+      case _ => plan
+    }
+  }
+}
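A worked example of the sizing logic above, reusing the same BloomFilter.optimalNumOfBits call as the rule; the 50 million row count is an assumed value standing in for the materialized stage's runtime statistics:

import org.apache.spark.util.sketch.BloomFilter

val expectedNumItems = math.max(50000000L, 1L)      // rowCount reported by the shuffle stage
val fpp = expectedNumItems / 3000000000L.toDouble   // ≈ 0.0167 false-positive probability
val numBits = BloomFilter.optimalNumOfBits(expectedNumItems, fpp)
// numBits ≈ -n * ln(p) / (ln 2)^2 ≈ 4.3e8 bits, i.e. on the order of 50 MB of filter bits.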

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/PlanAdaptiveDynamicPruningFilters.scala

Lines changed: 40 additions & 9 deletions
@@ -17,30 +17,35 @@

 package org.apache.spark.sql.execution.adaptive

-import org.apache.spark.sql.catalyst.expressions.{Alias, BindReferences, DynamicPruningExpression, Literal}
-import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight}
+import org.apache.spark.sql.catalyst.expressions.{Alias, BindReferences, BloomFilterMightContain, DynamicPruningExpression, DynamicPruningSubquery, Literal, XxHash64}
+import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, JoinSelectionHelper}
 import org.apache.spark.sql.catalyst.plans.logical.Aggregate
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.trees.TreePattern._
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.exchange.BroadcastExchangeExec
+import org.apache.spark.sql.execution.{ScalarSubquery => ScalarSubqueryExec}
+import org.apache.spark.sql.execution.dynamicpruning.DynamicPruningHelper
+import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchangeExec}
 import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, HashedRelationBroadcastMode, HashJoin}

 /**
  * A rule to insert dynamic pruning predicates in order to reuse the results of broadcast.
  */
-case class PlanAdaptiveDynamicPruningFilters(
-    rootPlan: AdaptiveSparkPlanExec) extends Rule[SparkPlan] with AdaptiveSparkPlanHelper {
+case class PlanAdaptiveDynamicPruningFilters(rootPlan: AdaptiveSparkPlanExec)
+  extends Rule[SparkPlan]
+  with AdaptiveSparkPlanHelper
+  with JoinSelectionHelper
+  with DynamicPruningHelper {
   def apply(plan: SparkPlan): SparkPlan = {
     if (!conf.dynamicPartitionPruningEnabled) {
       return plan
     }

     plan.transformAllExpressionsWithPruning(
       _.containsAllPatterns(DYNAMIC_PRUNING_EXPRESSION, IN_SUBQUERY_EXEC)) {
-      case DynamicPruningExpression(InSubqueryExec(
+      case e @ DynamicPruningExpression(InSubqueryExec(
           value, SubqueryAdaptiveBroadcastExec(name, index, onlyInBroadcast, buildPlan, buildKeys,
-          adaptivePlan: AdaptiveSparkPlanExec), exprId, _, _, _)) =>
+          adaptivePlan: AdaptiveSparkPlanExec), exprId, _, _, _, _)) =>
         val packedKeys = BindReferences.bindReferences(
           HashJoin.rewriteKeyExpr(buildKeys), adaptivePlan.executedPlan.output)
         val mode = HashedRelationBroadcastMode(packedKeys)
@@ -56,16 +61,19 @@ case class PlanAdaptiveDynamicPruningFilters(
           case _ => false
         }.isDefined

+        val shouldBroadcast =
+          e.getTagValue(DynamicPruningSubquery.IS_PRUNING_DATA_TAG).getOrElse(false)
+
         if (canReuseExchange) {
           exchange.setLogicalLink(adaptivePlan.executedPlan.logicalLink.get)
           val newAdaptivePlan = adaptivePlan.copy(inputPlan = exchange)

           val broadcastValues = SubqueryBroadcastExec(
             name, index, buildKeys, newAdaptivePlan)
-          DynamicPruningExpression(InSubqueryExec(value, broadcastValues, exprId))
+          DynamicPruningExpression(InSubqueryExec(value, broadcastValues, exprId, shouldBroadcast))
         } else if (onlyInBroadcast) {
           DynamicPruningExpression(Literal.TrueLiteral)
-        } else {
+        } else if (canBroadcastBySize(buildPlan, conf)) {
           // we need to apply an aggregate on the buildPlan in order to be column pruned
           val alias = Alias(buildKeys(index), buildKeys(index).toString)()
           val aggregate = Aggregate(Seq(alias), Seq(alias), buildPlan)
@@ -77,6 +85,29 @@ case class PlanAdaptiveDynamicPruningFilters(
           val newAdaptivePlan = sparkPlan.asInstanceOf[AdaptiveSparkPlanExec]
           val values = SubqueryExec(name, newAdaptivePlan)
           DynamicPruningExpression(InSubqueryExec(value, values, exprId))
+        } else if (!conf.exchangeReuseEnabled) {
+          DynamicPruningExpression(Literal.TrueLiteral)
+        } else {
+          val childPlan = adaptivePlan.executedPlan
+          val session = adaptivePlan.context.session
+          val reusedShuffleExchange = collectFirst(rootPlan.currentPhysicalPlan) {
+            case s: ShuffleExchangeExec if s.child.sameResult(childPlan) =>
+              s
+            case s @ ShuffleExchangeExec(_, _: WholeStageCodegenExec, _, _)
+                if s.child.sameResult(QueryExecution.prepareExecutedPlan(session, childPlan)) =>
+              s.copy(child = childPlan)
+          }
+
+          val bfLogicalPlan = planBloomFilterLogicalPlan(buildPlan, buildKeys, index)
+          val bfPhysicalPlan =
+            planBloomFilterPhysicalPlan(bfLogicalPlan, reusedShuffleExchange).map { plan =>
+              val executedPlan = QueryExecution.prepareExecutedPlan(
+                session, plan, adaptivePlan.context)
+              val scalarSubquery = ScalarSubqueryExec(SubqueryExec.createForScalarSubquery(
+                s"scalar-subquery#${exprId.id}", executedPlan), exprId)
+              BloomFilterMightContain(scalarSubquery, new XxHash64(value))
+            }.getOrElse(Literal.TrueLiteral)
+          DynamicPruningExpression(bfPhysicalPlan)
         }
     }
   }
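Putting the last branch together, the pruning predicate it produces has roughly this shape. A sketch with illustrative names: bloomFilterSubquery stands for the scalar subquery that aggregates the bloom filter over the filtering side, and pruningKey for the application-side join key being pruned.

import org.apache.spark.sql.catalyst.expressions.{BloomFilterMightContain, DynamicPruningExpression, Expression, XxHash64}

// The pruning side evaluates: bloom filter (from the subquery) might contain xxhash64(pruningKey).
def bloomFilterPruningPredicate(
    bloomFilterSubquery: Expression,
    pruningKey: Expression): Expression =
  DynamicPruningExpression(
    BloomFilterMightContain(bloomFilterSubquery, new XxHash64(pruningKey)))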
