Skip to content

Commit c98725a

Browse files
Yang Liu authored and wangyum committed
[SPARK-32268][SQL][FOLLOWUP] Add ColumnPruning in injectBloomFilter
### What changes were proposed in this pull request? Add `ColumnPruning` in `InjectRuntimeFilter.injectBloomFilter` to optimize the BoomFilter creation query. ### Why are the changes needed? It seems BloomFilter subqueries injected by `InjectRuntimeFilter` will read as many columns as filterCreationSidePlan. This does not match "Only scan the required columns" as the design said. We can check this by a simple case in `InjectRuntimeFilterSuite`: ```scala withSQLConf(SQLConf.RUNTIME_BLOOM_FILTER_ENABLED.key -> "true", SQLConf.RUNTIME_BLOOM_FILTER_APPLICATION_SIDE_SCAN_SIZE_THRESHOLD.key -> "3000", SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "2000") { val query = "select * from bf1 join bf2 on bf1.c1 = bf2.c2 where bf2.a2 = 62" sql(query).explain() } ``` The reason is subqueries have not been optimized by `ColumnPruning`, and this pr will fix it. ### Does this PR introduce _any_ user-facing change? No, not released ### How was this patch tested? Improve the test by adding `columnPruningTakesEffect` to check the optimizedPlan of bloom filter join. Closes #36047 from Flyangz/SPARK-32268-FOllOWUP. Authored-by: Yang Liu <[email protected]> Signed-off-by: Yuming Wang <[email protected]>
1 parent 41a8249 commit c98725a

File tree

2 files changed

+17
-1
lines changed

2 files changed

+17
-1
lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/InjectRuntimeFilter.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ object InjectRuntimeFilter extends Rule[LogicalPlan] with PredicateHelper with J
8585
}
8686
val aggExp = AggregateExpression(bloomFilterAgg, Complete, isDistinct = false, None)
8787
val alias = Alias(aggExp, "bloomFilter")()
88-
val aggregate = ConstantFolding(Aggregate(Nil, Seq(alias), filterCreationSidePlan))
88+
val aggregate =
89+
ConstantFolding(ColumnPruning(Aggregate(Nil, Seq(alias), filterCreationSidePlan)))
8990
val bloomFilterSubquery = ScalarSubquery(aggregate, Nil)
9091
val filter = BloomFilterMightContain(bloomFilterSubquery,
9192
new XxHash64(Seq(filterApplicationSideExp)))

sql/core/src/test/scala/org/apache/spark/sql/InjectRuntimeFilterSuite.scala

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with SharedSp
255255
planEnabled = sql(query).queryExecution.optimizedPlan
256256
checkAnswer(sql(query), expectedAnswer)
257257
if (shouldReplace) {
258+
assert(!columnPruningTakesEffect(planEnabled))
258259
assert(getNumBloomFilters(planEnabled) > getNumBloomFilters(planDisabled))
259260
} else {
260261
assert(getNumBloomFilters(planEnabled) == getNumBloomFilters(planDisabled))
@@ -288,6 +289,20 @@ class InjectRuntimeFilterSuite extends QueryTest with SQLTestUtils with SharedSp
288289
numMightContains
289290
}
290291

292+
def columnPruningTakesEffect(plan: LogicalPlan): Boolean = {
293+
def takesEffect(plan: LogicalPlan): Boolean = {
294+
val result = org.apache.spark.sql.catalyst.optimizer.ColumnPruning.apply(plan)
295+
!result.fastEquals(plan)
296+
}
297+
298+
plan.collectFirst {
299+
case Filter(condition, _) if condition.collectFirst {
300+
case subquery: org.apache.spark.sql.catalyst.expressions.ScalarSubquery
301+
if takesEffect(subquery.plan) => true
302+
}.nonEmpty => true
303+
}.nonEmpty
304+
}
305+
291306
def assertRewroteSemiJoin(query: String): Unit = {
292307
checkWithAndWithoutFeatureEnabled(query, testSemiJoin = true, shouldReplace = true)
293308
}

0 commit comments

Comments (0)