apache · c21 · Sep 18, 2020 · Sep 18, 2020 · Sep 22, 2020 · Sep 22, 2020
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -951,6 +951,14 @@ object SQLConf {
     .checkValue(_ > 0, "the value of spark.sql.sources.bucketing.maxBuckets must be greater than 0")
     .createWithDefault(100000)
 
+  val AUTO_BUCKETED_SCAN_ENABLED =
+    buildConf("spark.sql.sources.bucketing.autoBucketedScan.enabled")
+      .doc("When true, decide whether to do bucketed scan on input tables based on query plan " +
+        "automatically. Do not use bucketed scan if 1. query does not have operators to " +
+        "utilize bucketing (e.g. join, group-by, etc), or 2. there's an exchange operator " +
+        "between these operators and table scan. Note when " +
+        "'spark.sql.sources.bucketing.enabled' is set to false, this configuration does not " +
+        "take any effect.")
+      .version("3.1.0")
+      .booleanConf
+      .createWithDefault(true)
+
   val CROSS_JOINS_ENABLED = buildConf("spark.sql.crossJoin.enabled")
     .internal()
     .doc("When false, we will throw an error if a query contains a cartesian product without " +
@@ -3164,6 +3172,8 @@ class SQLConf extends Serializable with Logging {
 
   def bucketingMaxBuckets: Int = getConf(SQLConf.BUCKETING_MAX_BUCKETS)
 
+  def autoBucketedScanEnabled: Boolean =
+    getConf(SQLConf.AUTO_BUCKETED_SCAN_ENABLED)
+
   def dataFrameSelfJoinAutoResolveAmbiguity: Boolean =
     getConf(DATAFRAME_SELF_JOIN_AUTO_RESOLVE_AMBIGUITY)
 

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -156,7 +156,9 @@ case class RowDataSourceScanExec(
  * @param optionalBucketSet Bucket ids for bucket pruning.
  * @param optionalNumCoalescedBuckets Number of coalesced buckets.
  * @param dataFilters Filters on non-partition columns.
- * @param tableIdentifier identifier for the table in the metastore.
+ * @param tableIdentifier Identifier for the table in the metastore.
+ * @param disableBucketedScan Disable bucketed scan based on physical query plan, see rule
+ *                            [[DisableUnnecessaryBucketedScan]] for details.
  */
 case class FileSourceScanExec(
     @transient relation: HadoopFsRelation,
@@ -166,7 +168,8 @@ case class FileSourceScanExec(
     optionalBucketSet: Option[BitSet],
     optionalNumCoalescedBuckets: Option[Int],
     dataFilters: Seq[Expression],
-    tableIdentifier: Option[TableIdentifier])
+    tableIdentifier: Option[TableIdentifier],
+    disableBucketedScan: Boolean = false)
   extends DataSourceScanExec {
 
   // Note that some vals referring the file-based relation are lazy intentionally
@@ -257,7 +260,8 @@ case class FileSourceScanExec(
 
   // exposed for testing
   lazy val bucketedScan: Boolean = {
-    if (relation.sparkSession.sessionState.conf.bucketingEnabled && relation.bucketSpec.isDefined) {
+    if (relation.sparkSession.sessionState.conf.bucketingEnabled && relation.bucketSpec.isDefined
+      && !disableBucketedScan) {
       val spec = relation.bucketSpec.get
       val bucketColumns = spec.bucketColumnNames.flatMap(n => toAttribute(n))
       bucketColumns.size == spec.bucketColumnNames.size
@@ -339,7 +343,7 @@ case class FileSourceScanExec(
       location.getClass.getSimpleName +
         Utils.buildLocationMetadata(location.rootPaths, maxMetadataValueLength)
     val metadata =
-      Map(
+      HashMap(
         "Format" -> relation.fileFormat.toString,
         "ReadSchema" -> requiredSchema.catalogString,
         "Batched" -> supportsColumnar.toString,
@@ -348,20 +352,22 @@ case class FileSourceScanExec(
         "DataFilters" -> seqToString(dataFilters),
         "Location" -> locationDesc)
 
-    val withSelectedBucketsCount = relation.bucketSpec.map { spec =>
-      val numSelectedBuckets = optionalBucketSet.map { b =>
-        b.cardinality()
-      } getOrElse {
-        spec.numBuckets
+    if (bucketedScan) {
+      relation.bucketSpec.map { spec =>
+        val numSelectedBuckets = optionalBucketSet.map { b =>
+          b.cardinality()
+        } getOrElse {
+          spec.numBuckets
+        }
+        metadata += ("SelectedBucketsCount" ->
+          (s"$numSelectedBuckets out of ${spec.numBuckets}" +
+            optionalNumCoalescedBuckets.map { b => s" (Coalesced to $b)"}.getOrElse("")))
       }
-      metadata + ("SelectedBucketsCount" ->
-        (s"$numSelectedBuckets out of ${spec.numBuckets}" +
-          optionalNumCoalescedBuckets.map { b => s" (Coalesced to $b)"}.getOrElse("")))
-    } getOrElse {
-      metadata
+    } else if (disableBucketedScan) {
+      metadata += ("DisableBucketedScan" -> "true")
     }
 
-    withSelectedBucketsCount
+    metadata.toMap
   }
 
   override def verboseStringWithOperatorId(): String = {
@@ -624,6 +630,7 @@ case class FileSourceScanExec(
       optionalBucketSet,
       optionalNumCoalescedBuckets,
       QueryPlan.normalizePredicates(dataFilters, output),
-      None)
+      None,
+      disableBucketedScan)
   }
 }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/QueryExecution.scala
@@ -35,7 +35,7 @@ import org.apache.spark.sql.catalyst.rules.{PlanChangeLogger, Rule}
 import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat
 import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.execution.adaptive.{AdaptiveExecutionContext, InsertAdaptiveSparkPlan}
-import org.apache.spark.sql.execution.bucketing.CoalesceBucketsInJoin
+import org.apache.spark.sql.execution.bucketing.{CoalesceBucketsInJoin, DisableUnnecessaryBucketedScan}
 import org.apache.spark.sql.execution.dynamicpruning.PlanDynamicPruningFilters
 import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange}
 import org.apache.spark.sql.execution.streaming.{IncrementalExecution, OffsetSeqMetadata}
@@ -344,6 +344,7 @@ object QueryExecution {
       PlanSubqueries(sparkSession),
       RemoveRedundantProjects(sparkSession.sessionState.conf),
       EnsureRequirements(sparkSession.sessionState.conf),
+      DisableUnnecessaryBucketedScan(sparkSession.sessionState.conf),
       ApplyColumnarRulesAndInsertTransitions(sparkSession.sessionState.conf,
         sparkSession.sessionState.columnarRules),
       CollapseCodegenStages(sparkSession.sessionState.conf),

diff --git a/.../main/scala/org/apache/spark/sql/execution/bucketing/DisableUnnecessaryBucketedScan.scala b/.../main/scala/org/apache/spark/sql/execution/bucketing/DisableUnnecessaryBucketedScan.scala
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.bucketing
+
+import org.apache.spark.sql.catalyst.expressions.aggregate.{Partial, PartialMerge}
+import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashClusteredDistribution}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.{FileSourceScanExec, FilterExec, ProjectExec, SortExec, SparkPlan}
+import org.apache.spark.sql.execution.aggregate.BaseAggregateExec
+import org.apache.spark.sql.execution.exchange.Exchange
+import org.apache.spark.sql.internal.SQLConf
+
+/**
+ * Disable unnecessary bucketed table scan based on actual physical query plan.
+ * NOTE: this rule is designed to be applied right after [[EnsureRequirements]],
+ * where all [[ShuffleExchangeExec]] and [[SortExec]] have been added to plan properly.
+ *
+ * When BUCKETING_ENABLED and AUTO_BUCKETED_SCAN_ENABLED are set to true, go through
+ * query plan to check where bucketed table scan is unnecessary, and disable bucketed table
+ * scan if needed.
+ *
+ * For all operators which [[hasInterestingPartition]] (i.e., require [[ClusteredDistribution]]
+ * or [[HashClusteredDistribution]]), check if the sub-plan for operator has [[Exchange]] and
+ * bucketed table scan. If yes, disable the bucketed table scan in the sub-plan.
+ * Only allow certain operators in sub-plan, which guarantees each sub-plan is single lineage
+ * (i.e., each operator has only one child). See details in
+ * [[disableBucketWithInterestingPartition]].
+ *
+ * Examples:
+ * (1).join:
+ *         SortMergeJoin(t1.i = t2.j)
+ *            /            \
+ *        Sort(i)        Sort(j)
+ *          /               \
+ *      Shuffle(i)       Scan(t2: i, j)
+ *        /         (bucketed on column j, enable bucketed scan)
+ *   Scan(t1: i, j)
+ * (bucketed on column j, DISABLE bucketed scan)
+ *
+ * (2).aggregate:
+ *         HashAggregate(i, ..., Final)
+ *                      |
+ *                  Shuffle(i)
+ *                      |
+ *         HashAggregate(i, ..., Partial)
+ *                      |
+ *                    Filter
+ *                      |
+ *                  Scan(t1: i, j)
+ *  (bucketed on column j, DISABLE bucketed scan)
+ *
+ * The idea of [[hasInterestingPartition]] is inspired by "interesting order" in
+ * the paper "Access Path Selection in a Relational Database Management System"
+ * (http://www.inf.ed.ac.uk/teaching/courses/adbs/AccessPath.pdf).
+ */
+case class DisableUnnecessaryBucketedScan(conf: SQLConf) extends Rule[SparkPlan] {
+
+  /**
+   * Visits the plan top-down and turns off bucketed scans whose bucketing is not used.
+   *
+   * A scan is turned off only when both flags are true on arrival, i.e. an operator with
+   * interesting partition was seen above it and an [[Exchange]] was seen below that operator.
+   * Any operator rejected by [[isAllowedUnaryExecNode]] resets both flags for its children.
+   *
+   * @param plan The (sub-)plan currently being visited.
+   * @param withInterestingPartition The traversed plan has operator with interesting partition.
+   * @param withExchange The traversed plan has [[Exchange]] operator.
+   * @return The plan with the selected bucketed scans copied with `disableBucketedScan = true`.
+   */
+  private def disableBucketWithInterestingPartition(
+      plan: SparkPlan,
+      withInterestingPartition: Boolean,
+      withExchange: Boolean): SparkPlan = {
+    // Rewrites the children of `node`, handing the given flags down to each of them.
+    def descend(node: SparkPlan, interesting: Boolean, exchanged: Boolean): SparkPlan = {
+      node.mapChildren(disableBucketWithInterestingPartition(_, interesting, exchanged))
+    }
+
+    plan match {
+      case interestingOp if hasInterestingPartition(interestingOp) =>
+        // Start tracking from this operator: no exchange has been seen below it yet.
+        descend(interestingOp, interesting = true, exchanged = false)
+      case exchange: Exchange if withInterestingPartition =>
+        // Remember the exchange, but only when an interesting operator sits above it.
+        descend(exchange, interesting = withInterestingPartition, exchanged = true)
+      case scan: FileSourceScanExec
+          if withInterestingPartition && withExchange && isBucketedScanWithoutFilter(scan) =>
+        // The data is re-partitioned by the exchange above, so bucketing brings no benefit.
+        scan.copy(disableBucketedScan = true)
+      case allowed if isAllowedUnaryExecNode(allowed) =>
+        // Allowed single-child nodes pass the flags through unchanged.
+        descend(allowed, interesting = withInterestingPartition, exchanged = withExchange)
+      case other =>
+        // Any other operator ends the lineage; tracking starts over below it.
+        descend(other, interesting = false, exchanged = false)
+    }
+  }
+
+  /** Whether `plan` requires a clustered or hash-clustered distribution from any child. */
+  private def hasInterestingPartition(plan: SparkPlan): Boolean = {
+    plan.requiredChildDistribution.exists {
+      case _: ClusteredDistribution => true
+      case _: HashClusteredDistribution => true
+      case _ => false
+    }
+  }
+
+  /**
+   * Whether `plan` may sit between an operator with interesting partition and a scan
+   * without resetting the traversal flags. Aggregates qualify only when they have at least
+   * one aggregate expression and all of them are in `Partial` or `PartialMerge` mode.
+   */
+  private def isAllowedUnaryExecNode(plan: SparkPlan): Boolean = plan match {
+    case _: SortExec | _: Exchange => true
+    case _: ProjectExec | _: FilterExec => true
+    case _: FileSourceScanExec => true
+    case agg: BaseAggregateExec =>
+      val aggExprs = agg.aggregateExpressions
+      aggExprs.nonEmpty && aggExprs.forall(e => e.mode == Partial || e.mode == PartialMerge)
+    case _ => false
+  }
+
+  /**
+   * Whether `scan` is a bucketed scan that reads every bucket. Scans with a pruned bucket
+   * set are left alone, because reading only the selected bucket files still saves CPU/IO.
+   */
+  private def isBucketedScanWithoutFilter(scan: FileSourceScanExec): Boolean = {
+    val readsBuckets = scan.bucketedScan
+    readsBuckets && scan.optionalBucketSet.isEmpty
+  }
+
+  /** Turns off every bucketed scan in `plan` that has no bucket pruning. */
+  private def disableAllBucketedScan(plan: SparkPlan): SparkPlan = plan.transformUp {
+    case scan: FileSourceScanExec if isBucketedScanWithoutFilter(scan) =>
+      scan.copy(disableBucketedScan = true)
+  }
+
+  /**
+   * Entry point of the rule. Returns `plan` untouched unless both bucketing and automatic
+   * bucketed scan are enabled in `conf`.
+   */
+  def apply(plan: SparkPlan): SparkPlan = {
+    val ruleEnabled = conf.bucketingEnabled && conf.autoBucketedScanEnabled
+    if (!ruleEnabled) {
+      plan
+    } else if (plan.find(hasInterestingPartition).isEmpty) {
+      // No operator in the query can use bucketing at all, so drop it everywhere.
+      disableAllBucketedScan(plan)
+    } else {
+      disableBucketWithInterestingPartition(
+        plan, withInterestingPartition = false, withExchange = false)
+    }
+  }
+}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameJoinSuite.scala
@@ -348,7 +348,7 @@ class DataFrameJoinSuite extends QueryTest
             }
             assert(broadcastExchanges.size == 1)
             val tables = broadcastExchanges.head.collect {
-              case FileSourceScanExec(_, _, _, _, _, _, _, Some(tableIdent)) => tableIdent
+              case FileSourceScanExec(_, _, _, _, _, _, _, Some(tableIdent), _) => tableIdent
             }
             assert(tables.size == 1)
             assert(tables.head === TableIdentifier(table1Name, Some(dbName)))

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
@@ -344,6 +344,18 @@ class ExplainSuite extends ExplainSuiteHelper with DisableAdaptiveExecutionSuite
     }
   }
 
+  test("SPARK-32859: disable unnecessary bucketed table scan based on query plan") {
+    withTable("t1", "t2") {
+      val confs = Seq(
+        SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0",
+        SQLConf.AUTO_BUCKETED_SCAN_ENABLED.key -> "true")
+      withSQLConf(confs: _*) {
+        Seq(1, 2).toDF("i").write.bucketBy(8, "i").saveAsTable("t1")
+        Seq(2, 3).toDF("i").write.saveAsTable("t2")
+        // t1 is bucketed on `i` but joined on `t1.i + 1`, so its scan is expected to be
+        // reported as a disabled bucketed scan in the explain output.
+        val query = "SELECT * FROM t1 JOIN t2 ON t1.i + 1 = t2.i"
+        checkKeywordsExistsInExplain(sql(query), keywords = "DisableBucketedScan: true")
+      }
+    }
+  }
+
   test("Coalesced bucket info should be a part of explain string") {
     withTable("t1", "t2") {
       withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "0",

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SubquerySuite.scala
@@ -1314,7 +1314,7 @@ class SubquerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark
       // need to execute the query before we can examine fs.inputRDDs()
       assert(stripAQEPlan(df.queryExecution.executedPlan) match {
         case WholeStageCodegenExec(ColumnarToRowExec(InputAdapter(
-            fs @ FileSourceScanExec(_, _, _, partitionFilters, _, _, _, _)))) =>
+            fs @ FileSourceScanExec(_, _, _, partitionFilters, _, _, _, _, _)))) =>
           partitionFilters.exists(ExecSubqueryExpression.hasSubquery) &&
             fs.inputRDDs().forall(
               _.asInstanceOf[FileScanRDD].filePartitions.forall(

diff --git a/...e/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala b/...e/src/test/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategySuite.scala
@@ -262,20 +262,22 @@ class FileSourceStrategySuite extends QueryTest with SharedSparkSession with Pre
           "p1=2/file7_0000" -> 1),
         buckets = 3)
 
-    // No partition pruning
-    checkScan(table) { partitions =>
-      assert(partitions.size == 3)
-      assert(partitions(0).files.size == 5)
-      assert(partitions(1).files.size == 0)
-      assert(partitions(2).files.size == 2)
-    }
-
-    // With partition pruning
-    checkScan(table.where("p1=2")) { partitions =>
-      assert(partitions.size == 3)
-      assert(partitions(0).files.size == 3)
-      assert(partitions(1).files.size == 0)
-      assert(partitions(2).files.size == 1)
+    withSQLConf(SQLConf.AUTO_BUCKETED_SCAN_ENABLED.key -> "false") {
+      // No partition pruning
+      checkScan(table) { partitions =>
+        assert(partitions.size == 3)
+        assert(partitions(0).files.size == 5)
+        assert(partitions(1).files.size == 0)
+        assert(partitions(2).files.size == 2)
+      }
+
+      // With partition pruning
+      checkScan(table.where("p1=2")) { partitions =>
+        assert(partitions.size == 3)
+        assert(partitions(0).files.size == 3)
+        assert(partitions(1).files.size == 0)
+        assert(partitions(2).files.size == 1)
+      }
     }
   }
 

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/joins/BroadcastJoinSuite.scala
@@ -432,22 +432,24 @@ abstract class BroadcastJoinSuiteBase extends QueryTest with SQLTestUtils
         // join1 is a broadcast join where df2 is broadcasted. Note that output partitioning on the
         // streamed side (t1) is HashPartitioning (bucketed files).
         val join1 = t1.join(df2, t1("i1") === df2("i2") && t1("j1") === df2("j2"))
-        val plan1 = join1.queryExecution.executedPlan
-        assert(collect(plan1) { case e: ShuffleExchangeExec => e }.isEmpty)
-        val broadcastJoins = collect(plan1) { case b: BroadcastHashJoinExec => b }
-        assert(broadcastJoins.size == 1)
-        assert(broadcastJoins(0).outputPartitioning.isInstanceOf[PartitioningCollection])
-        val p = broadcastJoins(0).outputPartitioning.asInstanceOf[PartitioningCollection]
-        assert(p.partitionings.size == 4)
-        // Verify all the combinations of output partitioning.
-        Seq(Seq(t1("i1"), t1("j1")),
-          Seq(t1("i1"), df2("j2")),
-          Seq(df2("i2"), t1("j1")),
-          Seq(df2("i2"), df2("j2"))).foreach { expected =>
-          val expectedExpressions = expected.map(_.expr)
-          assert(p.partitionings.exists {
-            case h: HashPartitioning => expressionsEqual(h.expressions, expectedExpressions)
-          })
+        withSQLConf(SQLConf.AUTO_BUCKETED_SCAN_ENABLED.key -> "false") {
+          val plan1 = join1.queryExecution.executedPlan
+          assert(collect(plan1) { case e: ShuffleExchangeExec => e }.isEmpty)
+          val broadcastJoins = collect(plan1) { case b: BroadcastHashJoinExec => b }
+          assert(broadcastJoins.size == 1)
+          assert(broadcastJoins(0).outputPartitioning.isInstanceOf[PartitioningCollection])
+          val p = broadcastJoins(0).outputPartitioning.asInstanceOf[PartitioningCollection]
+          assert(p.partitionings.size == 4)
+          // Verify all the combinations of output partitioning.
+          Seq(Seq(t1("i1"), t1("j1")),
+            Seq(t1("i1"), df2("j2")),
+            Seq(df2("i2"), t1("j1")),
+            Seq(df2("i2"), df2("j2"))).foreach { expected =>
+            val expectedExpressions = expected.map(_.expr)
+            assert(p.partitionings.exists {
+              case h: HashPartitioning => expressionsEqual(h.expressions, expectedExpressions)
+            })
+          }
         }
 
         // Join on the column from the broadcasted side (i2, j2) and make sure output partitioning