apache · c21 · Feb 1, 2021 · Feb 1, 2021 · Feb 1, 2021 · Feb 1, 2021
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala
@@ -545,7 +545,6 @@ case class FileSourceScanExec(
           .getOrElse(sys.error(s"Invalid bucket file ${f.filePath}"))
       }
 
-    // TODO(SPARK-32985): Decouple bucket filter pruning and bucketed table scan
     val prunedFilesGroupedToBuckets = if (optionalBucketSet.isDefined) {
       val bucketSet = optionalBucketSet.get
       filesGroupedToBuckets.filter {
@@ -591,20 +590,34 @@ case class FileSourceScanExec(
     logInfo(s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " +
       s"open cost is considered as scanning $openCostInBytes bytes.")
 
+    // Filter files with bucket pruning if possible
+    val filePruning: Path => Boolean = optionalBucketSet match {
+      case Some(bucketSet) =>
+        filePath => bucketSet.get(BucketingUtils.getBucketId(filePath.getName)
+          .getOrElse(sys.error(s"Invalid bucket file $filePath")))
 val IGNORE_CORRUPT_FILES = buildConf("spark.sql.files.ignoreCorruptFiles") 
   .doc("Whether to ignore corrupt files. If true, the Spark jobs will continue to run when " + 
     "encountering corrupted files and the contents that have been read will still be returned. " + 
     "This configuration is effective only when using file-based sources such as Parquet, JSON " + 
     "and ORC.") 
   .version("2.1.1") 
   .booleanConf 
   .createWithDefault(false) 
 val IGNORE_MISSING_FILES = buildConf("spark.sql.files.ignoreMissingFiles") 
   .doc("Whether to ignore missing files. If true, the Spark jobs will continue to run when " + 
     "encountering missing files and the contents that have been read will still be returned. " + 
     "This configuration is effective only when using file-based sources such as Parquet, JSON " + 
     "and ORC.") 
   .version("2.3.0") 
   .booleanConf 
   .createWithDefault(false) 
 val IGNORE_CORRUPT_FILES = buildConf("spark.sql.files.ignoreCorruptFiles") 
   .doc("Whether to ignore corrupt files. If true, the Spark jobs will continue to run when " + 
     "encountering corrupted files and the contents that have been read will still be returned. " + 
     "This configuration is effective only when using file-based sources such as Parquet, JSON " + 
     "and ORC.") 
   .version("2.1.1") 
   .booleanConf 
   .createWithDefault(false) 
  
 val IGNORE_MISSING_FILES = buildConf("spark.sql.files.ignoreMissingFiles") 
   .doc("Whether to ignore missing files. If true, the Spark jobs will continue to run when " + 
     "encountering missing files and the contents that have been read will still be returned. " + 
     "This configuration is effective only when using file-based sources such as Parquet, JSON " + 
     "and ORC.") 
   .version("2.3.0") 
   .booleanConf 
   .createWithDefault(false) 
+      case None =>
+        _ => true
+    }
+
     val splitFiles = selectedPartitions.flatMap { partition =>
       partition.files.flatMap { file =>
         // getPath() is very expensive so we only want to call it once in this block:
         val filePath = file.getPath
-        val isSplitable = relation.fileFormat.isSplitable(
-          relation.sparkSession, relation.options, filePath)
-        PartitionedFileUtil.splitFiles(
-          sparkSession = relation.sparkSession,
-          file = file,
-          filePath = filePath,
-          isSplitable = isSplitable,
-          maxSplitBytes = maxSplitBytes,
-          partitionValues = partition.values
-        )
+
+        if (filePruning(filePath)) {
+          val isSplitable = relation.fileFormat.isSplitable(
+            relation.sparkSession, relation.options, filePath)
+          PartitionedFileUtil.splitFiles(
+            sparkSession = relation.sparkSession,
+            file = file,
+            filePath = filePath,
+            isSplitable = isSplitable,
+            maxSplitBytes = maxSplitBytes,
+            partitionValues = partition.values
+          )
+        } else {
+          Seq.empty
+        }
       }
     }.sortBy(_.length)(implicitly[Ordering[Long]].reverse)
 

diff --git a/.../main/scala/org/apache/spark/sql/execution/bucketing/DisableUnnecessaryBucketedScan.scala b/.../main/scala/org/apache/spark/sql/execution/bucketing/DisableUnnecessaryBucketedScan.scala
@@ -98,7 +98,7 @@ object DisableUnnecessaryBucketedScan extends Rule[SparkPlan] {
         exchange.mapChildren(disableBucketWithInterestingPartition(
           _, withInterestingPartition, true, withAllowedNode))
       case scan: FileSourceScanExec =>
-        if (isBucketedScanWithoutFilter(scan)) {
+        if (scan.bucketedScan) {
           if (!withInterestingPartition || (withExchange && withAllowedNode)) {
             val nonBucketedScan = scan.copy(disableBucketedScan = true)
             scan.logicalLink.foreach(nonBucketedScan.setLogicalLink)
@@ -140,20 +140,13 @@ object DisableUnnecessaryBucketedScan extends Rule[SparkPlan] {
     }
   }
 
-  private def isBucketedScanWithoutFilter(scan: FileSourceScanExec): Boolean = {
-    // Do not disable bucketed table scan if it has filter pruning,
-    // because bucketed table scan is still useful here to save CPU/IO cost with
-    // only reading selected bucket files.
-    scan.bucketedScan && scan.optionalBucketSet.isEmpty
-  }
-
   def apply(plan: SparkPlan): SparkPlan = {
-    lazy val hasBucketedScanWithoutFilter = plan.find {
-      case scan: FileSourceScanExec => isBucketedScanWithoutFilter(scan)
+    lazy val hasBucketedScan = plan.find {
+      case scan: FileSourceScanExec => scan.bucketedScan
       case _ => false
     }.isDefined
 
-    if (!conf.bucketingEnabled || !conf.autoBucketedScanEnabled || !hasBucketedScanWithoutFilter) {
+    if (!conf.bucketingEnabled || !conf.autoBucketedScanEnabled || !hasBucketedScan) {
       plan
     } else {
       disableBucketWithInterestingPartition(plan, false, false, true)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/sources/BucketedReadSuite.scala
@@ -117,8 +117,11 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils with Adapti
       bucketValues: Seq[Any],
       filterCondition: Column,
       originalDataFrame: DataFrame): Unit = {
-    // This test verifies parts of the plan. Disable whole stage codegen.
-    withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
+    // This test verifies parts of the plan. Disable whole stage codegen,
+    // automatically bucketed scan, and filter push down for json data source.
+    withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false",
+      SQLConf.AUTO_BUCKETED_SCAN_ENABLED.key -> "false",
+      SQLConf.JSON_FILTER_PUSHDOWN_ENABLED.key -> "false") {
       val bucketedDataFrame = spark.table("bucketed_table").select("i", "j", "k")
       val BucketSpec(numBuckets, bucketColumnNames, _) = bucketSpec
       // Limit: bucket pruning only works when the bucket column has one and only one column
@@ -148,19 +151,53 @@ abstract class BucketedReadSuite extends QueryTest with SQLTestUtils with Adapti
         if (invalidBuckets.nonEmpty) {
           fail(s"Buckets ${invalidBuckets.mkString(",")} should have been pruned from:\n$plan")
         }
+
+        withSQLConf(SQLConf.BUCKETING_ENABLED.key -> "false") {
+          // Bucket pruning should still work when bucketing is disabled
+          val planWithBucketDisabled = spark.table("bucketed_table").select("i", "j", "k").filter(filterCondition)
+            .queryExecution.executedPlan
+          val fileScanWithBucketDisabled = getFileScan(planWithBucketDisabled)
+          assert(!fileScanWithBucketDisabled.bucketedScan,
+            "except no bucketed scan when disabling bucketing but found\n" +
+              s"$fileScanWithBucketDisabled")
+
+          val tableSchema = fileScanWithBucketDisabled.schema
+          val bucketColumnIndex = tableSchema.fieldIndex(bucketColumnNames.head)
+          val bucketColumn = tableSchema.toAttributes(bucketColumnIndex)
+          val bucketColumnType = tableSchema.apply(bucketColumnIndex).dataType
+          val rowsWithInvalidBuckets = fileScanWithBucketDisabled.execute().filter(row => {
+            // Return rows should have been pruned
+            val bucketColumnValue = row.get(bucketColumnIndex, bucketColumnType)
+            val bucketId = BucketingUtils.getBucketIdFromValue(
+              bucketColumn, numBuckets, bucketColumnValue)
+            !matchedBuckets.get(bucketId)
+          }).collect()
+
+          if (rowsWithInvalidBuckets.nonEmpty) {
+            fail(s"Rows ${rowsWithInvalidBuckets.mkString(",")} should have been pruned from:\n" +
+              s"$planWithBucketDisabled")
+          }
+        }
       }
 
+      val expectedDataFrame = originalDataFrame.filter(filterCondition).orderBy("i", "j", "k")
       checkAnswer(
         bucketedDataFrame.filter(filterCondition).orderBy("i", "j", "k"),
-        originalDataFrame.filter(filterCondition).orderBy("i", "j", "k"))
+        expectedDataFrame)
+
+      withSQLConf(SQLConf.BUCKETING_ENABLED.key -> "false") {
+        checkAnswer(
+          spark.table("bucketed_table").select("i", "j", "k").filter(filterCondition)
+            .orderBy("i", "j", "k"),
+          expectedDataFrame)
+      }
     }
   }
 
   test("read partitioning bucketed tables with bucket pruning filters") {
     withTable("bucketed_table") {
       val numBuckets = NumBucketsForPruningDF
       val bucketSpec = BucketSpec(numBuckets, Seq("j"), Nil)
-      // json does not support predicate push-down, and thus json is used here
       df.write
         .format("json")
         .partitionBy("i")

diff --git a/...ore/src/test/scala/org/apache/spark/sql/sources/DisableUnnecessaryBucketedScanSuite.scala b/...ore/src/test/scala/org/apache/spark/sql/sources/DisableUnnecessaryBucketedScanSuite.scala
@@ -95,7 +95,7 @@ abstract class DisableUnnecessaryBucketedScanSuite
         ("SELECT i FROM t1", 0, 1),
         ("SELECT j FROM t1", 0, 0),
         // Filter on bucketed column
-        ("SELECT * FROM t1 WHERE i = 1", 1, 1),
+        ("SELECT * FROM t1 WHERE i = 1", 0, 1),
         // Filter on non-bucketed column
         ("SELECT * FROM t1 WHERE j = 1", 0, 1),
         // Join with same buckets