@@ -44,7 +44,7 @@ case class OrcScan(
     dataFilters: Seq[Expression] = Seq.empty) extends FileScan {
   override def isSplitable(path: Path): Boolean = {
     // If aggregate is pushed down, only the file footer will be read once,
-    // so file should be not split across multiple tasks.
+    // so file should not be split across multiple tasks.
     pushedAggregate.isEmpty
   }
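The sketch below is not part of the PR; it is a minimal illustration of why disabling `isSplitable` matters here. `planSplits`, `maxSplitBytes`, and the `(start, length)` pairs are hypothetical stand-ins for Spark's split planning and `PartitionedFile`, under the assumption that one split maps to one task:

```scala
// Hypothetical sketch, not Spark source: how an isSplitable-style flag
// changes split planning. (start, length) pairs stand in for PartitionedFile.
def planSplits(fileLength: Long,
               maxSplitBytes: Long,
               isSplitable: Boolean): Seq[(Long, Long)] =
  if (!isSplitable) {
    // Aggregate pushed down: one split for the whole file, since only the
    // footer will be read anyway.
    Seq((0L, fileLength))
  } else {
    // Normal scan: fan the file out into byte ranges for parallel tasks.
    (0L until fileLength by maxSplitBytes)
      .map(start => (start, math.min(maxSplitBytes, fileLength - start)))
  }

// With a 1 GB file and 128 MB target splits:
// planSplits(1L << 30, 128L << 20, isSplitable = true).length  == 8
// planSplits(1L << 30, 128L << 20, isSplitable = false).length == 1
```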
@@ -92,11 +92,6 @@ case class ParquetPartitionReaderFactory(
       ParquetFooterReader.readFooter(conf, filePath, SKIP_ROW_GROUPS)
     } else {
       // For aggregate push down, we will get max/min/count from footer statistics.
-      // We want to read the footer for the whole file instead of reading multiple
-      // footers for every split of the file. Basically if the start (the beginning of)
-      // the offset in PartitionedFile is 0, we will read the footer. Otherwise, it means
-      // that we have already read footer for that file, so we will skip reading again.
-      if (file.start != 0) return null

@c21 (Contributor, Author) commented on Nov 7, 2021:
Quick question on the existing unit tests in FileSourceAggregatePushDownSuite.scala: how did the existing tests pass before this PR?

@HyukjinKwon - I think we are on the same page based on your latest comment, but just to be noisy here in case anything is missing. Before this PR, when a single file was split into multiple splits across multiple tasks, the logic here processed a split only if file.start == 0, so only the first split of each file was processed, and every file was read exactly once. That is the trick: even before this PR, the Parquet aggregate push-down logic was still correct. This PR updates the logic so that we avoid the unnecessary file splitting in the first place.

       ParquetFooterReader.readFooter(conf, filePath, NO_FILTER)
     }
   }
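To restate the pre-PR trick from the comment above as runnable code, here is a hedged sketch; `PartitionedFile` and `readFooterOnce` below are simplified stand-ins for illustration, not Spark's actual classes:

```scala
// Simplified stand-in for Spark's PartitionedFile.
case class PartitionedFile(filePath: String, start: Long, length: Long)

// Pre-PR behavior sketched: a file may arrive as several splits, but its
// footer statistics must be consumed exactly once, so every split other
// than the one starting at offset 0 is skipped (the real code returned
// null instead of a reader).
def readFooterOnce(file: PartitionedFile): Option[String] =
  if (file.start != 0) None // another task owns the first split; skip
  else Some(s"footer statistics of ${file.filePath}")
```

After this PR, isSplitable returns false whenever an aggregate is pushed down, so each file arrives as exactly one split and the guard above becomes unnecessary.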
@@ -47,7 +47,11 @@ case class ParquetScan(
     pushedAggregate: Option[Aggregation] = None,
     partitionFilters: Seq[Expression] = Seq.empty,
     dataFilters: Seq[Expression] = Seq.empty) extends FileScan {
-  override def isSplitable(path: Path): Boolean = true
+  override def isSplitable(path: Path): Boolean = {
+    // If aggregate is pushed down, only the file footer will be read once,
+    // so file should not be split across multiple tasks.
+    pushedAggregate.isEmpty

Member: 👍

Member: Oh, okay. Got it now.

+  }

   override def readSchema(): StructType = {
     // If aggregate is pushed down, schema has already been pruned in `ParquetScanBuilder`
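As a final illustration of the readSchema comment: when max/min/count come from footer statistics, the scan's output schema describes only the aggregate results. A hedged sketch of what such a pruned schema could look like (field names are illustrative, not what `ParquetScanBuilder` actually emits):

```scala
import org.apache.spark.sql.types._

// For a query like SELECT MAX(a), COUNT(*) FROM t, the scan only has to
// produce the aggregate columns, never the full file schema.
val prunedAggSchema: StructType = StructType(Seq(
  StructField("max(a)", IntegerType, nullable = true),
  StructField("count(*)", LongType, nullable = false)
))
```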