Commit 3bb8cd9

peter-toth authored and cloud-fan committed
[SPARK-36568][SQL] Better FileScan statistics estimation
### What changes were proposed in this pull request?

This PR modifies `FileScan.estimateStatistics()` to take the read schema into account.

### Why are the changes needed?

`V2ScanRelationPushDown` can column-prune `DataSourceV2ScanRelation`s and change the read schema of `Scan` operations. The better statistics returned by `FileScan.estimateStatistics()` can mean better query plans; for example, with this change the broadcast issue in SPARK-36568 can be avoided.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added a new unit test.

Closes #33825 from peter-toth/SPARK-36568-scan-statistics-estimation.

Authored-by: Peter Toth <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 159ff9f · commit 3bb8cd9
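A smaller `sizeInBytes` estimate matters mostly for join planning: once a pruned scan's estimate drops below `spark.sql.autoBroadcastJoinThreshold` (10 MB by default), the planner can choose a broadcast hash join over a sort-merge join. A minimal sketch of that scenario, with hypothetical paths and column names (only the mechanism comes from this PR):

```scala
// Hypothetical illustration of the SPARK-36568 scenario.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 10L * 1024 * 1024)

val wide = spark.read.orc("/tmp/wide_table")   // hypothetical: many columns on disk
val facts = spark.read.orc("/tmp/facts_table") // hypothetical second table

// V2ScanRelationPushDown prunes the read schema of `wide` down to `id`.
// With this change, estimateStatistics() scales the file size down by the
// pruned row width, so the scan may now fall under the broadcast threshold.
facts.join(wide.select("id"), "id").explain()
```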

File tree

6 files changed (+35, −4 lines)

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/FileScan.scala

Lines changed: 6 additions & 1 deletion

```diff
@@ -49,6 +49,8 @@ trait FileScan extends Scan
 
   def fileIndex: PartitioningAwareFileIndex
 
+  def dataSchema: StructType
+
   /**
    * Returns the required data schema
    */
@@ -187,7 +189,10 @@ trait FileScan extends Scan
     new Statistics {
       override def sizeInBytes(): OptionalLong = {
         val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor
-        val size = (compressionFactor * fileIndex.sizeInBytes).toLong
+        val size = (compressionFactor * fileIndex.sizeInBytes /
+          (dataSchema.defaultSize + fileIndex.partitionSchema.defaultSize) *
+          (readDataSchema.defaultSize + readPartitionSchema.defaultSize)).toLong
+
         OptionalLong.of(size)
       }
```
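The new estimate pro-rates the files' on-disk size by the ratio of the read schema's default row width to the full schema's (data columns plus partition columns). A worked example with hypothetical numbers:

```scala
// Hypothetical numbers illustrating the new sizeInBytes() arithmetic.
// Assume three LongType data columns (defaultSize 8 each, 24 in total) plus
// one LongType partition column (8), with column pruning leaving a single
// LongType data column and no partition columns.
val compressionFactor = 1.0 // spark.sql.sources.fileCompressionFactor
val fileBytes = 32000L      // fileIndex.sizeInBytes
val fullRowWidth = 24 + 8   // dataSchema + partitionSchema default sizes
val readRowWidth = 8 + 0    // readDataSchema + readPartitionSchema default sizes

val size = (compressionFactor * fileBytes / fullRowWidth * readRowWidth).toLong
// size == 8000L: a quarter of the old estimate, matching 8 of 32 bytes read per row
```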

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScan.scala

Lines changed: 1 addition & 0 deletions

```diff
@@ -33,6 +33,7 @@ import org.apache.spark.util.SerializableConfiguration
 case class TextScan(
     sparkSession: SparkSession,
     fileIndex: PartitioningAwareFileIndex,
+    dataSchema: StructType,
     readDataSchema: StructType,
     readPartitionSchema: StructType,
     options: CaseInsensitiveStringMap,
```

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/text/TextScanBuilder.scala

Lines changed: 1 addition & 1 deletion

```diff
@@ -33,6 +33,6 @@ case class TextScanBuilder(
   extends FileScanBuilder(sparkSession, fileIndex, dataSchema) {
 
   override def build(): Scan = {
-    TextScan(sparkSession, fileIndex, readDataSchema(), readPartitionSchema(), options)
+    TextScan(sparkSession, fileIndex, dataSchema, readDataSchema(), readPartitionSchema(), options)
   }
 }
```
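TextScan was the odd one out here: as the FileScanSuite change below shows, the other file-based scans (JsonScan, for example) already receive `dataSchema`, so only the text source needed a new constructor parameter for the shared estimate to work.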

sql/core/src/test/scala/org/apache/spark/sql/FileBasedDataSourceSuite.scala

Lines changed: 22 additions & 0 deletions

```diff
@@ -731,6 +731,28 @@ class FileBasedDataSourceSuite extends QueryTest
     }
   }
 
+  test("SPARK-36568: FileScan statistics estimation takes read schema into account") {
+    withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "") {
+      withTempDir { dir =>
+        spark.range(1000).map(x => (x / 100, x, x)).toDF("k", "v1", "v2").
+          write.partitionBy("k").mode(SaveMode.Overwrite).orc(dir.toString)
+        val dfAll = spark.read.orc(dir.toString)
+        val dfK = dfAll.select("k")
+        val dfV1 = dfAll.select("v1")
+        val dfV2 = dfAll.select("v2")
+        val dfV1V2 = dfAll.select("v1", "v2")
+
+        def sizeInBytes(df: DataFrame): BigInt = df.queryExecution.optimizedPlan.stats.sizeInBytes
+
+        assert(sizeInBytes(dfAll) === BigInt(getLocalDirSize(dir)))
+        assert(sizeInBytes(dfK) < sizeInBytes(dfAll))
+        assert(sizeInBytes(dfV1) < sizeInBytes(dfAll))
+        assert(sizeInBytes(dfV2) === sizeInBytes(dfV1))
+        assert(sizeInBytes(dfV1V2) < sizeInBytes(dfAll))
+      }
+    }
+  }
+
   test("File source v2: support partition pruning") {
     withSQLConf(SQLConf.USE_V1_SOURCE_LIST.key -> "") {
       allFileBasedDataSources.foreach { format =>
```
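The same effect can be observed interactively. A minimal sketch, assuming ORC is read through the V2 code path (the test clears `spark.sql.sources.useV1SourceList` for exactly this reason) and a hypothetical `/tmp` location:

```scala
// Route ORC reads through the V2 source so FileScan statistics apply.
spark.conf.set("spark.sql.sources.useV1SourceList", "")

val path = "/tmp/scan-stats-demo" // hypothetical location
spark.range(1000).selectExpr("id AS v1", "id AS v2").write.mode("overwrite").orc(path)

val df = spark.read.orc(path)
// Estimate for the full schema, derived from the files' size on disk.
println(df.queryExecution.optimizedPlan.stats.sizeInBytes)
// After pruning to one of two LongType columns, the estimate is roughly halved.
println(df.select("v1").queryExecution.optimizedPlan.stats.sizeInBytes)
```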

sql/core/src/test/scala/org/apache/spark/sql/FileScanSuite.scala

Lines changed: 1 addition & 1 deletion

```diff
@@ -367,7 +367,7 @@ class FileScanSuite extends FileScanSuiteBase {
       (s, fi, ds, rds, rps, f, o, pf, df) => JsonScan(s, fi, ds, rds, rps, o, f, pf, df),
       Seq.empty),
     ("TextScan",
-      (s, fi, _, rds, rps, _, o, pf, df) => TextScan(s, fi, rds, rps, o, pf, df),
+      (s, fi, ds, rds, rps, _, o, pf, df) => TextScan(s, fi, ds, rds, rps, o, pf, df),
       Seq("dataSchema", "pushedFilters")))
 
   run(scanBuilders)
```

sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala

Lines changed: 4 additions & 1 deletion

```diff
@@ -22,6 +22,7 @@ import java.net.URI
 import java.nio.file.Files
 import java.util.{Locale, UUID}
 
+import scala.collection.JavaConverters._
 import scala.concurrent.duration._
 import scala.language.implicitConversions
 import scala.util.control.NonFatal
@@ -459,7 +460,9 @@ private[sql] trait SQLTestUtilsBase
    */
   def getLocalDirSize(file: File): Long = {
     assert(file.isDirectory)
-    file.listFiles.filter(f => DataSourceUtils.isDataFile(f.getName)).map(_.length).sum
+    Files.walk(file.toPath).iterator().asScala
+      .filter(p => Files.isRegularFile(p) && DataSourceUtils.isDataFile(p.getFileName.toString))
+      .map(_.toFile.length).sum
   }
 }
```

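The `getLocalDirSize` rewrite is what makes the new test's first assertion possible: the test writes output partitioned by `k`, so the data files end up in `k=...` subdirectories that the old flat `file.listFiles` scan never visited, while `Files.walk` traverses the whole tree (still skipping non-data files such as `_SUCCESS` via `DataSourceUtils.isDataFile`).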