init commit

lianhuiwang · lianhuiwang · commit 20a6043643c0 · 2017-06-05T23:09:38.000+08:00
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/PruneFileSourcePartitions.scala
@@ -17,6 +17,7 @@
 
 package org.apache.spark.sql.execution.datasources
 
+import org.apache.spark.sql.catalyst.catalog.CatalogStatistics
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.planning.PhysicalOperation
 import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
@@ -59,8 +60,10 @@ private[sql] object PruneFileSourcePartitions extends Rule[LogicalPlan] {
         val prunedFileIndex = catalogFileIndex.filterPartitions(partitionKeyFilters.toSeq)
         val prunedFsRelation =
           fsRelation.copy(location = prunedFileIndex)(sparkSession)
-        val prunedLogicalRelation = logicalRelation.copy(relation = prunedFsRelation)
-
+        val withStats = logicalRelation.catalogTable.map(_.copy(
+          stats = Some(CatalogStatistics(sizeInBytes = BigInt(prunedFileIndex.sizeInBytes)))))
+        val prunedLogicalRelation = logicalRelation.copy(
+          relation = prunedFsRelation, catalogTable = withStats)
         // Keep partition-pruning predicates so that they are visible in physical planning
         val filterExpression = filters.reduceLeft(And)
         val filter = Filter(filterExpression, prunedLogicalRelation)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/PruneFileSourcePartitionsSuite.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.rules.RuleExecutor
 import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types.StructType
 
@@ -66,4 +67,33 @@ class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with Te
       }
     }
   }
+
+  test("SPARK-20986 Reset table's statistics after PruneFileSourcePartitions rule") {
+    withTempView("tempTbl", "partTbl") {
+      spark.range(1000).selectExpr("id").createOrReplaceTempView("tempTbl")
+      sql("CREATE TABLE partTbl (id INT) PARTITIONED BY (part INT) STORED AS parquet")
+      for (part <- Seq(1, 2, 3)) {
+        sql(
+          s"""
+             |INSERT OVERWRITE TABLE partTbl PARTITION (part='$part')
+             |select id from tempTbl
+            """.stripMargin)
+      }
+
+      withSQLConf(SQLConf.ENABLE_FALL_BACK_TO_HDFS_FOR_STATS.key -> "true") {
+        val df = sql("SELECT * FROM partTbl where part = 1")
+        val query = df.queryExecution.analyzed.analyze
+        val sizes1 = query.collect {
+          case relation: LogicalRelation => relation.computeStats(conf).sizeInBytes
+        }
+        assert(sizes1.size === 1, s"Size wrong for:\n ${df.queryExecution}")
+        assert(sizes1(0) > 5000, s"expected > 5000 for test table 'src', got: ${sizes1(0)}")
+        val sizes2 = Optimize.execute(query).collect {
+          case relation: LogicalRelation => relation.computeStats(conf).sizeInBytes
+        }
+        assert(sizes2.size === 1, s"Size wrong for:\n ${df.queryExecution}")
+        assert(sizes2(0) < 5000, s"expected < 5000 for test table 'src', got: ${sizes2(0)}")
+      }
+    }
+  }
 }