diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
index b87bbb487467..b10ce054cbf0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/interface.scala
@@ -366,10 +366,16 @@ case class CatalogStatistics(
    * Convert [[CatalogStatistics]] to [[Statistics]], and match column stats to attributes based
    * on column names.
    */
-  def toPlanStats(planOutput: Seq[Attribute]): Statistics = {
-    val matched = planOutput.flatMap(a => colStats.get(a.name).map(a -> _))
-    Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
-      attributeStats = AttributeMap(matched))
+  def toPlanStats(planOutput: Seq[Attribute], cboEnabled: Boolean): Statistics = {
+    if (cboEnabled) {
+      val attrStats = planOutput.flatMap(a => colStats.get(a.name).map(a -> _))
+      Statistics(sizeInBytes = sizeInBytes, rowCount = rowCount,
+        attributeStats = AttributeMap(attrStats))
+    } else {
+      // When CBO is disabled, we apply the size-only estimation strategy, so there's no need to
+      // propagate other statistics from catalog to the plan.
+      Statistics(sizeInBytes = sizeInBytes)
+    }
   }
 
   /** Readable string representation for the CatalogStatistics. */
@@ -452,7 +458,7 @@ case class HiveTableRelation(
   )
 
   override def computeStats(): Statistics = {
-    tableMeta.stats.map(_.toPlanStats(output)).getOrElse {
+    tableMeta.stats.map(_.toPlanStats(output, conf.cboEnabled)).getOrElse {
       throw new IllegalStateException("table stats must be specified.")
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
index 236995708a12..8d715f634298 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/LogicalRelation.scala
@@ -41,7 +41,7 @@ case class LogicalRelation(
 
   override def computeStats(): Statistics = {
     catalogTable
-      .flatMap(_.stats.map(_.toPlanStats(output)))
+      .flatMap(_.stats.map(_.toPlanStats(output, conf.cboEnabled)))
       .getOrElse(Statistics(sizeInBytes = relation.sizeInBytes))
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
index f6df077ec572..0a0407d5dbac 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StatisticsCollectionTestBase.scala
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.catalog.{CatalogStatistics, CatalogTable, H
 import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, Histogram, HistogramBin, LogicalPlan}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.execution.datasources.LogicalRelation
-import org.apache.spark.sql.internal.StaticSQLConf
+import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
 import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types.Decimal
 
@@ -223,11 +223,19 @@ abstract class StatisticsCollectionTestBase extends QueryTest with SQLTestUtils
       assert(catalogTable.stats.get.colStats == Map("c1" -> emptyColStat))
 
       // Check relation statistics
-      assert(relation.stats.sizeInBytes == 0)
-      assert(relation.stats.rowCount == Some(0))
-      assert(relation.stats.attributeStats.size == 1)
-      val (attribute, colStat) = relation.stats.attributeStats.head
-      assert(attribute.name == "c1")
-      assert(colStat == emptyColStat)
+      withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+        assert(relation.stats.sizeInBytes == 0)
+        assert(relation.stats.rowCount == Some(0))
+        assert(relation.stats.attributeStats.size == 1)
+        val (attribute, colStat) = relation.stats.attributeStats.head
+        assert(attribute.name == "c1")
+        assert(colStat == emptyColStat)
+      }
+      relation.invalidateStatsCache()
+      withSQLConf(SQLConf.CBO_ENABLED.key -> "false") {
+        assert(relation.stats.sizeInBytes == 0)
+        assert(relation.stats.rowCount.isEmpty)
+        assert(relation.stats.attributeStats.isEmpty)
+      }
     }
   }
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
index 3066a4f305f0..dfabf1ec2a22 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveExplainSuite.scala
@@ -18,8 +18,10 @@
 package org.apache.spark.sql.hive.execution
 
 import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.hive.test.TestHiveSingleton
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SQLTestUtils
 
 /**
@@ -29,21 +31,32 @@ class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleto
   import testImplicits._
 
   test("show cost in explain command") {
+    val explainCostCommand = "EXPLAIN COST SELECT * FROM src"
     // For readability, we only show optimized plan and physical plan in explain cost command
-    checkKeywordsExist(sql("EXPLAIN COST SELECT * FROM src "),
+    checkKeywordsExist(sql(explainCostCommand),
       "Optimized Logical Plan",
       "Physical Plan")
-    checkKeywordsNotExist(sql("EXPLAIN COST SELECT * FROM src "),
+    checkKeywordsNotExist(sql(explainCostCommand),
       "Parsed Logical Plan",
       "Analyzed Logical Plan")
 
-    // Only has sizeInBytes before ANALYZE command
-    checkKeywordsExist(sql("EXPLAIN COST SELECT * FROM src "), "sizeInBytes")
-    checkKeywordsNotExist(sql("EXPLAIN COST SELECT * FROM src "), "rowCount")
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "true") {
+      // Only has sizeInBytes before ANALYZE command
+      checkKeywordsExist(sql(explainCostCommand), "sizeInBytes")
+      checkKeywordsNotExist(sql(explainCostCommand), "rowCount")
 
-    // Has both sizeInBytes and rowCount after ANALYZE command
-    sql("ANALYZE TABLE src COMPUTE STATISTICS")
-    checkKeywordsExist(sql("EXPLAIN COST SELECT * FROM src "), "sizeInBytes", "rowCount")
+      // Has both sizeInBytes and rowCount after ANALYZE command
+      sql("ANALYZE TABLE src COMPUTE STATISTICS")
+      checkKeywordsExist(sql(explainCostCommand), "sizeInBytes", "rowCount")
+    }
+
+    spark.sessionState.catalog.refreshTable(TableIdentifier("src"))
+
+    withSQLConf(SQLConf.CBO_ENABLED.key -> "false") {
+      // Don't show rowCount if cbo is disabled
+      checkKeywordsExist(sql(explainCostCommand), "sizeInBytes")
+      checkKeywordsNotExist(sql(explainCostCommand), "rowCount")
+    }
 
-    // No cost information
+    // No statistics information if "cost" is not specified
     checkKeywordsNotExist(sql("EXPLAIN SELECT * FROM src "), "sizeInBytes", "rowCount")
   }
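Note (illustrative, not part of the patch): the new `cboEnabled` argument gates how much of the catalog statistics reaches the logical plan. A minimal Scala sketch of the contract follows; the stats values and the attribute name "c1" are invented for the example:

    // Illustrative sketch only -- not part of the patch. The values below are
    // made up; only the toPlanStats behavior is the point.
    import org.apache.spark.sql.catalyst.catalog.CatalogStatistics
    import org.apache.spark.sql.catalyst.expressions.AttributeReference
    import org.apache.spark.sql.types.IntegerType

    val catalogStats = CatalogStatistics(sizeInBytes = 1024, rowCount = Some(BigInt(100)))
    val planOutput = Seq(AttributeReference("c1", IntegerType)())

    // CBO on: rowCount (and any column stats matched by attribute name) propagate.
    val cboStats = catalogStats.toPlanStats(planOutput, cboEnabled = true)
    assert(cboStats.rowCount.contains(BigInt(100)))

    // CBO off: size-only estimation, so everything but sizeInBytes is dropped.
    val sizeOnlyStats = catalogStats.toPlanStats(planOutput, cboEnabled = false)
    assert(sizeOnlyStats.rowCount.isEmpty && sizeOnlyStats.attributeStats.isEmpty)

This is also why the updated tests call relation.invalidateStatsCache() or refreshTable before flipping spark.sql.cbo.enabled: computeStats() is evaluated lazily against the conf value current at that time, and a stale cached Statistics would otherwise mask the change.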