sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -642,8 +642,13 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       if (stats.get.rowCount.isDefined) {
         statsProperties += STATISTICS_NUM_ROWS -> stats.get.rowCount.get.toString()
       }
+
+      // For datasource tables and hive serde tables created by spark 2.1 or higher,

Member:
Also add a test for hive serde tables?

Contributor Author:
@viirya Hive serde tables should already be tested, right?
https://github.com/dilipbiswal/spark/blob/420be2f28db5f413566c161aa7969db664cd8f3b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala#L251
Isn't this testing hive serde tables? Is there anything specific to hive serde tables that we want to test?

Member:
IIUC, here we should test the case where the schema read by getRawTable (i.e., rawTable) is different from the schema stored in the table properties.

Previously, because the two copies of the schema differed, you would get the NoSuchElementException too. That's why we should call restoreTableMetadata here for hive serde tables.
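
A minimal sketch of that failure mode (illustrative values, not code from this PR): the lookup map is built from the raw Hive schema, whose identifiers are lower-cased, while the column stats are keyed by the case-preserving schema restored from the table properties.

    val rawSchemaNames = Seq("c1", "c2")    // as read by getRawTable
    val restoredNames  = Seq("C1", "C2")    // as read by restoreTableMetadata
    val colNameTypeMap = rawSchemaNames.map(n => n -> "int").toMap
    restoredNames.foreach { n =>
      colNameTypeMap(n)  // throws java.util.NoSuchElementException: key not found: C1
    }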

Contributor Author:
@viirya Thanks, I see. So far I always get the same schema for both. I will study this some more to see when they can be different.

Member:
I think the case where they differ is a case-sensitivity issue in the Hive schema.
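
For example, a sketch using the same helpers the new test below uses (hiveClient and the external catalog); the commented results are the expected behavior, not output captured from this PR:

    // The metastore and the table properties hold two different copies of the
    // schema for a mixed-case Hive serde table.
    sql("CREATE TABLE hive_serde (C1 INT, C2 STRING, C3 DOUBLE)")
    hiveClient.getTable("default", "hive_serde").schema.fieldNames
    // => Array(c1, c2, c3)   -- Hive lower-cases identifiers in the metastore
    spark.sharedState.externalCatalog.getTable("default", "hive_serde").schema.fieldNames
    // => Array(C1, C2, C3)   -- restored from the case-preserving table properties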

Contributor Author:
@viirya Thank you. I have added the new test.

+      // the data schema is stored in the table properties.
+      val schema = restoreTableMetadata(rawTable).schema
+
       val colNameTypeMap: Map[String, DataType] =
-        rawTable.schema.fields.map(f => (f.name, f.dataType)).toMap
+        schema.fields.map(f => (f.name, f.dataType)).toMap
       stats.get.colStats.foreach { case (colName, colStat) =>
         colStat.toMap(colName, colNameTypeMap(colName)).foreach { case (k, v) =>
           statsProperties += (columnStatKeyPropName(colName, k) -> v)
sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -19,10 +19,11 @@ package org.apache.spark.sql.hive

 import java.io.{File, PrintWriter}

-import org.apache.hadoop.hive.common.StatsSetupConst
 import scala.reflect.ClassTag
 import scala.util.matching.Regex
+
+import org.apache.hadoop.hive.common.StatsSetupConst

 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.catalyst.catalog.{CatalogRelation, CatalogStatistics}
@@ -34,9 +35,16 @@ import org.apache.spark.sql.execution.joins._
 import org.apache.spark.sql.hive.HiveExternalCatalog._
 import org.apache.spark.sql.hive.test.TestHiveSingleton
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types._


 class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleton {
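+  // Test helper: strips per-field metadata so that the raw and restored schemas
+  // can be compared structurally in the assertions below.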
+  private def dropMetadata(schema: StructType): StructType = {
+    val newFields = schema.fields.map { f =>
+      StructField(f.name, f.dataType, f.nullable, Metadata.empty)
+    }
+    StructType(newFields)
+  }

   test("Hive serde tables should fallback to HDFS for size estimation") {
     withSQLConf(SQLConf.ENABLE_FALL_BACK_TO_HDFS_FOR_STATS.key -> "true") {
@@ -117,6 +125,72 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
     }
   }

test("analyze non hive compatible datasource tables") {
val table = "parquet_tab"
withTable(table) {
sql(
s"""
|CREATE TABLE $table (a int, b int)
|USING parquet
|OPTIONS (skipHiveMetadata true)
""".stripMargin)

// Verify that the schema stored in catalog is a dummy one used for
// data source tables. The actual schema is stored in table properties.
val rawSchema = dropMetadata(hiveClient.getTable("default", table).schema)
val expectedRawSchema = new StructType()
.add("col", "array<string>")
assert(rawSchema == expectedRawSchema)

val actualSchema = spark.sharedState.externalCatalog.getTable("default", table).schema
val expectedActualSchema = new StructType()
.add("a", "int")
.add("b", "int")
assert(actualSchema == expectedActualSchema)

sql(s"INSERT INTO $table VALUES (1, 1)")
sql(s"INSERT INTO $table VALUES (2, 1)")
sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS a, b")
val fetchedStats0 =
checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(2))
assert(fetchedStats0.get.colStats == Map(
"a" -> ColumnStat(2, Some(1), Some(2), 0, 4, 4),
"b" -> ColumnStat(1, Some(1), Some(1), 0, 4, 4)))
}
}

test("Analyze hive serde tables when schema is not same as schema in table properties") {


Member:
Nit: This line is useless.

val table = "hive_serde"
withTable(table) {
sql(s"CREATE TABLE $table (C1 INT, C2 STRING, C3 DOUBLE)")

// Verify that the table schema stored in hive catalog is
// different than the schema stored in table properties.
val rawSchema = dropMetadata(hiveClient.getTable("default", table).schema)
val expectedRawSchema = new StructType()
.add("c1", "int")
.add("c2", "string")
.add("c3", "double")
assert(rawSchema == expectedRawSchema)

val actualSchema = spark.sharedState.externalCatalog.getTable("default", table).schema
val expectedActualSchema = new StructType()
.add("C1", "int")
.add("C2", "string")
.add("C3", "double")
assert(actualSchema == expectedActualSchema)

sql(s"INSERT INTO TABLE $table SELECT 1, 'a', 10.0")
sql(s"ANALYZE TABLE $table COMPUTE STATISTICS FOR COLUMNS C1")
val fetchedStats1 =
checkTableStats(table, hasSizeInBytes = true, expectedRowCounts = Some(1)).get
assert(fetchedStats1.colStats == Map(
"C1" -> ColumnStat(distinctCount = 1, min = Some(1), max = Some(1), nullCount = 0,
avgLen = 4, maxLen = 4)))
}
}

test("SPARK-21079 - analyze table with location different than that of individual partitions") {
val tableName = "analyzeTable_part"
withTable(tableName) {