diff --git a/docs/sql-ref-syntax-ddl-create-table-like.md b/docs/sql-ref-syntax-ddl-create-table-like.md
index cfb959ca6b23..9a32bb6b2e46 100644
--- a/docs/sql-ref-syntax-ddl-create-table-like.md
+++ b/docs/sql-ref-syntax-ddl-create-table-like.md
@@ -57,6 +57,8 @@ CREATE TABLE [IF NOT EXISTS] table_identifier LIKE source_table_identifier
 * **TBLPROPERTIES**

     Table properties that have to be set are specified, such as `created.by.user`, `owner`, etc.
+    Note that the table properties defined in the source table are copied into the new table only if the two tables share the same table format, i.e., the same data source provider and storage input format.
+    If a specified property key already exists in the source table, the old value is overwritten with the new one.

 * **LOCATION**
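For illustration, here is a minimal sketch of the documented copy-and-overwrite semantics, assuming a Hive-enabled `SparkSession` named `spark`; the table names and property keys are hypothetical:

```scala
import org.apache.spark.sql.catalyst.TableIdentifier

// Source table with two user-defined properties (hypothetical keys/values).
spark.sql("CREATE TABLE s1(c1 INT) USING hive TBLPROPERTIES('k1'='v1', 'k2'='v2')")
// Same table format, so the source properties are copied; 'k2' is overwritten.
spark.sql("CREATE TABLE d1 LIKE s1 TBLPROPERTIES('k2'='v3')")

val props = spark.sessionState.catalog.getTableMetadata(TableIdentifier("d1")).properties
assert(props("k1") == "v1" && props("k2") == "v3")
```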
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
index bd238948aab0..9b8650cd560f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala
@@ -56,7 +56,9 @@ import org.apache.spark.sql.util.SchemaUtils
  * are identical to the ones defined in the source table.
  *
  * The CatalogTable attributes copied from the source table are storage(inputFormat, outputFormat,
- * serde, compressed, properties), schema, provider, partitionColumnNames, bucketSpec by default.
+ * serde, compressed, properties), schema, provider, partitionColumnNames, bucketSpec, and table
+ * properties by default. Note that the table properties are copied into the new table only if
+ * the table formats, i.e., the data source providers and storage input formats, are the same.
  *
  * Use "CREATE TABLE t1 LIKE t2 USING file_format" to specify new provider for t1.
  * For Hive compatibility, use "CREATE TABLE t1 LIKE t2 STORED AS hiveFormat"
@@ -116,6 +118,24 @@ case class CreateTableLikeCommand(
       CatalogTableType.EXTERNAL
     }

+    // We only copy the source table properties if the two tables share the same format
+    val needCopyProperties =
+      (provider.isEmpty || provider == sourceTableDesc.provider) &&
+        (fileFormat.inputFormat.isEmpty ||
+          fileFormat.inputFormat == sourceTableDesc.storage.inputFormat)
+
+    val newProperties = sourceTableDesc.tableType match {
+      case MANAGED | EXTERNAL if needCopyProperties =>
+        sourceTableDesc.properties ++ properties
+      case MANAGED | EXTERNAL =>
+        properties
+      case VIEW =>
+        // For a view, we just use the new properties
+        properties
+      case other =>
+        throw new IllegalArgumentException(
+          s"Unknown table type found in CreateTableLikeCommand: $other")
+    }
     val newTableDesc =
       CatalogTable(
         identifier = targetTable,
@@ -125,7 +145,7 @@
         provider = newProvider,
         partitionColumnNames = sourceTableDesc.partitionColumnNames,
         bucketSpec = sourceTableDesc.bucketSpec,
-        properties = properties,
+        properties = newProperties,
         tracksPartitionsInCatalog = sourceTableDesc.tracksPartitionsInCatalog)

     catalog.createTable(newTableDesc, ifNotExists)
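The `sourceTableDesc.properties ++ properties` merge above relies on Scala's right-biased map concatenation. A minimal, self-contained sketch of the collision behavior, with hypothetical stand-in values:

```scala
// sourceProps stands in for sourceTableDesc.properties; newProps stands in for
// the TBLPROPERTIES clause of the CREATE TABLE ... LIKE statement.
val sourceProps = Map("k1" -> "v1", "k2" -> "v2")
val newProps = Map("k2" -> "v3", "k4" -> "v4")

// On a key collision, `++` keeps the right-hand entry, so a property specified
// on the new table overwrites the one copied from the source table.
val merged = sourceProps ++ newProps
assert(merged == Map("k1" -> "v1", "k2" -> "v3", "k4" -> "v4"))
```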
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
index 9bc99b08c2cc..ef8db657940d 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
@@ -1063,7 +1063,10 @@ private[hive] object HiveClientImpl extends Logging {
     hiveTable.setSerializationLib(
       table.storage.serde.getOrElse("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))
     table.storage.properties.foreach { case (k, v) => hiveTable.setSerdeParam(k, v) }
-    table.properties.foreach { case (k, v) => hiveTable.setProperty(k, v) }
+    // Hive only retains the useful properties exposed through the serde class annotation.
+    // For better compatibility with Hive, we remove the metastore-generated properties.
+    val hiveProperties = table.properties -- HIVE_METASTORE_GENERATED_PROPERTIES
+    hiveProperties.foreach { case (k, v) => hiveTable.setProperty(k, v) }
     table.comment.foreach { c => hiveTable.setProperty("comment", c) }
     // Hive will expand the view text, so it needs 2 fields: viewOriginalText and viewExpandedText.
     // Since we don't expand the view text, but only add table properties, we map the `viewText` to
@@ -1222,6 +1225,14 @@ private[hive] object HiveClientImpl extends Logging {
     StatsSetupConst.TOTAL_SIZE
   )

+  // Visible for testing.
+  private[hive] val HIVE_METASTORE_GENERATED_PROPERTIES: Set[String] = Set(
+    hive_metastoreConstants.DDL_TIME,
+    // Set at org.apache.hadoop.hive.ql.exec.DDLTask.updateModifiedParameters()
+    "last_modified_by",
+    "last_modified_time"
+  )
+
   def newHiveConf(
       sparkConf: SparkConf,
       hadoopConf: JIterable[JMap.Entry[String, String]],
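The `--` subtraction used in `toHiveTable` drops every key of `HIVE_METASTORE_GENERATED_PROPERTIES` from the property map. A minimal sketch of that filtering step, with hypothetical property values:

```scala
// Stand-ins for HIVE_METASTORE_GENERATED_PROPERTIES and table.properties;
// note that hive_metastoreConstants.DDL_TIME is the "transient_lastDdlTime" key.
val generated = Set("transient_lastDdlTime", "last_modified_by", "last_modified_time")
val props = Map(
  "k1" -> "v1",                    // user-defined property, kept
  "last_modified_by" -> "someone", // metastore-generated, dropped
  "transient_lastDdlTime" -> "0")  // metastore-generated, dropped

// `--` removes every key in the given collection from the map.
val hiveProps = props -- generated
assert(hiveProps == Map("k1" -> "v1"))
```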
"v2") + sql("CREATE TABLE t5 LIKE t1 USING parquet") + val t5 = catalog.getTableMetadata(TableIdentifier("t5")) + assert(t5.properties.get("k1").isEmpty) + assert(t5.properties.get("k2").isEmpty) + sql("CREATE TABLE t6 LIKE t1 USING hive") + val t6 = catalog.getTableMetadata(TableIdentifier("t6")) + assert(t6.properties("k1") == "v1") + assert(t6.properties("k2") == "v2") + } + + withView("v1") { + withTable("t1") { + sql("create view v1 as select 1") + sql("alter view v1 set tblproperties('k1'='v1')") + sql("create table t1 like v1") + val t1 = catalog.getTableMetadata(TableIdentifier("t1")) + assert(t1.properties.get("k1").isEmpty) + } + } + } + + test("SPARK-31828: Filters out Hive metastore properties in CreateTableLikeCommand") { + val catalog = spark.sessionState.catalog + HiveClientImpl.HIVE_METASTORE_GENERATED_PROPERTIES.foreach { meta => + withTable("t1", "t2") { + val uuid = UUID.randomUUID().toString + sql(s"CREATE TABLE t1(c1 int) USING hive TBLPROPERTIES('$meta'='$uuid')") + val t1 = catalog.getTableMetadata(TableIdentifier("t1")) + // Removed in HiveClientImpl.toHiveTable, but they may be added by hive + assert(t1.properties.get(s"$meta").isEmpty || t1.properties(s"$meta") != s"$uuid") + sql("CREATE TABLE t2 LIKE t1") + val t2 = catalog.getTableMetadata(TableIdentifier("t2")) + // We don't copy source tbl metastore properties, but they may be added by hive + assert(t2.properties.get(s"$meta").isEmpty || t2.properties(s"$meta") != s"$uuid") + } + } + } + test("SPARK-31904: Fix case sensitive problem of char and varchar partition columns") { withTable("t1", "t2") { sql("CREATE TABLE t1(a STRING, B VARCHAR(10), C CHAR(10)) STORED AS parquet")