-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-31828][SQL] Retain table properties at CreateTableLikeCommand #28647
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
61dfe05
37ba3b3
594a1bb
b30d62c
29aba43
162627a
9eacf1e
175d0e2
ed0877e
dd04dcc
4507e7f
bc52948
6b523e1
0e79ed3
46d7a7b
1edc619
78ff34f
2251181
5c63477
183c209
fcc8b3b
006ec47
319fbfb
7f9f685
93a0c75
3e8af07
0a3b1cf
1e1349f
19f398b
03966ae
dc00260
2d470fc
4b55575
c45489a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -56,7 +56,9 @@ import org.apache.spark.sql.util.SchemaUtils | |
| * are identical to the ones defined in the source table. | ||
| * | ||
| * The CatalogTable attributes copied from the source table are storage(inputFormat, outputFormat, | ||
| * serde, compressed, properties), schema, provider, partitionColumnNames, bucketSpec by default. | ||
| * serde, compressed, properties), schema, provider, partitionColumnNames, bucketSpec, tblproperties | ||
| * by default. Note that, tblproperties is copied into a new table if the table formats including | ||
| * data source providers and storage input formats are the same. | ||
| * | ||
| * Use "CREATE TABLE t1 LIKE t2 USING file_format" to specify new provider for t1. | ||
| * For Hive compatibility, use "CREATE TABLE t1 LIKE t2 STORED AS hiveFormat" | ||
|
|
@@ -116,6 +118,24 @@ case class CreateTableLikeCommand( | |
| CatalogTableType.EXTERNAL | ||
| } | ||
|
|
||
| // We only copy source tbl properties if the format is the same with each other | ||
| val needCopyProperties = | ||
| (provider.isEmpty || provider == sourceTableDesc.provider) && | ||
viirya marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| (fileFormat.inputFormat.isEmpty || | ||
| fileFormat.inputFormat == sourceTableDesc.storage.inputFormat) | ||
|
|
||
| val newProperties = sourceTableDesc.tableType match { | ||
| case MANAGED | EXTERNAL if needCopyProperties => | ||
| sourceTableDesc.properties ++ properties | ||
| case MANAGED | EXTERNAL => | ||
| properties | ||
| case VIEW => | ||
| // For view, we just use the new properties | ||
| properties | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keep view behavior as before. Hive also does not copy view properties.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit format; |
||
| case other => | ||
| throw new IllegalArgumentException( | ||
| s"Unknown table type is found at createTableLikeCommand: $other") | ||
| } | ||
| val newTableDesc = | ||
| CatalogTable( | ||
| identifier = targetTable, | ||
|
|
@@ -125,7 +145,7 @@ case class CreateTableLikeCommand( | |
| provider = newProvider, | ||
| partitionColumnNames = sourceTableDesc.partitionColumnNames, | ||
| bucketSpec = sourceTableDesc.bucketSpec, | ||
| properties = properties, | ||
| properties = newProperties, | ||
| tracksPartitionsInCatalog = sourceTableDesc.tracksPartitionsInCatalog) | ||
|
|
||
| catalog.createTable(newTableDesc, ifNotExists) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1063,7 +1063,10 @@ private[hive] object HiveClientImpl extends Logging { | |
| hiveTable.setSerializationLib( | ||
| table.storage.serde.getOrElse("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) | ||
| table.storage.properties.foreach { case (k, v) => hiveTable.setSerdeParam(k, v) } | ||
| table.properties.foreach { case (k, v) => hiveTable.setProperty(k, v) } | ||
| // Hive only retain the useful properties through serde class annotation. | ||
| // For better compatible with Hive, we remove the metastore properties. | ||
| val hiveProperties = table.properties -- HIVE_METASTORE_GENERATED_PROPERTIES | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It also affect |
||
| hiveProperties.foreach { case (k, v) => hiveTable.setProperty(k, v) } | ||
| table.comment.foreach { c => hiveTable.setProperty("comment", c) } | ||
| // Hive will expand the view text, so it needs 2 fields: viewOriginalText and viewExpandedText. | ||
| // Since we don't expand the view text, but only add table properties, we map the `viewText` to | ||
|
|
@@ -1222,6 +1225,14 @@ private[hive] object HiveClientImpl extends Logging { | |
| StatsSetupConst.TOTAL_SIZE | ||
| ) | ||
|
|
||
| // Visible for testing. | ||
| private[hive] val HIVE_METASTORE_GENERATED_PROPERTIES: Set[String] = Set( | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After check Hive1.2 and Hive2.3 again, I think we can just remove 3 properties so that we can reduce the scope of influence.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this include all the properties in https://github.com/apache/spark/pull/28647/files#diff-b7094baa12601424a5d19cb930e3402fL1534 ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I checked all the properties and make sure hive just modify the three properties. |
||
| hive_metastoreConstants.DDL_TIME, | ||
| // at org.apache.hadoop.hive.ql.exec.DDLTask.updateModifiedParameters() | ||
| "last_modified_by", | ||
| "last_modified_time" | ||
| ) | ||
|
|
||
| def newHiveConf( | ||
| sparkConf: SparkConf, | ||
| hadoopConf: JIterable[JMap.Entry[String, String]], | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ package org.apache.spark.sql.hive.execution | |
|
|
||
| import java.io.File | ||
| import java.net.URI | ||
| import java.util.UUID | ||
|
|
||
| import org.apache.hadoop.fs.Path | ||
| import org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER | ||
|
|
@@ -38,6 +39,7 @@ import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils} | |
| import org.apache.spark.sql.functions._ | ||
| import org.apache.spark.sql.hive.HiveExternalCatalog | ||
| import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET} | ||
| import org.apache.spark.sql.hive.client.HiveClientImpl | ||
| import org.apache.spark.sql.hive.orc.OrcFileOperator | ||
| import org.apache.spark.sql.hive.test.TestHiveSingleton | ||
| import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} | ||
|
|
@@ -1542,21 +1544,6 @@ class HiveDDLSuite | |
| assert(targetTable.unsupportedFeatures.isEmpty, | ||
| "the unsupportedFeatures in the create table must be empty") | ||
|
|
||
| val metastoreGeneratedProperties = Seq( | ||
| "CreateTime", | ||
| "transient_lastDdlTime", | ||
| "grantTime", | ||
| "lastUpdateTime", | ||
| "last_modified_by", | ||
| "last_modified_time", | ||
| "Owner:", | ||
| "totalNumberFiles", | ||
| "maxFileSize", | ||
| "minFileSize" | ||
| ) | ||
| assert(targetTable.properties.filterKeys(!metastoreGeneratedProperties.contains(_)).isEmpty, | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove this check. And we can't check all meta properties here. For example Add some meta properties test at new UT. |
||
| "the table properties of source tables should not be copied in the created table") | ||
|
|
||
| provider match { | ||
| case Some(_) => | ||
| assert(targetTable.provider == provider) | ||
|
|
@@ -2763,7 +2750,6 @@ class HiveDDLSuite | |
| assert(source.properties("a") == "apple") | ||
| sql("CREATE TABLE t LIKE s STORED AS parquet TBLPROPERTIES('f'='foo', 'b'='bar')") | ||
| val table = catalog.getTableMetadata(TableIdentifier("t")) | ||
| assert(table.properties.get("a") === None) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems like we intentionally don't keep the table properties from the original table. @maropu @dongjoon-hyun @viirya are you OK with the behavior change proposed by this PR?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Based on the description of
So we don't say table properties are copied from source table too. Not sure about why we didn't copy table properties. I feel it is OK as seems copying original table properties should not be harmful. And we already copy storage properties. But we should update the doc together.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hm, I have the same opinion with @viirya; I couldn't find any strong reason not to copy the properites. Rather, is this a kind of bugs? Anyway, yea, we should clearly descibe the behaivour in our doc... |
||
| assert(table.properties("f") == "foo") | ||
| assert(table.properties("b") == "bar") | ||
| } | ||
|
|
@@ -2852,6 +2838,68 @@ class HiveDDLSuite | |
| } | ||
| } | ||
|
|
||
| test("SPARK-31828: Retain table properties at CreateTableLikeCommand") { | ||
| val catalog = spark.sessionState.catalog | ||
| withTable("t1", "t2", "t3", "t4", "t5", "t6") { | ||
| sql("CREATE TABLE t1(c1 int) USING hive TBLPROPERTIES('k1'='v1', 'k2'='v2')") | ||
| val t1 = catalog.getTableMetadata(TableIdentifier("t1")) | ||
| assert(t1.properties("k1") == "v1") | ||
| assert(t1.properties("k2") == "v2") | ||
| sql("CREATE TABLE t2 LIKE t1 TBLPROPERTIES('k2'='v3', 'k4'='v4')") | ||
| val t2 = catalog.getTableMetadata(TableIdentifier("t2")) | ||
| assert(t2.properties("k1") == "v1") | ||
| assert(t2.properties("k2") == "v3") | ||
| assert(t2.properties("k4") == "v4") | ||
| sql("CREATE TABLE t3 LIKE t1") | ||
| val t3 = catalog.getTableMetadata(TableIdentifier("t3")) | ||
| assert(t3.properties("k1") == "v1") | ||
| assert(t3.properties("k2") == "v2") | ||
cloud-fan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| sql( | ||
| """ | ||
| |CREATE TABLE t4 LIKE t1 STORED AS TEXTFILE | ||
| |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' | ||
| """.stripMargin) | ||
| val t4 = catalog.getTableMetadata(TableIdentifier("t4")) | ||
| assert(t4.properties("k1") == "v1") | ||
| assert(t4.properties("k2") == "v2") | ||
| sql("CREATE TABLE t5 LIKE t1 USING parquet") | ||
| val t5 = catalog.getTableMetadata(TableIdentifier("t5")) | ||
| assert(t5.properties.get("k1").isEmpty) | ||
| assert(t5.properties.get("k2").isEmpty) | ||
| sql("CREATE TABLE t6 LIKE t1 USING hive") | ||
| val t6 = catalog.getTableMetadata(TableIdentifier("t6")) | ||
| assert(t6.properties("k1") == "v1") | ||
| assert(t6.properties("k2") == "v2") | ||
| } | ||
|
|
||
| withView("v1") { | ||
| withTable("t1") { | ||
| sql("create view v1 as select 1") | ||
| sql("alter view v1 set tblproperties('k1'='v1')") | ||
| sql("create table t1 like v1") | ||
| val t1 = catalog.getTableMetadata(TableIdentifier("t1")) | ||
| assert(t1.properties.get("k1").isEmpty) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-31828: Filters out Hive metastore properties in CreateTableLikeCommand") { | ||
| val catalog = spark.sessionState.catalog | ||
| HiveClientImpl.HIVE_METASTORE_GENERATED_PROPERTIES.foreach { meta => | ||
| withTable("t1", "t2") { | ||
| val uuid = UUID.randomUUID().toString | ||
| sql(s"CREATE TABLE t1(c1 int) USING hive TBLPROPERTIES('$meta'='$uuid')") | ||
| val t1 = catalog.getTableMetadata(TableIdentifier("t1")) | ||
| // Removed in HiveClientImpl.toHiveTable, but they may be added by hive | ||
| assert(t1.properties.get(s"$meta").isEmpty || t1.properties(s"$meta") != s"$uuid") | ||
| sql("CREATE TABLE t2 LIKE t1") | ||
| val t2 = catalog.getTableMetadata(TableIdentifier("t2")) | ||
| // We don't copy source tbl metastore properties, but they may be added by hive | ||
| assert(t2.properties.get(s"$meta").isEmpty || t2.properties(s"$meta") != s"$uuid") | ||
| } | ||
| } | ||
| } | ||
|
|
||
| test("SPARK-31904: Fix case sensitive problem of char and varchar partition columns") { | ||
| withTable("t1", "t2") { | ||
| sql("CREATE TABLE t1(a STRING, B VARCHAR(10), C CHAR(10)) STORED AS parquet") | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Plz update the comment, too?
if the formats is the same..