Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
61dfe05
init
ulysses-you May 27, 2020
37ba3b3
fix
ulysses-you May 27, 2020
594a1bb
update doc
ulysses-you May 28, 2020
b30d62c
remove metastore proerties
ulysses-you May 29, 2020
29aba43
update doc
ulysses-you May 29, 2020
162627a
add comment
ulysses-you May 29, 2020
9eacf1e
add meta test
ulysses-you May 29, 2020
175d0e2
fix view
ulysses-you May 29, 2020
ed0877e
fix
ulysses-you May 29, 2020
dd04dcc
fix
ulysses-you May 30, 2020
4507e7f
fix
ulysses-you May 30, 2020
bc52948
fix
ulysses-you May 30, 2020
6b523e1
fix
ulysses-you May 31, 2020
0e79ed3
update doc
ulysses-you May 31, 2020
46d7a7b
fix comment
ulysses-you May 31, 2020
1edc619
Merge branch 'master' of https://github.com/apache/spark into SPARK-3…
ulysses-you Jun 24, 2020
78ff34f
update reserved properties
ulysses-you Jun 24, 2020
2251181
revert last commit
ulysses-you Jun 24, 2020
5c63477
check format
ulysses-you Jun 28, 2020
183c209
fix
ulysses-you Jun 28, 2020
fcc8b3b
update doc
ulysses-you Jun 29, 2020
006ec47
update doc
ulysses-you Jul 1, 2020
319fbfb
update doc and comment
ulysses-you Jul 1, 2020
7f9f685
remove useless properties in HiveClientImpl
ulysses-you Jul 1, 2020
93a0c75
add view test
ulysses-you Jul 1, 2020
3e8af07
add test
ulysses-you Jul 1, 2020
0a3b1cf
fix
ulysses-you Jul 1, 2020
1e1349f
fix
ulysses-you Jul 2, 2020
19f398b
update constants
ulysses-you Jul 8, 2020
03966ae
add using hive
ulysses-you Jul 10, 2020
dc00260
Merge branch 'master' of https://github.com/apache/spark into SPARK-3…
ulysses-you Nov 2, 2020
2d470fc
minor
ulysses-you Nov 9, 2020
4b55575
Merge branch 'master' of https://github.com/apache/spark into SPARK-3…
ulysses-you Nov 17, 2020
c45489a
Merge branch 'master' of https://github.com/apache/spark into SPARK-3…
ulysses-you Nov 24, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/sql-ref-syntax-ddl-create-table-like.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ CREATE TABLE [IF NOT EXISTS] table_identifier LIKE source_table_identifier
* **TBLPROPERTIES**

Table properties that have to be set are specified, such as `created.by.user`, `owner`, etc.
Note that a basic set of table properties defined in the source table is copied into the new table only if the table formats, including the data source provider and the storage input format, are the same.
If a specified property key already exists in the source table, the value from the source table is overwritten with the new one.

* **LOCATION**

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,9 @@ import org.apache.spark.sql.util.SchemaUtils
* are identical to the ones defined in the source table.
*
* The CatalogTable attributes copied from the source table are storage(inputFormat, outputFormat,
* serde, compressed, properties), schema, provider, partitionColumnNames, bucketSpec by default.
* serde, compressed, properties), schema, provider, partitionColumnNames, bucketSpec, tblproperties
* by default. Note that tblproperties are copied into the new table only if the table formats,
* including the data source provider and the storage input format, are the same.
*
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Plz update the comment, too? if the formats is the same..

* Use "CREATE TABLE t1 LIKE t2 USING file_format" to specify new provider for t1.
* For Hive compatibility, use "CREATE TABLE t1 LIKE t2 STORED AS hiveFormat"
Expand Down Expand Up @@ -116,6 +118,24 @@ case class CreateTableLikeCommand(
CatalogTableType.EXTERNAL
}

// Copy the source table properties only if the new table has the same format as the source
val needCopyProperties =
(provider.isEmpty || provider == sourceTableDesc.provider) &&
(fileFormat.inputFormat.isEmpty ||
fileFormat.inputFormat == sourceTableDesc.storage.inputFormat)

val newProperties = sourceTableDesc.tableType match {
case MANAGED | EXTERNAL if needCopyProperties =>
sourceTableDesc.properties ++ properties
case MANAGED | EXTERNAL =>
properties
case VIEW =>
// For view, we just use the new properties
properties
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keep view behavior as before. Hive also does not copy view properties.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit format;


    val newProperties = sourceTableDesc.tableType match {
      case MANAGED | EXTERNAL =>
        // Hive only retain the useful properties through serde class annotation.
        // For better compatible with Hive, we remove the metastore properties.
        sourceTableDesc.properties -- DDLUtils.METASTORE_GENERATED_PROPERTIES ++ properties

      case VIEW =>
        // For view, we just use new properties
        properties
    }

case other =>
throw new IllegalArgumentException(
s"Unknown table type is found at createTableLikeCommand: $other")
}
val newTableDesc =
CatalogTable(
identifier = targetTable,
Expand All @@ -125,7 +145,7 @@ case class CreateTableLikeCommand(
provider = newProvider,
partitionColumnNames = sourceTableDesc.partitionColumnNames,
bucketSpec = sourceTableDesc.bucketSpec,
properties = properties,
properties = newProperties,
tracksPartitionsInCatalog = sourceTableDesc.tracksPartitionsInCatalog)

catalog.createTable(newTableDesc, ifNotExists)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,10 @@ private[hive] object HiveClientImpl extends Logging {
hiveTable.setSerializationLib(
table.storage.serde.getOrElse("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe"))
table.storage.properties.foreach { case (k, v) => hiveTable.setSerdeParam(k, v) }
table.properties.foreach { case (k, v) => hiveTable.setProperty(k, v) }
// Hive only retains the useful properties through the serde class annotation.
// For better compatibility with Hive, we remove the metastore-generated properties.
val hiveProperties = table.properties -- HIVE_METASTORE_GENERATED_PROPERTIES
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It also affects createTable, but that seems fine.

hiveProperties.foreach { case (k, v) => hiveTable.setProperty(k, v) }
table.comment.foreach { c => hiveTable.setProperty("comment", c) }
// Hive will expand the view text, so it needs 2 fields: viewOriginalText and viewExpandedText.
// Since we don't expand the view text, but only add table properties, we map the `viewText` to
Expand Down Expand Up @@ -1222,6 +1225,14 @@ private[hive] object HiveClientImpl extends Logging {
StatsSetupConst.TOTAL_SIZE
)

// Visible for testing.
private[hive] val HIVE_METASTORE_GENERATED_PROPERTIES: Set[String] = Set(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After checking Hive 1.2 and Hive 2.3 again, I think we can remove just these 3 properties, so that we reduce the scope of influence.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I checked all the properties and made sure Hive modifies only these three properties.

hive_metastoreConstants.DDL_TIME,
// at org.apache.hadoop.hive.ql.exec.DDLTask.updateModifiedParameters()
"last_modified_by",
"last_modified_time"
)

def newHiveConf(
sparkConf: SparkConf,
hadoopConf: JIterable[JMap.Entry[String, String]],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package org.apache.spark.sql.hive.execution

import java.io.File
import java.net.URI
import java.util.UUID

import org.apache.hadoop.fs.Path
import org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER
Expand All @@ -38,6 +39,7 @@ import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.HiveExternalCatalog
import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET}
import org.apache.spark.sql.hive.client.HiveClientImpl
import org.apache.spark.sql.hive.orc.OrcFileOperator
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}
Expand Down Expand Up @@ -1542,21 +1544,6 @@ class HiveDDLSuite
assert(targetTable.unsupportedFeatures.isEmpty,
"the unsupportedFeatures in the create table must be empty")

val metastoreGeneratedProperties = Seq(
"CreateTime",
"transient_lastDdlTime",
"grantTime",
"lastUpdateTime",
"last_modified_by",
"last_modified_time",
"Owner:",
"totalNumberFiles",
"maxFileSize",
"minFileSize"
)
assert(targetTable.properties.filterKeys(!metastoreGeneratedProperties.contains(_)).isEmpty,
Copy link
Contributor Author

@ulysses-you ulysses-you May 29, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this check. We can't check all meta properties here. For example, Hive adds transient_lastDdlTime when a table is created, so that property always exists.

Add some meta properties test at new UT.

"the table properties of source tables should not be copied in the created table")

provider match {
case Some(_) =>
assert(targetTable.provider == provider)
Expand Down Expand Up @@ -2763,7 +2750,6 @@ class HiveDDLSuite
assert(source.properties("a") == "apple")
sql("CREATE TABLE t LIKE s STORED AS parquet TBLPROPERTIES('f'='foo', 'b'='bar')")
val table = catalog.getTableMetadata(TableIdentifier("t"))
assert(table.properties.get("a") === None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like we intentionally don't keep the table properties from the original table. @maropu @dongjoon-hyun @viirya are you OK with the behavior change proposed by this PR?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Based on the description of CreateTableLikeCommand:

The CatalogTable attributes copied from the source table are storage(inputFormat, outputFormat, serde, compressed, properties), schema, provider, partitionColumnNames, bucketSpec by default.

So we don't say table properties are copied from source table too. Not sure about why we didn't copy table properties.

I feel it is OK as seems copying original table properties should not be harmful. And we already copy storage properties.

But we should update the doc together.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hm, I have the same opinion as @viirya; I couldn't find any strong reason not to copy the properties. Rather, is this a kind of bug? Anyway, yea, we should clearly describe the behaviour in our doc...

assert(table.properties("f") == "foo")
assert(table.properties("b") == "bar")
}
Expand Down Expand Up @@ -2852,6 +2838,68 @@ class HiveDDLSuite
}
}

test("SPARK-31828: Retain table properties at CreateTableLikeCommand") {
  val sessionCatalog = spark.sessionState.catalog

  // Reads the properties currently recorded in the catalog for the given table.
  def propertiesOf(tableName: String) =
    sessionCatalog.getTableMetadata(TableIdentifier(tableName)).properties

  // Asserts that every expected key is present with exactly the expected value.
  def checkProperties(tableName: String, expected: (String, String)*): Unit = {
    val actual = propertiesOf(tableName)
    expected.foreach { case (key, value) => assert(actual(key) == value) }
  }

  // Asserts that none of the given keys is present on the table.
  def checkAbsent(tableName: String, keys: String*): Unit = {
    val actual = propertiesOf(tableName)
    keys.foreach { key => assert(actual.get(key).isEmpty) }
  }

  withTable("t1", "t2", "t3", "t4", "t5", "t6") {
    // Source table carrying two user-defined properties.
    sql("CREATE TABLE t1(c1 int) USING hive TBLPROPERTIES('k1'='v1', 'k2'='v2')")
    checkProperties("t1", "k1" -> "v1", "k2" -> "v2")

    // Explicit TBLPROPERTIES are expected to win over copied keys and to add new ones.
    sql("CREATE TABLE t2 LIKE t1 TBLPROPERTIES('k2'='v3', 'k4'='v4')")
    checkProperties("t2", "k1" -> "v1", "k2" -> "v3", "k4" -> "v4")

    // A plain LIKE is expected to keep the source properties.
    sql("CREATE TABLE t3 LIKE t1")
    checkProperties("t3", "k1" -> "v1", "k2" -> "v2")

    // An explicit STORED AS / ROW FORMAT clause is still expected to keep the properties here.
    sql(
      """
        |CREATE TABLE t4 LIKE t1 STORED AS TEXTFILE
        |ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
        |""".stripMargin)
    checkProperties("t4", "k1" -> "v1", "k2" -> "v2")

    // Switching to the parquet provider is expected to drop the source properties.
    sql("CREATE TABLE t5 LIKE t1 USING parquet")
    checkAbsent("t5", "k1", "k2")

    // Naming the hive provider explicitly is expected to keep the properties.
    sql("CREATE TABLE t6 LIKE t1 USING hive")
    checkProperties("t6", "k1" -> "v1", "k2" -> "v2")
  }

  // Properties set on a view are expected not to be copied to a table created LIKE it.
  withView("v1") {
    withTable("t1") {
      sql("create view v1 as select 1")
      sql("alter view v1 set tblproperties('k1'='v1')")
      sql("create table t1 like v1")
      checkAbsent("t1", "k1")
    }
  }
}

test("SPARK-31828: Filters out Hive metastore properties in CreateTableLikeCommand") {
  val sessionCatalog = spark.sessionState.catalog

  // True when the catalog reports `key` on `tableName` with exactly `value`.
  def carriesValue(tableName: String, key: String, value: String): Boolean = {
    val tableProperties = sessionCatalog.getTableMetadata(TableIdentifier(tableName)).properties
    tableProperties.get(key).exists(_ == value)
  }

  for (metaKey <- HiveClientImpl.HIVE_METASTORE_GENERATED_PROPERTIES) {
    withTable("t1", "t2") {
      // A random value distinguishes a user-supplied entry from one written by Hive itself.
      val marker = UUID.randomUUID().toString
      sql(s"CREATE TABLE t1(c1 int) USING hive TBLPROPERTIES('$metaKey'='$marker')")
      // The key may be absent or hold a different value (it may be re-added by Hive),
      // but it must not hold the value supplied by the user.
      assert(!carriesValue("t1", metaKey, marker))

      sql("CREATE TABLE t2 LIKE t1")
      // The same must hold for the table created LIKE the source table.
      assert(!carriesValue("t2", metaKey, marker))
    }
  }
}

test("SPARK-31904: Fix case sensitive problem of char and varchar partition columns") {
withTable("t1", "t2") {
sql("CREATE TABLE t1(a STRING, B VARCHAR(10), C CHAR(10)) STORED AS parquet")
Expand Down