[SPARK-25271][SQL] Hive ctas commands should use data source if it is convertible #22514
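For context, a minimal sketch of the kind of statement this change targets (the `spark` session and the table names are illustrative assumptions, not from the PR):

```scala
// Assumes a SparkSession created with .enableHiveSupport(); table names are made up.
spark.sql(
  """CREATE TABLE logs_archive STORED AS PARQUET
    |AS SELECT * FROM logs WHERE year = 2018""".stripMargin)
```

Per the discussion below, such a CTAS had been written through the Hive SerDe writer since the conversion was dropped in 2.3; with this patch, a non-partitioned Parquet/ORC CTAS is planned as OptimizedCreateHiveTableAsSelectCommand and written with Spark's built-in data source writer instead. The relevant part of the diff follows.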
@@ -31,8 +31,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoDir, InsertIntoTab
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.command.{CreateTableCommand, DDLUtils}
-import org.apache.spark.sql.execution.datasources.{CreateTable, LogicalRelation}
-import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat, ParquetOptions}
+import org.apache.spark.sql.execution.datasources.CreateTable
 import org.apache.spark.sql.hive.execution._
 import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}

@@ -181,62 +180,39 @@ case class RelationConversions(
     conf: SQLConf,
     sessionCatalog: HiveSessionCatalog) extends Rule[LogicalPlan] {
   private def isConvertible(relation: HiveTableRelation): Boolean = {
-    val serde = relation.tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT)
-    serde.contains("parquet") && conf.getConf(HiveUtils.CONVERT_METASTORE_PARQUET) ||
-      serde.contains("orc") && conf.getConf(HiveUtils.CONVERT_METASTORE_ORC)
+    isConvertible(relation.tableMeta)
   }
 
-  // Return true for Apache ORC and Hive ORC-related configuration names.
-  // Note that Spark doesn't support configurations like `hive.merge.orcfile.stripe.level`.
-  private def isOrcProperty(key: String) =
-    key.startsWith("orc.") || key.contains(".orc.")
-
-  private def isParquetProperty(key: String) =
-    key.startsWith("parquet.") || key.contains(".parquet.")
-
-  private def convert(relation: HiveTableRelation): LogicalRelation = {
-    val serde = relation.tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT)
-
-    // Consider table and storage properties. For properties existing in both sides, storage
-    // properties will supersede table properties.
-    if (serde.contains("parquet")) {
-      val options = relation.tableMeta.properties.filterKeys(isParquetProperty) ++
-        relation.tableMeta.storage.properties + (ParquetOptions.MERGE_SCHEMA ->
-        conf.getConf(HiveUtils.CONVERT_METASTORE_PARQUET_WITH_SCHEMA_MERGING).toString)
-      sessionCatalog.metastoreCatalog
-        .convertToLogicalRelation(relation, options, classOf[ParquetFileFormat], "parquet")
-    } else {
-      val options = relation.tableMeta.properties.filterKeys(isOrcProperty) ++
-        relation.tableMeta.storage.properties
-      if (conf.getConf(SQLConf.ORC_IMPLEMENTATION) == "native") {
-        sessionCatalog.metastoreCatalog.convertToLogicalRelation(
-          relation,
-          options,
-          classOf[org.apache.spark.sql.execution.datasources.orc.OrcFileFormat],
-          "orc")
-      } else {
-        sessionCatalog.metastoreCatalog.convertToLogicalRelation(
-          relation,
-          options,
-          classOf[org.apache.spark.sql.hive.orc.OrcFileFormat],
-          "orc")
-      }
-    }
+  private def isConvertible(tableMeta: CatalogTable): Boolean = {
+    val serde = tableMeta.storage.serde.getOrElse("").toLowerCase(Locale.ROOT)
+    serde.contains("parquet") && SQLConf.get.getConf(HiveUtils.CONVERT_METASTORE_PARQUET) ||
+      serde.contains("orc") && SQLConf.get.getConf(HiveUtils.CONVERT_METASTORE_ORC)
   }
 
+  private val metastoreCatalog = sessionCatalog.metastoreCatalog
+
   override def apply(plan: LogicalPlan): LogicalPlan = {
     plan resolveOperators {
       // Write path
       case InsertIntoTable(r: HiveTableRelation, partition, query, overwrite, ifPartitionNotExists)
         // Inserting into partitioned table is not supported in Parquet/Orc data source (yet).
         if query.resolved && DDLUtils.isHiveTable(r.tableMeta) &&
           !r.isPartitioned && isConvertible(r) =>
-        InsertIntoTable(convert(r), partition, query, overwrite, ifPartitionNotExists)
+        InsertIntoTable(metastoreCatalog.convert(r), partition,
+          query, overwrite, ifPartitionNotExists)
 
       // Read path
       case relation: HiveTableRelation
         if DDLUtils.isHiveTable(relation.tableMeta) && isConvertible(relation) =>
-        convert(relation)
+        metastoreCatalog.convert(relation)
+
+      // CTAS
+      case CreateTable(tableDesc, mode, Some(query))
+        if DDLUtils.isHiveTable(tableDesc) && tableDesc.partitionColumnNames.isEmpty &&
+          isConvertible(tableDesc) =>
+        DDLUtils.checkDataColNames(tableDesc)
Contributor:
why do we need this?

Member (Author):
In HiveAnalysis, when transforming CreateTable to CreateHiveTableAsSelectCommand, it does this too. checkDataColNames checks whether any invalid character is used in a column name.
+        OptimizedCreateHiveTableAsSelectCommand(
+          tableDesc, query, query.output.map(_.name), mode)
     }
   }
 }
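To illustrate the checkDataColNames question in the inline thread above, here is a rough, hypothetical sketch of that kind of column-name validation; the helper name and the rejected character set are assumptions for illustration, not the actual Spark implementation:

```scala
// Hypothetical illustration: reject column names containing characters that
// some file formats cannot represent in field names (assumed character set).
def checkColumnNames(columnNames: Seq[String]): Unit = {
  val invalidChars = Seq(',', ';', '{', '}', '(', ')', '\n', '\t', '=')
  columnNames.foreach { name =>
    invalidChars.find(c => name.contains(c)).foreach { c =>
      throw new IllegalArgumentException(
        s"Column name '$name' contains an invalid character '$c'")
    }
  }
}

checkColumnNames(Seq("id", "event_time"))  // passes
// checkColumnNames(Seq("a,b"))            // would throw IllegalArgumentException
```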
Add an internal SQL conf here? The perf impact is huge. It could be better or worse.
Also add it to the migration guide and explain the behavior changes.
It's not a new optimization... It's an optimization we dropped in 2.3 by mistake.
I'm fine to add a config with default value true.
hmm, the optimization is already controlled by configs like HiveUtils.CONVERT_METASTORE_ORC and HiveUtils.CONVERT_METASTORE_PARQUET. Do we need another config for it?
I don't mind adding HiveUtils.CONVERT_METASTORE_ORC_CTAS; maybe we can do it in a followup?
ok.
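For reference, the flags mentioned above are exposed as SQL configuration keys. A small sketch (assuming a Hive-enabled SparkSession named `spark` and a made-up table) of how a user would turn the conversion off and fall back to the Hive SerDe write path:

```scala
// Existing flags behind HiveUtils.CONVERT_METASTORE_PARQUET / CONVERT_METASTORE_ORC.
spark.conf.set("spark.sql.hive.convertMetastoreParquet", "false")
spark.conf.set("spark.sql.hive.convertMetastoreOrc", "false")

// With the flags off, this CTAS is written through the Hive SerDe writer;
// with them on (the default for Parquet), it goes through the converted
// data source path restored by this patch.
spark.sql("CREATE TABLE events_copy STORED AS ORC AS SELECT * FROM events")
```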
Since the regression was already introduced, we need to add a conf and migration guide.
We usually don't write a migration guide for perf optimizations. Otherwise it's annoying to write one for each optimization and ask users to turn it off if something goes wrong. I think we only do that when there are known issues.
This is not only a perf optimization. It uses a different write path, so we could hit different limits/bugs that might be exposed after this code change. We should just make the community aware of this change.