[SPARK-25271][SQL] Hive ctas commands should use data source if it is convertible #22514
Changes from all commits
CreateHiveTableAsSelectCommand.scala:
@@ -20,32 +20,26 @@ package org.apache.spark.sql.hive.execution

 import scala.util.control.NonFatal

 import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
-import org.apache.spark.sql.catalyst.catalog.CatalogTable
-import org.apache.spark.sql.catalyst.expressions.Attribute
+import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.command.DataWritingCommand
+import org.apache.spark.sql.execution.command.{DataWritingCommand, DDLUtils}
+import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, InsertIntoHadoopFsRelationCommand, LogicalRelation}
+import org.apache.spark.sql.hive.HiveSessionCatalog

+trait CreateHiveTableAsSelectBase extends DataWritingCommand {
+  val tableDesc: CatalogTable
+  val query: LogicalPlan
+  val outputColumnNames: Seq[String]
+  val mode: SaveMode
-/**
- * Create table and insert the query result into it.
- *
- * @param tableDesc the Table Describe, which may contain serde, storage handler etc.
- * @param query the query whose result will be insert into the new relation
- * @param mode SaveMode
- */
-case class CreateHiveTableAsSelectCommand(
-    tableDesc: CatalogTable,
-    query: LogicalPlan,
-    outputColumnNames: Seq[String],
-    mode: SaveMode)
-  extends DataWritingCommand {

-  private val tableIdentifier = tableDesc.identifier
+  protected val tableIdentifier = tableDesc.identifier

   override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
Review thread on `override def run(...)`:

Contributor: Some more thoughts:

Member (Author): I think the table metadata created by data source CTAS and Hive CTAS are different?

Contributor: Then how about we create a special Hive CTAS command that follows the data source CTAS command but creates a Hive table?

Member (Author): I also thought about that, but then we would have two Hive CTAS commands. Is that acceptable to you?

Contributor: I'm OK with that, since we do have two different ways to do Hive CTAS.

Member (Author): I created a Hive CTAS command that writes via the data source path.
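The approach the thread converges on (a second, optimized Hive CTAS command) implies a planning-time choice between the two commands. Below is a minimal sketch of what that dispatch could look like; the rule name `ConvertHiveCtas`, the `isConvertible` helper, and the config key are illustrative assumptions, since the rule that performs the selection is not part of the diff shown here.

```scala
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.command.DDLUtils
import org.apache.spark.sql.execution.datasources.CreateTable
import org.apache.spark.sql.hive.execution.{CreateHiveTableAsSelectCommand, OptimizedCreateHiveTableAsSelectCommand}
import org.apache.spark.sql.internal.SQLConf

// Hypothetical rule: rewrite a Hive CTAS into the optimized command only when the
// target table's serde can be written as a HadoopFsRelation and conversion is enabled.
object ConvertHiveCtas extends Rule[LogicalPlan] {

  // Illustrative convertibility check: Parquet and ORC serdes only.
  private def isConvertible(table: CatalogTable): Boolean = {
    val serde = table.storage.serde.getOrElse("").toLowerCase
    serde.contains("parquet") || serde.contains("orc")
  }

  // Assumed config key per SPARK-25271; not shown in the diff above.
  private def conversionEnabled: Boolean =
    SQLConf.get.getConfString("spark.sql.hive.convertMetastoreCtas", "true").toBoolean

  override def apply(plan: LogicalPlan): LogicalPlan = plan.resolveOperators {
    case CreateTable(tableDesc, mode, Some(query))
        if DDLUtils.isHiveTable(tableDesc) && query.resolved &&
          tableDesc.partitionColumnNames.isEmpty &&
          isConvertible(tableDesc) && conversionEnabled =>
      // Convertible, non-partitioned Hive CTAS: write through the data source path.
      OptimizedCreateHiveTableAsSelectCommand(
        tableDesc, query, query.output.map(_.name), mode)

    case CreateTable(tableDesc, mode, Some(query)) if DDLUtils.isHiveTable(tableDesc) =>
      // Everything else keeps the Hive serde write path.
      CreateHiveTableAsSelectCommand(
        tableDesc, query, query.output.map(_.name), mode)
  }
}
```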
     val catalog = sparkSession.sessionState.catalog
-    if (catalog.tableExists(tableIdentifier)) {
+    val tableExists = catalog.tableExists(tableIdentifier)
+
+    if (tableExists) {
       assert(mode != SaveMode.Overwrite,
         s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")
@@ -57,15 +51,8 @@ case class CreateHiveTableAsSelectCommand(
         return Seq.empty
       }

-      // For CTAS, there is no static partition values to insert.
-      val partition = tableDesc.partitionColumnNames.map(_ -> None).toMap
-      InsertIntoHiveTable(
-        tableDesc,
-        partition,
-        query,
-        overwrite = false,
-        ifPartitionNotExists = false,
-        outputColumnNames = outputColumnNames).run(sparkSession, child)
+      val command = getWritingCommand(catalog, tableDesc, tableExists = true)
+      command.run(sparkSession, child)
     } else {
       // TODO ideally, we should get the output data ready first and then
       // add the relation into catalog, just in case of failure occurs while data
@@ -77,15 +64,8 @@ case class CreateHiveTableAsSelectCommand(
       try {
         // Read back the metadata of the table which was created just now.
         val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
-        // For CTAS, there is no static partition values to insert.
-        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
-        InsertIntoHiveTable(
-          createdTableMeta,
-          partition,
-          query,
-          overwrite = true,
-          ifPartitionNotExists = false,
-          outputColumnNames = outputColumnNames).run(sparkSession, child)
+        val command = getWritingCommand(catalog, createdTableMeta, tableExists = false)
+        command.run(sparkSession, child)
       } catch {
         case NonFatal(e) =>
           // drop the created table.
@@ -97,9 +77,89 @@ case class CreateHiveTableAsSelectCommand(

     Seq.empty[Row]
   }

+  // Returns `DataWritingCommand` which actually writes data into the table.
+  def getWritingCommand(
+      catalog: SessionCatalog,
+      tableDesc: CatalogTable,
+      tableExists: Boolean): DataWritingCommand
+
   override def argString: String = {
     s"[Database:${tableDesc.database}, " +
     s"TableName: ${tableDesc.identifier.table}, " +
     s"InsertIntoHiveTable]"
   }
 }

+/**
+ * Create table and insert the query result into it.
+ *
+ * @param tableDesc the table description, which may contain serde, storage handler etc.
+ * @param query the query whose result will be insert into the new relation
+ * @param mode SaveMode
+ */
+case class CreateHiveTableAsSelectCommand(
+    tableDesc: CatalogTable,
+    query: LogicalPlan,
+    outputColumnNames: Seq[String],
+    mode: SaveMode)
+  extends CreateHiveTableAsSelectBase {
+
+  override def getWritingCommand(
+      catalog: SessionCatalog,
+      tableDesc: CatalogTable,
+      tableExists: Boolean): DataWritingCommand = {
+    // For CTAS, there is no static partition values to insert.
+    val partition = tableDesc.partitionColumnNames.map(_ -> None).toMap
+    InsertIntoHiveTable(
+      tableDesc,
+      partition,
+      query,
+      overwrite = if (tableExists) false else true,
+      ifPartitionNotExists = false,
+      outputColumnNames = outputColumnNames)
+  }
+}
+
+/**
+ * Create table and insert the query result into it. This creates Hive table but inserts
+ * the query result into it by using data source.
+ *
+ * @param tableDesc the table description, which may contain serde, storage handler etc.
+ * @param query the query whose result will be insert into the new relation
+ * @param mode SaveMode
+ */
+case class OptimizedCreateHiveTableAsSelectCommand(
+    tableDesc: CatalogTable,
+    query: LogicalPlan,
+    outputColumnNames: Seq[String],
+    mode: SaveMode)
+  extends CreateHiveTableAsSelectBase {
+
+  override def getWritingCommand(
+      catalog: SessionCatalog,
+      tableDesc: CatalogTable,
+      tableExists: Boolean): DataWritingCommand = {
+    val metastoreCatalog = catalog.asInstanceOf[HiveSessionCatalog].metastoreCatalog
+    val hiveTable = DDLUtils.readHiveTable(tableDesc)
+
+    val hadoopRelation = metastoreCatalog.convert(hiveTable) match {
+      case LogicalRelation(t: HadoopFsRelation, _, _, _) => t
+      case _ => throw new AnalysisException(s"$tableIdentifier should be converted to " +
+        "HadoopFsRelation.")
+    }
+
+    InsertIntoHadoopFsRelationCommand(
+      hadoopRelation.location.rootPaths.head,
+      Map.empty, // We don't support to convert partitioned table.
+      false,
+      Seq.empty, // We don't support to convert partitioned table.
+      hadoopRelation.bucketSpec,
+      hadoopRelation.fileFormat,
+      hadoopRelation.options,
+      query,
+      if (tableExists) mode else SaveMode.Overwrite,
+      Some(tableDesc),
+      Some(hadoopRelation.location),
+      query.output.map(_.name))
+  }
+}
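To make the intended effect of the two commands concrete, here is a small usage sketch. The config name `spark.sql.hive.convertMetastoreCtas` is the flag SPARK-25271 adds for toggling the conversion; since the diff above does not include HiveUtils, treat the exact name and its default as assumptions rather than confirmed by this excerpt.

```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .enableHiveSupport()
  .appName("ctas-conversion-demo")
  .getOrCreate()

// With conversion enabled, a Parquet/ORC Hive CTAS should be planned as
// OptimizedCreateHiveTableAsSelectCommand and written via InsertIntoHadoopFsRelationCommand.
spark.sql("SET spark.sql.hive.convertMetastoreCtas=true")
spark.sql("CREATE TABLE t_converted STORED AS PARQUET AS SELECT 1 AS id")

// With conversion disabled, the same statement keeps the Hive serde write path
// (CreateHiveTableAsSelectCommand + InsertIntoHiveTable).
spark.sql("SET spark.sql.hive.convertMetastoreCtas=false")
spark.sql("CREATE TABLE t_hive_serde STORED AS PARQUET AS SELECT 1 AS id")

// EXPLAIN shows which CTAS command the planner picked for a given statement.
spark.sql("EXPLAIN CREATE TABLE t_explain STORED AS PARQUET AS SELECT 1 AS id").show(false)
```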
Review thread:

Reviewer: Why do we need this?

Author: In HiveAnalysis, when transforming CreateTable to CreateHiveTableAsSelectCommand, it does this too. checkDataColNames checks whether any invalid character is used in a column name.
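A small sketch of the validation the reply refers to. The exact call site is not shown in this diff, and `validateCtasColumns` is only an illustrative wrapper around `DDLUtils.checkDataColNames`.

```scala
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.DDLUtils

// Validate the would-be Hive table's column names before converting the CTAS.
// A CTAS table description starts with an empty schema, so the query's schema is
// copied in first; checkDataColNames then rejects column names the target serde
// cannot store.
def validateCtasColumns(tableDesc: CatalogTable, query: LogicalPlan): Unit = {
  DDLUtils.checkDataColNames(tableDesc.copy(schema = query.schema))
}
```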