-
Notifications
You must be signed in to change notification settings - Fork 2.5k
[HUDI-6466] Fix spark insert overwrite partitioned table with dynamic partition #9113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 1 commit
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,7 +17,7 @@ | |
|
|
||
| package org.apache.spark.sql.hudi | ||
|
|
||
| import org.apache.hudi.{DataSourceWriteOptions, config} | ||
| import org.apache.hudi.DataSourceWriteOptions | ||
| import org.apache.hudi.DataSourceWriteOptions._ | ||
| import org.apache.hudi.HoodieConversionUtils.toProperties | ||
| import org.apache.hudi.common.config.{DFSPropertiesConfiguration, TypedProperties} | ||
|
|
@@ -39,6 +39,7 @@ import org.apache.spark.sql.hudi.HoodieSqlCommonUtils.{isHoodieConfigKey, isUsin | |
| import org.apache.spark.sql.hudi.ProvidesHoodieConfig.combineOptions | ||
| import org.apache.spark.sql.hudi.command.{SqlKeyGenerator, ValidateDuplicateKeyPayload} | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.internal.SQLConf.{PARTITION_OVERWRITE_MODE, PartitionOverwriteMode} | ||
| import org.apache.spark.sql.types.StructType | ||
|
|
||
| import java.util.Locale | ||
|
|
@@ -108,9 +109,9 @@ trait ProvidesHoodieConfig extends Logging { | |
| private def deduceSqlWriteOperation(isOverwritePartition: Boolean, isOverwriteTable: Boolean, | ||
| sqlWriteOperation: String): String = { | ||
| if (isOverwriteTable) { | ||
| WriteOperationType.INSERT_OVERWRITE_TABLE.name() | ||
| INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL | ||
| } else if (isOverwritePartition) { | ||
| WriteOperationType.INSERT_OVERWRITE.name() | ||
| INSERT_OVERWRITE_OPERATION_OPT_VAL | ||
| } else { | ||
| sqlWriteOperation | ||
| } | ||
|
|
@@ -208,7 +209,7 @@ trait ProvidesHoodieConfig extends Logging { | |
| // or when both configs are set, or when only sql write operation is set), we honor sql write operation and ignore | ||
| // the insert mode. | ||
| val useLegacyInsertModeFlow = insertModeSet && !sqlWriteOperationSet | ||
| val operation = combinedOpts.getOrElse(OPERATION.key, | ||
| var operation = combinedOpts.getOrElse(OPERATION.key, | ||
| if (useLegacyInsertModeFlow) { | ||
| // NOTE: Target operation could be overridden by the user, therefore if it has been provided as an input | ||
| // we'd prefer that value over auto-deduced operation. Otherwise, we deduce target operation type | ||
|
|
@@ -219,6 +220,32 @@ trait ProvidesHoodieConfig extends Logging { | |
| } | ||
| ) | ||
|
|
||
| val overwriteTableOpts = if (operation.equals(BULK_INSERT_OPERATION_OPT_VAL)) { | ||
| if (isOverwriteTable) { | ||
| Map(HoodieInternalConfig.BULKINSERT_OVERWRITE_OPERATION_TYPE.key -> WriteOperationType.INSERT_OVERWRITE_TABLE.value()) | ||
| } else if (isOverwritePartition) { | ||
| Map(HoodieInternalConfig.BULKINSERT_OVERWRITE_OPERATION_TYPE.key -> WriteOperationType.INSERT_OVERWRITE.value()) | ||
| } else { | ||
| Map() | ||
| } | ||
| } else if (operation.equals(INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL)) { | ||
| if (sqlWriteOperation.equals(BULK_INSERT_OPERATION_OPT_VAL) || enableBulkInsert) { | ||
| operation = BULK_INSERT_OPERATION_OPT_VAL | ||
| Map(HoodieInternalConfig.BULKINSERT_OVERWRITE_OPERATION_TYPE.key -> WriteOperationType.INSERT_OVERWRITE_TABLE.value()) | ||
| } else { | ||
| Map() | ||
| } | ||
| } else if (operation.equals(INSERT_OVERWRITE_OPERATION_OPT_VAL)) { | ||
| if (sqlWriteOperation.equals(BULK_INSERT_OPERATION_OPT_VAL) || enableBulkInsert) { | ||
| operation = BULK_INSERT_OPERATION_OPT_VAL | ||
| Map(HoodieInternalConfig.BULKINSERT_OVERWRITE_OPERATION_TYPE.key -> WriteOperationType.INSERT_OVERWRITE.value()) | ||
| } else { | ||
| Map() | ||
| } | ||
| } else { | ||
| Map() | ||
| } | ||
|
|
||
| // try to use new insert dup policy instead of legacy insert mode to deduce payload class. If only insert mode is explicitly specified, | ||
| // w/o specifying any value for insert dup policy, legacy configs will be honored. But on all other cases (i.e when neither of the configs is set, | ||
| // or when both configs are set, or when only insert dup policy is set), we honor insert dup policy and ignore the insert mode. | ||
|
|
@@ -257,17 +284,6 @@ trait ProvidesHoodieConfig extends Logging { | |
| null | ||
| } | ||
|
|
||
| val overwriteTableOpts = if (operation.equals(BULK_INSERT_OPERATION_OPT_VAL)) { | ||
| if (isOverwriteTable) { | ||
| Map(HoodieInternalConfig.BULKINSERT_OVERWRITE_OPERATION_TYPE.key -> WriteOperationType.INSERT_OVERWRITE_TABLE.value()) | ||
| } else if (isOverwritePartition) { | ||
| Map(HoodieInternalConfig.BULKINSERT_OVERWRITE_OPERATION_TYPE.key -> WriteOperationType.INSERT_OVERWRITE.value()) | ||
| } else { | ||
| Map() | ||
| } | ||
| } else { | ||
| Map() | ||
| } | ||
| val overridingOpts = extraOptions ++ Map( | ||
| "path" -> path, | ||
| TABLE_TYPE.key -> tableType, | ||
|
|
@@ -300,6 +316,45 @@ trait ProvidesHoodieConfig extends Logging { | |
| } | ||
| } | ||
|
|
||
| def deduceIsOverwriteTable(sparkSession: SparkSession, | ||
| catalogTable: HoodieCatalogTable, | ||
| partitionSpec: Map[String, Option[String]]): Boolean = { | ||
| val operation = sparkSession.sqlContext.getConf(OPERATION.key, "") | ||
| if (operation.nonEmpty && operation.equals(INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL)) { | ||
| true | ||
| } else if (operation.nonEmpty && operation.equals(INSERT_OVERWRITE_OPERATION_OPT_VAL)) { | ||
| false | ||
| } else { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use Scala match expression can be more clear? operation match {
case INSERT_OVERWRITE_TABLE_OPERATION_OPT_VAL =>
true
case INSERT_OPERATION_OPT_VAL =>
false
case _ =>
// NonPartitioned table always insert overwrite whole table
if (catalogTable.partitionFields.isEmpty) {
true
} else {
// Insert overwrite partitioned table with PARTITION clause will always insert overwrite the specific partition
if (partitionSpec.nonEmpty) {
false
} else {
// If hoodie.datasource.overwrite.mode configured, respect it
val hoodieOverwriteMode = sparkSession.sqlContext.getConf(OVERWRITE_MODE.key,
sparkSession.sqlContext.getConf(PARTITION_OVERWRITE_MODE.key)).toUpperCase()
hoodieOverwriteMode match {
case "STATIC" =>
true
case "DYNAMIC" =>
false
case _ =>
throw new IllegalArgumentException("xxx")
}
}
}
}
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the review, I'll refine the code later |
||
| // NonPartitioned table always insert overwrite whole table | ||
| if (catalogTable.partitionFields.isEmpty) { | ||
| true | ||
| } else { | ||
| // Insert overwrite partitioned table with PARTITION clause will always insert overwrite the specific partition | ||
| if (partitionSpec.nonEmpty) { | ||
| false | ||
| } else { | ||
| // If hoodie.datasource.overwrite.mode configured, respect it | ||
| val hoodieOverwriteMode = sparkSession.sqlContext.getConf(OVERWRITE_MODE.key, "").toUpperCase() | ||
| if (hoodieOverwriteMode.isEmpty) { | ||
| val sparkOverwriteMode = sparkSession.sqlContext.getConf(PARTITION_OVERWRITE_MODE.key).toUpperCase() | ||
| if (sparkOverwriteMode.equals(PartitionOverwriteMode.STATIC.toString)) { | ||
| true | ||
| } else { | ||
| false | ||
| } | ||
| } else { | ||
| OVERWRITE_MODE.checkValues(hoodieOverwriteMode) | ||
| if (hoodieOverwriteMode.equals("STATIC")) { | ||
| true | ||
| } else { | ||
| false | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| def buildHoodieDropPartitionsConfig(sparkSession: SparkSession, | ||
| hoodieCatalogTable: HoodieCatalogTable, | ||
| partitionsToDrop: String): Map[String, String] = { | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better use
combineOptions()here to keep the sources of HUDI configure consistent with others.