[SPARK-18566][SQL] remove OverwriteOptions #15995
@@ -24,16 +24,15 @@ import org.apache.hadoop.fs.Path | |
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.rdd.RDD | ||
| import org.apache.spark.sql._ | ||
| import org.apache.spark.sql.catalyst.{CatalystConf, CatalystTypeConverters, InternalRow, TableIdentifier} | ||
| import org.apache.spark.sql.catalyst.{CatalystConf, CatalystTypeConverters, InternalRow} | ||
| import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala | ||
| import org.apache.spark.sql.catalyst.analysis._ | ||
| import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTablePartition, SimpleCatalogRelation} | ||
| import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec | ||
| import org.apache.spark.sql.catalyst.expressions | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.planning.PhysicalOperation | ||
| import org.apache.spark.sql.catalyst.plans.logical | ||
| import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Union} | ||
| import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, Project} | ||
| import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, UnknownPartitioning} | ||
| import org.apache.spark.sql.catalyst.rules.Rule | ||
| import org.apache.spark.sql.execution.{RowDataSourceScanExec, SparkPlan} | ||
|
|
@@ -100,7 +99,7 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { | |
| None | ||
| } else if (potentialSpecs.size == 1) { | ||
| val partValue = potentialSpecs.head._2 | ||
| Some(Alias(Cast(Literal(partValue), field.dataType), "_staticPart")()) | ||
| Some(Alias(Cast(Literal(partValue), field.dataType), field.name)()) | ||
| } else { | ||
| throw new AnalysisException( | ||
| s"Partition column ${field.name} have multiple values specified, " + | ||
|
|
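The hunk above changes the alias given to a static partition value from the placeholder `_staticPart` to the partition column's own name. A minimal sketch of what the new expression looks like, assuming a hypothetical INT partition column `b` with static value `2` (plain Catalyst expression construction, for illustration only):

```scala
import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Literal}
import org.apache.spark.sql.types.IntegerType

// Static partition values arrive as strings from the PARTITION clause; the rule
// casts them to the column's data type and, after this PR, names the alias after
// the column itself ("b") instead of the old "_staticPart" placeholder.
val staticPartExpr = Alias(Cast(Literal("2"), IntegerType), "b")()
```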
@@ -128,61 +127,75 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { | |
| projectList | ||
| } | ||
|
|
||
| /** | ||
| * Returns true if the [[InsertIntoTable]] plan has already been preprocessed by analyzer rule | ||
| * [[PreprocessTableInsertion]]. It is important that this rule([[DataSourceAnalysis]]) has to | ||
| * be run after [[PreprocessTableInsertion]], to normalize the column names in partition spec and | ||
| * fix the schema mismatch by adding Cast. | ||
| */ | ||
| private def hasBeenPreprocessed( | ||
| tableOutput: Seq[Attribute], | ||
| partSchema: StructType, | ||
| partSpec: Map[String, Option[String]], | ||
| query: LogicalPlan): Boolean = { | ||
| val partColNames = partSchema.map(_.name).toSet | ||
| query.resolved && partSpec.keys.forall(partColNames.contains) && { | ||
|
Contributor: Is it necessary to check that the keys are all valid columns?

Contributor: Ah, is the issue to avoid this running before PreprocessTableInsertion?

Contributor (Author): yup
||
| val staticPartCols = partSpec.filter(_._2.isDefined).keySet | ||
| val expectedColumns = tableOutput.filterNot(a => staticPartCols.contains(a.name)) | ||
| expectedColumns.toStructType.sameType(query.schema) | ||
|
Contributor: similar question, when is this false?

Contributor (Author): this is to follow the previous condition: https://github.com/apache/spark/pull/15995/files#diff-d99813bd5bbc18277e4090475e4944cfL166 This can be caused if users issue an invalid command, e.g. …
||
| } | ||
| } | ||
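For illustration, a standalone sketch of the check this helper performs, using plain Scala stand-ins for the Catalyst types (hypothetical names, not the actual implementation):

```scala
// Simplified model of hasBeenPreprocessed: every key in the partition spec must
// be a known partition column, and the query's columns must match the table's
// columns minus the statically specified partition columns.
def looksPreprocessed(
    tableCols: Seq[String],
    partCols: Set[String],
    partSpec: Map[String, Option[String]],
    queryCols: Seq[String]): Boolean = {
  val staticPartCols = partSpec.filter(_._2.isDefined).keySet
  val expectedCols = tableCols.filterNot(staticPartCols.contains)
  partSpec.keys.forall(partCols.contains) && expectedCols == queryCols
}
```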
|
|
||
| override def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| // If the InsertIntoTable command is for a partitioned HadoopFsRelation and | ||
| // the user has specified static partitions, we add a Project operator on top of the query | ||
| // to include those constant column values in the query result. | ||
| // | ||
| // Example: | ||
| // Let's say that we have a table "t", which is created by | ||
| // CREATE TABLE t (a INT, b INT, c INT) USING parquet PARTITIONED BY (b, c) | ||
| // The statement of "INSERT INTO TABLE t PARTITION (b=2, c) SELECT 1, 3" | ||
| // will be converted to "INSERT INTO TABLE t PARTITION (b, c) SELECT 1, 2, 3". | ||
| // | ||
| // Basically, we will put those partition columns having a assigned value back | ||
| // to the SELECT clause. The output of the SELECT clause is organized as | ||
| // normal_columns static_partitioning_columns dynamic_partitioning_columns. | ||
| // static_partitioning_columns are partitioning columns having assigned | ||
| // values in the PARTITION clause (e.g. b in the above example). | ||
| // dynamic_partitioning_columns are partitioning columns that do not assigned | ||
| // values in the PARTITION clause (e.g. c in the above example). | ||
| case insert @ logical.InsertIntoTable( | ||
| relation @ LogicalRelation(t: HadoopFsRelation, _, _), parts, query, overwrite, false) | ||
| if query.resolved && parts.exists(_._2.isDefined) => | ||
|
|
||
| val projectList = convertStaticPartitions( | ||
| sourceAttributes = query.output, | ||
| providedPartitions = parts, | ||
| targetAttributes = relation.output, | ||
| targetPartitionSchema = t.partitionSchema) | ||
|
|
||
| // We will remove all assigned values to static partitions because they have been | ||
| // moved to the projectList. | ||
| insert.copy(partition = parts.map(p => (p._1, None)), child = Project(projectList, query)) | ||
|
|
||
|
|
||
| case logical.InsertIntoTable( | ||
| l @ LogicalRelation(t: HadoopFsRelation, _, table), _, query, overwrite, false) | ||
| if query.resolved && t.schema.sameType(query.schema) => | ||
|
|
||
| // Sanity checks | ||
| case InsertIntoTable( | ||
| l @ LogicalRelation(t: HadoopFsRelation, _, table), parts, query, overwrite, false) | ||
| if hasBeenPreprocessed(l.output, t.partitionSchema, parts, query) => | ||
|
|
||
| // If the InsertIntoTable command is for a partitioned HadoopFsRelation and | ||
| // the user has specified static partitions, we add a Project operator on top of the query | ||
| // to include those constant column values in the query result. | ||
| // | ||
| // Example: | ||
| // Let's say that we have a table "t", which is created by | ||
| // CREATE TABLE t (a INT, b INT, c INT) USING parquet PARTITIONED BY (b, c) | ||
| // The statement of "INSERT INTO TABLE t PARTITION (b=2, c) SELECT 1, 3" | ||
| // will be converted to "INSERT INTO TABLE t PARTITION (b, c) SELECT 1, 2, 3". | ||
| // | ||
| // Basically, we will put those partition columns having a assigned value back | ||
| // to the SELECT clause. The output of the SELECT clause is organized as | ||
| // normal_columns static_partitioning_columns dynamic_partitioning_columns. | ||
| // static_partitioning_columns are partitioning columns having assigned | ||
| // values in the PARTITION clause (e.g. b in the above example). | ||
| // dynamic_partitioning_columns are partitioning columns that do not assigned | ||
| // values in the PARTITION clause (e.g. c in the above example). | ||
| val actualQuery = if (parts.exists(_._2.isDefined)) { | ||
| val projectList = convertStaticPartitions( | ||
| sourceAttributes = query.output, | ||
| providedPartitions = parts, | ||
| targetAttributes = l.output, | ||
| targetPartitionSchema = t.partitionSchema) | ||
| Project(projectList, query) | ||
| } else { | ||
| query | ||
| } | ||
|
|
||
| // Sanity check | ||
| if (t.location.rootPaths.size != 1) { | ||
| throw new AnalysisException( | ||
| "Can only write data to relations with a single path.") | ||
| throw new AnalysisException("Can only write data to relations with a single path.") | ||
| } | ||
|
|
||
| val outputPath = t.location.rootPaths.head | ||
| val inputPaths = query.collect { | ||
| val inputPaths = actualQuery.collect { | ||
| case LogicalRelation(r: HadoopFsRelation, _, _) => r.location.rootPaths | ||
| }.flatten | ||
|
|
||
| val mode = if (overwrite.enabled) SaveMode.Overwrite else SaveMode.Append | ||
| if (overwrite.enabled && inputPaths.contains(outputPath)) { | ||
| val mode = if (overwrite) SaveMode.Overwrite else SaveMode.Append | ||
| if (overwrite && inputPaths.contains(outputPath)) { | ||
| throw new AnalysisException( | ||
| "Cannot overwrite a path that is also being read from.") | ||
| } | ||
|
|
||
| val partitionSchema = query.resolve( | ||
| val partitionSchema = actualQuery.resolve( | ||
| t.partitionSchema, t.sparkSession.sessionState.analyzer.resolver) | ||
| val partitionsTrackedByCatalog = | ||
| t.sparkSession.sessionState.conf.manageFilesourcePartitions && | ||
|
|
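To make the example in the rule's comment concrete, the static-partition path can be exercised from a Spark session roughly like this (a sketch mirroring the table and values used in the comment above):

```scala
// `spark` is an existing SparkSession (e.g. the one provided by spark-shell).
// "b" gets a static value (b = 2) while "c" stays dynamic; the rule projects the
// constant 2 back into the SELECT output so the insert sees columns (a, b, c).
spark.sql("CREATE TABLE t (a INT, b INT, c INT) USING parquet PARTITIONED BY (b, c)")
spark.sql("INSERT INTO TABLE t PARTITION (b = 2, c) SELECT 1, 3")
```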
@@ -192,19 +205,13 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { | |
| var initialMatchingPartitions: Seq[TablePartitionSpec] = Nil | ||
| var customPartitionLocations: Map[TablePartitionSpec, String] = Map.empty | ||
|
|
||
| val staticPartitionKeys: TablePartitionSpec = if (overwrite.enabled) { | ||
| overwrite.staticPartitionKeys.map { case (k, v) => | ||
| (partitionSchema.map(_.name).find(_.equalsIgnoreCase(k)).get, v) | ||
| } | ||
| } else { | ||
| Map.empty | ||
| } | ||
| val staticPartitions = parts.filter(_._2.nonEmpty).map { case (k, v) => k -> v.get } | ||
|
Contributor (Author): The column names in partition spec are already normalized in PreprocessTableInsertion.
|
||
|
|
||
| // When partitions are tracked by the catalog, compute all custom partition locations that | ||
| // may be relevant to the insertion job. | ||
| if (partitionsTrackedByCatalog) { | ||
| val matchingPartitions = t.sparkSession.sessionState.catalog.listPartitions( | ||
| l.catalogTable.get.identifier, Some(staticPartitionKeys)) | ||
| l.catalogTable.get.identifier, Some(staticPartitions)) | ||
| initialMatchingPartitions = matchingPartitions.map(_.spec) | ||
| customPartitionLocations = getCustomPartitionLocations( | ||
| t.sparkSession, l.catalogTable.get, outputPath, matchingPartitions) | ||
|
|
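As a small illustration of the `staticPartitions` line above, with a hypothetical partition spec corresponding to `PARTITION (b = 2, c)`:

```scala
// `parts` maps each partition column to an optional static value: "b" is static,
// "c" is dynamic. Keeping only the entries that carry a value yields the static
// partition spec, whose keys were already normalized by PreprocessTableInsertion.
val parts: Map[String, Option[String]] = Map("b" -> Some("2"), "c" -> None)
val staticPartitions: Map[String, String] =
  parts.filter(_._2.nonEmpty).map { case (k, v) => k -> v.get }
// staticPartitions == Map("b" -> "2")
```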
@@ -220,7 +227,7 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { | |
| l.catalogTable.get.identifier, newPartitions.toSeq.map(p => (p, None)), | ||
| ifNotExists = true).run(t.sparkSession) | ||
| } | ||
| if (overwrite.enabled) { | ||
| if (overwrite) { | ||
| val deletedPartitions = initialMatchingPartitions.toSet -- updatedPartitions | ||
| if (deletedPartitions.nonEmpty) { | ||
| AlterTableDropPartitionCommand( | ||
|
|
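For the overwrite branch above, a quick sketch (with hypothetical partition specs) of how the partitions to drop are derived:

```scala
// Partitions that matched the static spec before the job ran, minus the ones the
// job actually wrote, are stale under INSERT OVERWRITE and get dropped.
val initialMatchingPartitions = Set(Map("b" -> "2", "c" -> "1"), Map("b" -> "2", "c" -> "2"))
val updatedPartitions = Set(Map("b" -> "2", "c" -> "1"))
val deletedPartitions = initialMatchingPartitions -- updatedPartitions
// deletedPartitions == Set(Map("b" -> "2", "c" -> "2"))
```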
@@ -235,14 +242,14 @@ case class DataSourceAnalysis(conf: CatalystConf) extends Rule[LogicalPlan] { | |
|
|
||
| val insertCmd = InsertIntoHadoopFsRelationCommand( | ||
| outputPath, | ||
| staticPartitionKeys, | ||
| staticPartitions, | ||
| customPartitionLocations, | ||
| partitionSchema, | ||
| t.bucketSpec, | ||
| t.fileFormat, | ||
| refreshPartitionsCallback, | ||
| t.options, | ||
| query, | ||
| actualQuery, | ||
| mode, | ||
| table) | ||
|
|
||
|
|
@@ -305,7 +312,7 @@ class FindDataSourceTable(sparkSession: SparkSession) extends Rule[LogicalPlan] | |
| } | ||
|
|
||
| override def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| case i @ logical.InsertIntoTable(s: SimpleCatalogRelation, _, _, _, _) | ||
| case i @ InsertIntoTable(s: SimpleCatalogRelation, _, _, _, _) | ||
| if DDLUtils.isDatasourceTable(s.metadata) => | ||
| i.copy(table = readDataSourceTable(sparkSession, s)) | ||
|
|
||
|
|
@@ -351,7 +358,7 @@ object DataSourceStrategy extends Strategy with Logging { | |
| Map.empty, | ||
| None) :: Nil | ||
|
|
||
| case i @ logical.InsertIntoTable(l @ LogicalRelation(t: InsertableRelation, _, _), | ||
| case InsertIntoTable(l @ LogicalRelation(t: InsertableRelation, _, _), | ||
| part, query, overwrite, false) if part.isEmpty => | ||
| ExecutedCommandExec(InsertIntoDataSourceCommand(l, query, overwrite)) :: Nil | ||
|
|
||
|
|
||
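With `OverwriteOptions` gone, the insert paths above only carry a plain Boolean, so mapping it to a `SaveMode` is a one-liner (a trivial sketch of the pattern used in the updated rule):

```scala
import org.apache.spark.sql.SaveMode

// The Boolean `overwrite` flag replaces the old OverwriteOptions wrapper.
def saveModeFor(overwrite: Boolean): SaveMode =
  if (overwrite) SaveMode.Overwrite else SaveMode.Append
```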
Add a comment about who preprocesses this?