[SPARK-14922][SPARK-17732][SQL] ALTER TABLE DROP PARTITION should support comparators #19691
```diff
@@ -282,6 +282,26 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
     parts.toMap
   }

+  /**
+   * Create a partition filter specification.
+   */
+  def visitPartitionFilterSpec(ctx: PartitionSpecContext): Seq[Expression] = withOrigin(ctx) {
+    val parts = ctx.expression.asScala.map { pVal =>
+      expression(pVal) match {
+        case EqualNullSafe(_, _) =>
+          throw new ParseException("'<=>' operator is not allowed in partition specification.", ctx)
+        case cmp @ BinaryComparison(UnresolvedAttribute(name :: Nil), constant: Literal) =>
+          cmp
+        case bc @ BinaryComparison(constant: Literal, _) =>
+          throw new ParseException(s"Literal $constant is supported only on the right side.", ctx)
+        case _ =>
+          throw new ParseException(
+            s"Invalid partition filter specification (${pVal.getText}).", ctx)
+      }
+    }
+    parts
+  }
+
   /**
    * Create a partition specification map without optional values.
    */
```
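To see the shape of this check in isolation, here is a minimal, self-contained sketch of the same dispatch; `Expr`, `Attr`, `Lit`, `EqualNullSafe`, and `Cmp` are toy stand-ins, not Spark's Catalyst classes:

```scala
// Toy model of the visitor's dispatch; none of these types are Spark's.
sealed trait Expr
final case class Attr(name: String) extends Expr
final case class Lit(value: Any) extends Expr
final case class EqualNullSafe(left: Expr, right: Expr) extends Expr
final case class Cmp(op: String, left: Expr, right: Expr) extends Expr

def validatePartitionFilter(e: Expr): Cmp = e match {
  case EqualNullSafe(_, _) =>
    // '<=>' is rejected outright, as in the PR
    throw new IllegalArgumentException(
      "'<=>' operator is not allowed in partition specification.")
  case cmp @ Cmp(_, Attr(_), Lit(_)) =>
    cmp // attribute compared against a constant: accepted unchanged
  case Cmp(_, Lit(c), _) =>
    // mirrors the PR: constants are only accepted on the right-hand side
    throw new IllegalArgumentException(s"Literal $c is supported only on the right side.")
  case other =>
    throw new IllegalArgumentException(s"Invalid partition filter specification ($other).")
}

// validatePartitionFilter(Cmp("<", Attr("p"), Lit(2)))  // returns the comparison
// validatePartitionFilter(Cmp("<", Lit(2), Attr("p")))  // throws: literal on the left
```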
```diff
@@ -293,6 +313,15 @@ class AstBuilder(conf: SQLConf) extends SqlBaseBaseVisitor[AnyRef] with Logging
     }
   }

+  /**
+   * Create a partition specification map without optional values
+   * and a partition filter specification.
+   */
+  protected def visitPartition(
```
**Contributor:** Can we avoid this method? I find it quite confusing (I mean, it is a bit weird to return a tuple with a Map and a Seq of different things...). We can add a new parameter to …

**Author:** I tried to add a new parameter to … All of the partitions that need to be dropped are: … Using one tuple tells us that … If we don't use a tuple, it would be difficult to tell the different occasions apart, and it would be difficult to decide between … Any ideas to replace this?

**Contributor:** I see what you mean now. Yes, I have no better idea indeed. Thanks.
```diff
+      ctx: PartitionSpecContext): (Map[String, String], Seq[Expression]) = {
+    (visitNonOptionalPartitionSpec(ctx), visitPartitionFilterSpec(ctx))
+  }
+
   /**
    * Convert a constant of any type into a string. This is typically used in DDL commands, and its
    * main purpose is to prevent slight differences due to back to back conversions i.e.:
```
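The tuple defended in the thread above carries the two halves of one `PARTITION` clause. Schematically, it can take three shapes (all values here are invented, and `Seq[String]` is a placeholder for `Seq[Expression]`):

```scala
// Each PARTITION clause becomes one (spec, expressions) pair.
val exactOnly: (Map[String, String], Seq[String]) =
  (Map("year" -> "2017"), Seq.empty)                          // PARTITION (year = '2017')
val exprOnly: (Map[String, String], Seq[String]) =
  (Map.empty, Seq("day < '2017-07-01'"))                      // PARTITION (day < '2017-07-01')
val mixed: (Map[String, String], Seq[String]) =
  (Map("year" -> "2017"), Seq("day < '2017-07-01'"))          // both in one clause
```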
```diff
@@ -29,10 +29,10 @@ import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, Resolver}
+import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, Resolver, UnresolvedAttribute}
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
+import org.apache.spark.sql.catalyst.expressions.{And, Attribute, AttributeReference, BinaryComparison, Cast, Expression, Literal, PredicateHelper}
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PartitioningUtils}
 import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
```
```diff
@@ -500,7 +500,8 @@ case class AlterTableRenamePartitionCommand(
 }

 /**
- * Drop Partition in ALTER TABLE: to drop a particular partition for a table.
+ * Drop Partition in ALTER TABLE: to drop a particular partition
+ * or a set of partitions according to given expressions for a table.
  *
  * This removes the data and metadata for this partition.
  * The data is actually moved to the .Trash/Current directory if Trash is configured,
```
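For a concrete feel of what the change permits, here is a hedged usage sketch; the table `logs` and partition column `dt` are invented for illustration, and the authoritative syntax is the `{{{ ... }}}` block in the next hunk:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical Hive-backed session; `logs` is partitioned by a `dt` column.
val spark = SparkSession.builder().enableHiveSupport().getOrCreate()

// Before this PR: each partition must be named with an exact spec.
spark.sql("ALTER TABLE logs DROP IF EXISTS PARTITION (dt = '2017-06-01')")

// With this PR: a comparator selects a whole range of partitions at once.
spark.sql("ALTER TABLE logs DROP IF EXISTS PARTITION (dt < '2017-07-01')")
```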
```diff
@@ -510,40 +511,86 @@ case class AlterTableRenamePartitionCommand(
  *
  * The syntax of this command is:
  * {{{
- *   ALTER TABLE table DROP [IF EXISTS] PARTITION spec1[, PARTITION spec2, ...] [PURGE];
+ *   ALTER TABLE table DROP [IF EXISTS] PARTITION (spec1, expr1)
+ *     [, PARTITION (spec2, expr2), ...] [PURGE];
  * }}}
  */
 case class AlterTableDropPartitionCommand(
     tableName: TableIdentifier,
-    specs: Seq[TablePartitionSpec],
+    partitions: Seq[(TablePartitionSpec, Seq[Expression])],
     ifExists: Boolean,
     purge: Boolean,
     retainData: Boolean)
-  extends RunnableCommand {
+  extends RunnableCommand with PredicateHelper {

   override def run(sparkSession: SparkSession): Seq[Row] = {
     val catalog = sparkSession.sessionState.catalog
     val table = catalog.getTableMetadata(tableName)
+    val resolver = sparkSession.sessionState.conf.resolver
     DDLUtils.verifyAlterTableType(catalog, table, isView = false)
     DDLUtils.verifyPartitionProviderIsHive(sparkSession, table, "ALTER TABLE DROP PARTITION")

-    val normalizedSpecs = specs.map { spec =>
-      PartitioningUtils.normalizePartitionSpec(
-        spec,
-        table.partitionColumnNames,
-        table.identifier.quotedString,
-        sparkSession.sessionState.conf.resolver)
+    val toDrop = partitions.flatMap { partition =>
+      if (partition._1.isEmpty && !partition._2.isEmpty) {
+        // There are only expressions in this drop condition.
+        extractFromPartitionFilter(partition._2, catalog, table, resolver)
+      } else if (!partition._1.isEmpty && partition._2.isEmpty) {
+        // There are only partitionSpecs in this drop condition.
+        extractFromPartitionSpec(partition._1, table, resolver)
+      } else if (!partition._1.isEmpty && !partition._2.isEmpty) {
+        // This drop condition has both partitionSpecs and expressions.
+        extractFromPartitionFilter(partition._2, catalog, table, resolver).intersect(
```
**Contributor:** I think this may be quite inefficient if we have a lot of partitions. What about converting the …

**Author:** Yeah, I agree. And the hard part may be how to convert a …

**Contributor:** Why? Isn't it enough, something like … ?

**Author:** I mean how to define …

**Contributor:** I think we can (must) just have a single … where we have both a partition spec and an expression specification.

**Author:** Hi @mgaido91, there is one problem after I changed the syntax: … By the way, is a syntax like … ? Besides, I was wrong that day: I think the if conditions won't be inefficient if there are a lot of partitions. It may be inefficient if there are a lot of dropPartitionSpecs, which I don't think can happen easily.

**Contributor:** @DazhuangSu, sorry, I missed your last comment somehow. Why do you say it would not be inefficient if you have a lot of partitions? I think it would be! Imagine that you partition per year and day, and you want to drop the first 6 months of this year. The spec would be something like … , which would result in an empty Seq, so we would drop nothing. Moreover, I saw no test for this case. Can we add tests for this use case, and can we add support for it if my understanding that it is not working is right? Thanks.

**Author:** @mgaido91 I understand your point; yes, it would be inefficient. I will work on this soon.

**Contributor:** Thank you @DazhuangSu.
```diff
+          extractFromPartitionSpec(partition._1, table, resolver))
+      } else {
+        Seq.empty[TablePartitionSpec]
+      }
+    }

     catalog.dropPartitions(
-      table.identifier, normalizedSpecs, ignoreIfNotExists = ifExists, purge = purge,
+      table.identifier, toDrop, ignoreIfNotExists = ifExists, purge = purge,
       retainData = retainData)

     CommandUtils.updateTableStats(sparkSession, table)

     Seq.empty[Row]
   }
```
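The reviewer's empty-intersection concern can be checked with plain collections; a toy model in which `Map`s stand in for `TablePartitionSpec`:

```scala
type Spec = Map[String, String]

// Filter side: fully specified specs, as the catalog would return for `day < ...`.
val fromFilter: Seq[Spec] = Seq(
  Map("year" -> "2017", "day" -> "2017-01-01"),
  Map("year" -> "2017", "day" -> "2017-02-01"))

// Spec side: the user wrote PARTITION (year = '2017', day < ...), so the exact
// part is only the partial spec {year -> 2017}.
val fromSpec: Seq[Spec] = Seq(Map("year" -> "2017"))

// A partial map never equals a full map, so the intersection is empty and
// nothing is dropped -- matching the behavior questioned in the thread above.
assert(fromFilter.intersect(fromSpec).isEmpty)
```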
```diff
+  private def extractFromPartitionSpec(
+      specs: TablePartitionSpec,
+      table: CatalogTable,
+      resolver: Resolver): Seq[Map[String, String]] = {
+    Seq(PartitioningUtils.normalizePartitionSpec(
+      specs,
+      table.partitionColumnNames,
+      table.identifier.quotedString,
+      resolver))
+  }
```
```diff
+  private def extractFromPartitionFilter(
+      filters: Seq[Expression],
+      catalog: SessionCatalog,
+      table: CatalogTable,
+      resolver: Resolver): Seq[TablePartitionSpec] = {
+    val expressions = filters.map { expr =>
+      val (attrName, constant) = expr match {
+        case BinaryComparison(UnresolvedAttribute(name :: Nil), constant: Literal) =>
+          (name, constant)
+      }
+      if (!table.partitionColumnNames.exists(resolver(_, attrName))) {
+        throw new AnalysisException(s"${attrName} is not a valid partition column " +
+          s"in table ${table.identifier.quotedString}.")
+      }
+      val dataType = table.partitionSchema.apply(attrName).dataType
+      expr.withNewChildren(Seq(AttributeReference(attrName, dataType)(),
+        Cast(constant, dataType)))
```
**Contributor:** nit: can we add the cast only when needed, i.e. …?
```diff
+    }.reduce(And)
+    val parts = catalog.listPartitionsByFilter(
+      table.identifier, Seq(expressions)).map(_.spec)
+    if (parts.isEmpty && !ifExists) {
+      throw new AnalysisException(s"There is no partition for ${expressions.sql}")
+    }
+    parts
+  }
 }
```
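The helper folds the per-column comparisons into a single conjunctive predicate before asking the catalog for matching partitions. The same fold, modeled with ordinary functions standing in for Catalyst expressions and a plain `filter` standing in for `listPartitionsByFilter`:

```scala
// Toy predicates over a partition spec; PSpec stands in for TablePartitionSpec.
type PSpec = Map[String, String]
type Pred = PSpec => Boolean

// Analogue of folding the rewritten comparisons with And in the PR.
def and(l: Pred, r: Pred): Pred = spec => l(spec) && r(spec)

val filters: Seq[Pred] = Seq(
  spec => spec("year") == "2017",        // year = '2017'
  spec => spec("day") < "2017-07-01")    // day < '2017-07-01' (string order)

val combined: Pred = filters.reduce(and) // mirrors `.reduce(And)`

// Catalog stand-in: keep only the partitions satisfying the predicate.
val partitions = Seq(
  Map("year" -> "2017", "day" -> "2017-03-01"),
  Map("year" -> "2017", "day" -> "2017-09-01"))
val matched = partitions.filter(combined) // only the March partition matches
```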
Still the same question here: does the constant have to be on the right side?

Hive supports them only on the right side, so it makes sense to have the same here, I think.

If we support the right side only, it seems useful to print an explicit error message like `left-side literal not supported ...`?

We can also enforce this in the syntax, like here: https://github.com/apache/spark/pull/20999/files#diff-8c1cb2af4aa1109e08481dae79052cc3R269
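Besides rejecting left-side literals in the visitor or forbidding them in the grammar (the option linked above), a third option — not what this PR does, sketched here with the same toy types as earlier — would be to canonicalize by mirroring the comparison, so `2 < p` becomes `p > 2`:

```scala
sealed trait Expr
final case class Attr(name: String) extends Expr
final case class Lit(value: Any) extends Expr
final case class Cmp(op: String, left: Expr, right: Expr) extends Expr

// Rewrite a literal-on-the-left comparison into the canonical attr-op-literal
// form by mirroring the operator; symmetric operators are left unchanged.
def canonicalize(e: Cmp): Cmp = e match {
  case Cmp(op, l: Lit, a: Attr) =>
    val flipped = op match {
      case "<"   => ">"
      case "<="  => ">="
      case ">"   => "<"
      case ">="  => "<="
      case other => other // "=" and "!=" are symmetric
    }
    Cmp(flipped, a, l)
  case other => other
}

// canonicalize(Cmp("<", Lit(2), Attr("p"))) == Cmp(">", Attr("p"), Lit(2))
```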