[SPARK-31891][SQL] Support MSCK REPAIR TABLE .. [{ADD|DROP|SYNC} PARTITIONS]
#31499
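At a glance, this PR extends MSCK REPAIR TABLE with optional ADD, DROP, and SYNC PARTITIONS clauses. A quick usage sketch of the new syntax (the session setup and table name are hypothetical, for illustration only):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical session and table, purely to illustrate the new clauses.
val spark = SparkSession.builder().appName("msck-demo").enableHiveSupport().getOrCreate()

spark.sql("MSCK REPAIR TABLE ns.tbl")                  // default: behaves like ADD PARTITIONS
spark.sql("MSCK REPAIR TABLE ns.tbl ADD PARTITIONS")   // register new partition directories
spark.sql("MSCK REPAIR TABLE ns.tbl DROP PARTITIONS")  // drop partitions whose directories are gone
spark.sql("MSCK REPAIR TABLE ns.tbl SYNC PARTITIONS")  // both ADD and DROP
```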
Changes from all commits: c313bdb, 1168390, 76fe1e7, b5a4bab, dfc03ee, 79ec7a1, 7b9ee52, 20cd456, 0cf0574, fefea57
**DDLParserSuite.scala** — the old parser test is removed here; it moves into the new `MsckRepairTableParserSuite` added below:

```diff
@@ -1913,12 +1913,6 @@ class DDLParserSuite extends AnalysisTest {
       "missing 'COLUMNS' at '<EOF>'")
   }
 
-  test("MSCK REPAIR TABLE") {
-    comparePlans(
-      parsePlan("MSCK REPAIR TABLE a.b.c"),
-      RepairTable(UnresolvedTable(Seq("a", "b", "c"), "MSCK REPAIR TABLE", None)))
-  }
-
   test("LOAD DATA INTO table") {
     comparePlans(
       parsePlan("LOAD DATA INPATH 'filepath' INTO TABLE a.b.c"),
```
**ResolveSessionCatalog.scala** — `RepairTable` now carries the add/drop flags through to the v1 command, and `ALTER TABLE ... RECOVER PARTITIONS` keeps its add-only behavior:

```diff
@@ -376,8 +376,12 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager)
     case AnalyzeColumn(ResolvedV1TableOrViewIdentifier(ident), columnNames, allColumns) =>
       AnalyzeColumnCommand(ident.asTableIdentifier, columnNames, allColumns)
 
-    case RepairTable(ResolvedV1TableIdentifier(ident)) =>
-      AlterTableRecoverPartitionsCommand(ident.asTableIdentifier, "MSCK REPAIR TABLE")
+    case RepairTable(ResolvedV1TableIdentifier(ident), addPartitions, dropPartitions) =>
+      AlterTableRecoverPartitionsCommand(
+        ident.asTableIdentifier,
+        addPartitions,
+        dropPartitions,
+        "MSCK REPAIR TABLE")
 
     case LoadData(ResolvedV1TableIdentifier(ident), path, isLocal, isOverwrite, partition) =>
       LoadDataCommand(
```

**Member:** Just a question: can we propagate the original …

**Member (Author):** Unfortunately, we lost the info in …

```diff
@@ -420,6 +424,8 @@ class ResolveSessionCatalog(val catalogManager: CatalogManager)
     case AlterTableRecoverPartitions(ResolvedV1TableIdentifier(ident)) =>
       AlterTableRecoverPartitionsCommand(
         ident.asTableIdentifier,
+        enableAddPartitions = true,
+        enableDropPartitions = false,
         "ALTER TABLE RECOVER PARTITIONS")
 
     case AlterTableAddPartition(ResolvedV1TableIdentifier(ident), partSpecsAndLocs, ifNotExists) =>
```
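One consequence of the second hunk worth spelling out: `ALTER TABLE ... RECOVER PARTITIONS` and a bare `MSCK REPAIR TABLE` now resolve to the same command with identical flags, so the two statements below should behave the same (reusing the hypothetical session and table from the sketch near the top):

```scala
// Both resolve to AlterTableRecoverPartitionsCommand with
// enableAddPartitions = true and enableDropPartitions = false.
spark.sql("ALTER TABLE ns.tbl RECOVER PARTITIONS")
spark.sql("MSCK REPAIR TABLE ns.tbl")
```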
**AlterTableRecoverPartitionsCommand** — the command gains `enableAddPartitions`/`enableDropPartitions` flags, and its doc comment picks up the extended syntax:

```diff
@@ -597,11 +597,13 @@ case class PartitionStatistics(numFiles: Int, totalSize: Long)
  * The syntax of this command is:
  * {{{
  *   ALTER TABLE table RECOVER PARTITIONS;
- *   MSCK REPAIR TABLE table;
+ *   MSCK REPAIR TABLE table [{ADD|DROP|SYNC} PARTITIONS];
  * }}}
  */
 case class AlterTableRecoverPartitionsCommand(
     tableName: TableIdentifier,
+    enableAddPartitions: Boolean,
+    enableDropPartitions: Boolean,
     cmd: String = "ALTER TABLE RECOVER PARTITIONS") extends RunnableCommand {
 
   // These are list of statistics that can be collected quickly without requiring a scan of the data
```
In `run()`, partitions are dropped first (when enabled), the existing scan-and-add pipeline is gated behind `enableAddPartitions`, and the final log line reports both counts:

```diff
@@ -645,34 +647,40 @@ case class AlterTableRecoverPartitionsCommand(
     val hadoopConf = spark.sessionState.newHadoopConf()
     val fs = root.getFileSystem(hadoopConf)
 
-    val threshold = spark.sparkContext.conf.get(RDD_PARALLEL_LISTING_THRESHOLD)
-    val pathFilter = getPathFilter(hadoopConf)
-
-    val evalPool = ThreadUtils.newForkJoinPool("AlterTableRecoverPartitionsCommand", 8)
-    val partitionSpecsAndLocs: GenSeq[(TablePartitionSpec, Path)] =
-      try {
-        scanPartitions(spark, fs, pathFilter, root, Map(), table.partitionColumnNames, threshold,
-          spark.sessionState.conf.resolver, new ForkJoinTaskSupport(evalPool)).seq
-      } finally {
-        evalPool.shutdown()
-      }
-    val total = partitionSpecsAndLocs.length
-    logInfo(s"Found $total partitions in $root")
-
-    val partitionStats = if (spark.sqlContext.conf.gatherFastStats) {
-      gatherPartitionStats(spark, partitionSpecsAndLocs, fs, pathFilter, threshold)
-    } else {
-      GenMap.empty[String, PartitionStatistics]
-    }
-    logInfo(s"Finished to gather the fast stats for all $total partitions.")
-
-    addPartitions(spark, table, partitionSpecsAndLocs, partitionStats)
+    val droppedAmount = if (enableDropPartitions) {
+      dropPartitions(catalog, fs)
+    } else 0
+    val addedAmount = if (enableAddPartitions) {
+      val threshold = spark.sparkContext.conf.get(RDD_PARALLEL_LISTING_THRESHOLD)
+      val pathFilter = getPathFilter(hadoopConf)
+
+      val evalPool = ThreadUtils.newForkJoinPool("AlterTableRecoverPartitionsCommand", 8)
+      val partitionSpecsAndLocs: GenSeq[(TablePartitionSpec, Path)] =
+        try {
+          scanPartitions(spark, fs, pathFilter, root, Map(), table.partitionColumnNames, threshold,
+            spark.sessionState.conf.resolver, new ForkJoinTaskSupport(evalPool)).seq
+        } finally {
+          evalPool.shutdown()
+        }
+      val total = partitionSpecsAndLocs.length
+      logInfo(s"Found $total partitions in $root")
+
+      val partitionStats = if (spark.sqlContext.conf.gatherFastStats) {
+        gatherPartitionStats(spark, partitionSpecsAndLocs, fs, pathFilter, threshold)
+      } else {
+        GenMap.empty[String, PartitionStatistics]
+      }
+      logInfo(s"Finished to gather the fast stats for all $total partitions.")
+
+      addPartitions(spark, table, partitionSpecsAndLocs, partitionStats)
+      total
+    } else 0
     // Updates the table to indicate that its partition metadata is stored in the Hive metastore.
     // This is always the case for Hive format tables, but is not true for Datasource tables created
     // before Spark 2.1 unless they are converted via `msck repair table`.
     spark.sessionState.catalog.alterTable(table.copy(tracksPartitionsInCatalog = true))
     spark.catalog.refreshTable(tableIdentWithDB)
-    logInfo(s"Recovered all partitions ($total).")
+    logInfo(s"Recovered all partitions: added ($addedAmount), dropped ($droppedAmount).")
     Seq.empty[Row]
   }
 
```
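To see the restructured control flow without the diff noise, here is a tiny self-contained model of the new `run()` skeleton (a paraphrase with stubbed counts, not Spark code):

```scala
// Toy model of the new control flow; the stubs stand in for dropPartitions()
// and the scan / gather-stats / addPartitions pipeline.
object RepairFlowDemo {
  def dropMissingPartitions(): Int = 2  // stub: pretend 2 partitions had no directory
  def scanAndAddPartitions(): Int = 5   // stub: pretend 5 directories were registered

  def run(enableAddPartitions: Boolean, enableDropPartitions: Boolean): Unit = {
    val droppedAmount = if (enableDropPartitions) dropMissingPartitions() else 0
    val addedAmount = if (enableAddPartitions) scanAndAddPartitions() else 0
    println(s"Recovered all partitions: added ($addedAmount), dropped ($droppedAmount).")
  }

  def main(args: Array[String]): Unit = {
    run(enableAddPartitions = true, enableDropPartitions = false) // MSCK REPAIR TABLE (default/ADD)
    run(enableAddPartitions = true, enableDropPartitions = true)  // ... SYNC PARTITIONS
  }
}
```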
A new private helper implements the drop side: it checks the catalog's partition locations against the file system in parallel and drops the specs whose directories are gone:

```diff
@@ -791,8 +799,28 @@ case class AlterTableRecoverPartitionsCommand(
         logDebug(s"Recovered ${parts.length} partitions ($done/$total so far)")
       }
     }
   }
 
+  // Drops the partitions that do not exist in the file system
+  private def dropPartitions(catalog: SessionCatalog, fs: FileSystem): Int = {
+    val dropPartSpecs = ThreadUtils.parmap(
+      catalog.listPartitions(tableName),
+      "AlterTableRecoverPartitionsCommand: non-existing partitions",
+      maxThreads = 8) { partition =>
+      partition.storage.locationUri.flatMap { uri =>
+        if (fs.exists(new Path(uri))) None else Some(partition.spec)
+      }
+    }.flatten
+    catalog.dropPartitions(
+      tableName,
+      dropPartSpecs,
+      ignoreIfNotExists = true,
+      purge = false,
+      // Since we have already checked that partition directories do not exist, we can avoid
+      // additional calls to the file system at the catalog side by setting this flag.
+      retainData = true)
+    dropPartSpecs.length
+  }
 }
 
 /**
  * A command that sets the location of a table or a partition.
```

**Member:** Could you add a comment about the reason why we use `retainData = true`?

**Member (Author):** Yep, if we set …

**Member (Author):** The same for `sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/InMemoryCatalog.scala`, lines 464 to 473 at bfc0235.
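To make the `retainData` discussion above concrete, here is a self-contained sketch of the helper's logic — collect the specs whose directories no longer exist, then remove only the catalog entries (a toy catalog with made-up paths, not Spark's API):

```scala
import java.nio.file.{Files, Paths}
import scala.collection.mutable

object DropMissingPartitionsDemo {
  // Toy in-memory "catalog": partition spec -> directory location (hypothetical values).
  val catalog: mutable.Map[Map[String, String], String] = mutable.Map(
    Map("dt" -> "2021-01-01") -> "/tmp/tbl/dt=2021-01-01",
    Map("dt" -> "2021-01-02") -> "/tmp/tbl/dt=2021-01-02")

  def dropMissing(): Int = {
    // Mirrors the helper's shape: a spec is collected only if its directory is gone.
    val missing = catalog.collect {
      case (spec, location) if !Files.exists(Paths.get(location)) => spec
    }.toSeq
    // Drop only the metadata. The PR passes retainData = true for the same reason:
    // the directories are already known to be absent, so asking the catalog to also
    // remove data would only add needless file-system calls.
    missing.foreach(catalog.remove)
    missing.length
  }

  def main(args: Array[String]): Unit =
    println(s"Dropped ${dropMissing()} partition(s)")
}
```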
**MsckRepairTableParserSuite.scala (new file)** — a dedicated parser suite covers the default behavior and the three explicit clauses:

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.command

import org.apache.spark.sql.catalyst.analysis.{AnalysisTest, UnresolvedTable}
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser.parsePlan
import org.apache.spark.sql.catalyst.plans.logical.RepairTable

class MsckRepairTableParserSuite extends AnalysisTest {
  test("repair a table") {
    comparePlans(
      parsePlan("MSCK REPAIR TABLE a.b.c"),
      RepairTable(
        UnresolvedTable(Seq("a", "b", "c"), "MSCK REPAIR TABLE", None),
        enableAddPartitions = true,
        enableDropPartitions = false))
  }

  test("add partitions") {
    comparePlans(
      parsePlan("msck repair table ns.tbl add partitions"),
      RepairTable(
        UnresolvedTable(
          Seq("ns", "tbl"),
          "MSCK REPAIR TABLE ... ADD PARTITIONS",
          None),
        enableAddPartitions = true,
        enableDropPartitions = false))
  }

  test("drop partitions") {
    comparePlans(
      parsePlan("MSCK repair table TBL Drop Partitions"),
      RepairTable(
        UnresolvedTable(
          Seq("TBL"),
          "MSCK REPAIR TABLE ... DROP PARTITIONS",
          None),
        enableAddPartitions = false,
        enableDropPartitions = true))
  }

  test("sync partitions") {
    comparePlans(
      parsePlan("MSCK REPAIR TABLE spark_catalog.ns.tbl SYNC PARTITIONS"),
      RepairTable(
        UnresolvedTable(
          Seq("spark_catalog", "ns", "tbl"),
          "MSCK REPAIR TABLE ... SYNC PARTITIONS",
          None),
        enableAddPartitions = true,
        enableDropPartitions = true))
  }
}
```
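Taken together, the suite pins down how the parser maps each clause to the `RepairTable` flags:

| Clause | enableAddPartitions | enableDropPartitions |
|---|---|---|
| (none, the default) | true | false |
| ADD PARTITIONS | true | false |
| DROP PARTITIONS | false | true |
| SYNC PARTITIONS | true | true |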
Further review comments:

**Comment:** typo: should be "If not specified."

**Comment:** I think it's better to put it in the end, and say "If not specified, ADD is the default."

**Comment:** Here is the PR #31633