[SPARK-25240][SQL] Fix for a deadlock in RECOVER PARTITIONS #22233
Changes from all commits: 189e14d, bc660a0, 59a376d, 80f9e7d, 071de47
@@ -52,23 +52,24 @@ class InMemoryCatalogedDDLSuite extends DDLSuite with SharedSQLContext with Befo
   protected override def generateTable(
       catalog: SessionCatalog,
       name: TableIdentifier,
-      isDataSource: Boolean = true): CatalogTable = {
+      isDataSource: Boolean = true,
+      partitionCols: Seq[String] = Seq("a", "b")): CatalogTable = {
     val storage =
       CatalogStorageFormat.empty.copy(locationUri = Some(catalog.defaultTablePath(name)))
     val metadata = new MetadataBuilder()
       .putString("key", "value")
       .build()
+    val schema = new StructType()
+      .add("col1", "int", nullable = true, metadata = metadata)
+      .add("col2", "string")
     CatalogTable(
       identifier = name,
       tableType = CatalogTableType.EXTERNAL,
       storage = storage,
-      schema = new StructType()
-        .add("col1", "int", nullable = true, metadata = metadata)
-        .add("col2", "string")
-        .add("a", "int")
-        .add("b", "int"),
+      schema = schema.copy(
+        fields = schema.fields ++ partitionCols.map(StructField(_, IntegerType))),
       provider = Some("parquet"),
-      partitionColumnNames = Seq("a", "b"),
+      partitionColumnNames = partitionCols,
       createTime = 0L,
       createVersion = org.apache.spark.SPARK_VERSION,
       tracksPartitionsInCatalog = true)
@@ -176,7 +177,8 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
   protected def generateTable(
       catalog: SessionCatalog,
       name: TableIdentifier,
-      isDataSource: Boolean = true): CatalogTable
+      isDataSource: Boolean = true,
+      partitionCols: Seq[String] = Seq("a", "b")): CatalogTable

   private val escapedIdentifier = "`(.+)`".r
@@ -228,8 +230,10 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
   private def createTable(
       catalog: SessionCatalog,
       name: TableIdentifier,
-      isDataSource: Boolean = true): Unit = {
-    catalog.createTable(generateTable(catalog, name, isDataSource), ignoreIfExists = false)
+      isDataSource: Boolean = true,
+      partitionCols: Seq[String] = Seq("a", "b")): Unit = {
+    catalog.createTable(
+      generateTable(catalog, name, isDataSource, partitionCols), ignoreIfExists = false)
   }

   private def createTablePartition(
@@ -1131,7 +1135,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
   }

   test("alter table: recover partition (parallel)") {
-    withSQLConf("spark.rdd.parallelListingThreshold" -> "1") {
+    withSQLConf("spark.rdd.parallelListingThreshold" -> "0") {
Member: @MaxGekk, out of curiosity, why does this have to be 0?

Member (Author): On the recursive calls, this condition (spark/sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala, lines 685 to 686 in 131ca14) sees statuses.length == 1 while the threshold is 1, so it leads to sequential listing of files. I just enforce parallel scanning even for a single file/folder.
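A minimal sketch of the reasoning above, assuming a simplified condition; the helper name and the reduced check are illustrative only and not the exact code in ddl.scala (the real check also factors in the number of remaining partition columns):

```scala
// Simplified paraphrase of the parallel-vs-sequential decision made while
// scanning partition directories during RECOVER PARTITIONS.
def listInParallel(childCount: Int, threshold: Int): Boolean =
  childCount > threshold

// Old test setting: threshold = 1. A recursive call that sees a single child
// directory gets childCount == 1, so the parallel branch is never taken.
listInParallel(childCount = 1, threshold = 1)  // false -> sequential listing
// New test setting: threshold = 0 forces the parallel path even for one entry.
listInParallel(childCount = 1, threshold = 0)  // true  -> parallel listing
```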
       testRecoverPartitions()
     }
   }
@@ -1144,23 +1148,32 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
     }

     val tableIdent = TableIdentifier("tab1")
-    createTable(catalog, tableIdent)
-    val part1 = Map("a" -> "1", "b" -> "5")
+    createTable(catalog, tableIdent, partitionCols = Seq("a", "b", "c"))
+    val part1 = Map("a" -> "1", "b" -> "5", "c" -> "19")
     createTablePartition(catalog, part1, tableIdent)
     assert(catalog.listPartitions(tableIdent).map(_.spec).toSet == Set(part1))

-    val part2 = Map("a" -> "2", "b" -> "6")
+    val part2 = Map("a" -> "2", "b" -> "6", "c" -> "31")
     val root = new Path(catalog.getTableMetadata(tableIdent).location)
     val fs = root.getFileSystem(spark.sessionState.newHadoopConf())
     // valid
-    fs.mkdirs(new Path(new Path(root, "a=1"), "b=5"))
-    fs.createNewFile(new Path(new Path(root, "a=1/b=5"), "a.csv"))  // file
-    fs.createNewFile(new Path(new Path(root, "a=1/b=5"), "_SUCCESS"))  // file
-    fs.mkdirs(new Path(new Path(root, "A=2"), "B=6"))
-    fs.createNewFile(new Path(new Path(root, "A=2/B=6"), "b.csv"))  // file
-    fs.createNewFile(new Path(new Path(root, "A=2/B=6"), "c.csv"))  // file
-    fs.createNewFile(new Path(new Path(root, "A=2/B=6"), ".hiddenFile"))  // file
-    fs.mkdirs(new Path(new Path(root, "A=2/B=6"), "_temporary"))
+    fs.mkdirs(new Path(new Path(new Path(root, "a=1"), "b=5"), "c=19"))
+    fs.createNewFile(new Path(new Path(root, "a=1/b=5/c=19"), "a.csv"))  // file
+    fs.createNewFile(new Path(new Path(root, "a=1/b=5/c=19"), "_SUCCESS"))  // file
+
+    fs.mkdirs(new Path(new Path(new Path(root, "A=2"), "B=6"), "C=31"))
+    fs.createNewFile(new Path(new Path(root, "A=2/B=6/C=31"), "b.csv"))  // file
+    fs.createNewFile(new Path(new Path(root, "A=2/B=6/C=31"), "c.csv"))  // file
+    fs.createNewFile(new Path(new Path(root, "A=2/B=6/C=31"), ".hiddenFile"))  // file
+    fs.mkdirs(new Path(new Path(root, "A=2/B=6/C=31"), "_temporary"))
+
+    val parts = (10 to 100).map { a =>
+      val part = Map("a" -> a.toString, "b" -> "5", "c" -> "42")
+      fs.mkdirs(new Path(new Path(new Path(root, s"a=$a"), "b=5"), "c=42"))
+      fs.createNewFile(new Path(new Path(root, s"a=$a/b=5/c=42"), "a.csv"))  // file
+      createTablePartition(catalog, part, tableIdent)
+      part
+    }

     // invalid
     fs.mkdirs(new Path(new Path(root, "a"), "b"))  // bad name
@@ -1174,7 +1187,7 @@ abstract class DDLSuite extends QueryTest with SQLTestUtils {
     try {
       sql("ALTER TABLE tab1 RECOVER PARTITIONS")
       assert(catalog.listPartitions(tableIdent).map(_.spec).toSet ==
-        Set(part1, part2))
+        Set(part1, part2) ++ parts)
       if (!isUsingHiveMetastore) {
         assert(catalog.getPartition(tableIdent, part1).parameters("numFiles") == "1")
         assert(catalog.getPartition(tableIdent, part2).parameters("numFiles") == "2")
@@ -60,7 +60,8 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA
   protected override def generateTable(
       catalog: SessionCatalog,
       name: TableIdentifier,
-      isDataSource: Boolean): CatalogTable = {
+      isDataSource: Boolean,
+      partitionCols: Seq[String] = Seq("a", "b")): CatalogTable = {
Member: The interface of this function looks strange. The original one is also hacky. We should refine them later.

Member: Yeah, please don't override a method with a default parameter. It's very easy to end up with different default values, and then which value gets picked up will depend on the type you are using...
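A hedged illustration of the pitfall being flagged; the class and method names below are invented for this example and do not come from the PR:

```scala
// Hypothetical example of the anti-pattern discussed above, not code from this PR.
abstract class BaseSuite {
  // Default declared on the abstract method...
  def makeTable(partitionCols: Seq[String] = Seq("a", "b")): String
}

class ConcreteSuite extends BaseSuite {
  // ...and re-declared on the override. If the two literals ever diverge,
  // the default that a bare call like makeTable() picks up is decided by
  // Scala's default-getter resolution, not by anything visible at the call
  // site, which makes the behaviour easy to get wrong during maintenance.
  override def makeTable(partitionCols: Seq[String] = Seq("a", "b")): String =
    partitionCols.mkString(",")
}
```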
     val storage =
       if (isDataSource) {
         val serde = HiveSerDe.sourceToSerDe("parquet")
@@ -84,17 +85,17 @@ class HiveCatalogedDDLSuite extends DDLSuite with TestHiveSingleton with BeforeA
     val metadata = new MetadataBuilder()
       .putString("key", "value")
       .build()
+    val schema = new StructType()
+      .add("col1", "int", nullable = true, metadata = metadata)
+      .add("col2", "string")
     CatalogTable(
       identifier = name,
       tableType = CatalogTableType.EXTERNAL,
       storage = storage,
-      schema = new StructType()
-        .add("col1", "int", nullable = true, metadata = metadata)
-        .add("col2", "string")
-        .add("a", "int")
-        .add("b", "int"),
+      schema = schema.copy(
+        fields = schema.fields ++ partitionCols.map(StructField(_, IntegerType))),
       provider = if (isDataSource) Some("parquet") else Some("hive"),
-      partitionColumnNames = Seq("a", "b"),
+      partitionColumnNames = partitionCols,
       createTime = 0L,
       createVersion = org.apache.spark.SPARK_VERSION,
       tracksPartitionsInCatalog = true)
A question about the changes in this file. Are they related to the work of this PR?

@gatorsmile Yes, the changes are related to an existing test that was modified to reproduce the issue. In particular, this line is needed to support an arbitrary number of partition columns.
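For reference, a small self-contained sketch of that mechanism, assuming made-up column values: the base schema is copied and one IntegerType field per partition column is appended, mirroring what the modified generateTable does.

```scala
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Base (data) columns of the test table.
val baseSchema = new StructType()
  .add("col1", "int")
  .add("col2", "string")

// Any number of partition columns can now be requested by the test.
val partitionCols = Seq("a", "b", "c")

// Append one int field per partition column, as the modified generateTable does.
val fullSchema = baseSchema.copy(
  fields = baseSchema.fields ++ partitionCols.map(StructField(_, IntegerType)))

// fullSchema.fieldNames: Array(col1, col2, a, b, c)
```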