Skip to content

Commit 58a8a37

Browse files
committed
[SPARK-20920][SQL] ForkJoinPool pools are leaked when writing hive tables with many partitions
## What changes were proposed in this pull request? Don't leave thread pool running from AlterTableRecoverPartitionsCommand DDL command ## How was this patch tested? Existing tests. Author: Sean Owen <[email protected]> Closes #18216 from srowen/SPARK-20920. (cherry picked from commit 7b7c85e) Signed-off-by: Sean Owen <[email protected]>
1 parent 03cc18b commit 58a8a37

File tree

1 file changed

+13
-8
lines changed
  • sql/core/src/main/scala/org/apache/spark/sql/execution/command

1 file changed

+13
-8
lines changed

sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package org.apache.spark.sql.execution.command
1919

2020
import scala.collection.{GenMap, GenSeq}
2121
import scala.collection.parallel.ForkJoinTaskSupport
22-
import scala.concurrent.forkjoin.ForkJoinPool
2322
import scala.util.control.NonFatal
2423

2524
import org.apache.hadoop.conf.Configuration
@@ -34,7 +33,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
3433
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
3534
import org.apache.spark.sql.execution.datasources.PartitioningUtils
3635
import org.apache.spark.sql.types._
37-
import org.apache.spark.util.SerializableConfiguration
36+
import org.apache.spark.util.{SerializableConfiguration, ThreadUtils}
3837

3938
// Note: The definition of these commands are based on the ones described in
4039
// https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
@@ -508,8 +507,15 @@ case class AlterTableRecoverPartitionsCommand(
508507
val threshold = spark.conf.get("spark.rdd.parallelListingThreshold", "10").toInt
509508
val hadoopConf = spark.sparkContext.hadoopConfiguration
510509
val pathFilter = getPathFilter(hadoopConf)
511-
val partitionSpecsAndLocs = scanPartitions(spark, fs, pathFilter, root, Map(),
512-
table.partitionColumnNames, threshold, spark.sessionState.conf.resolver)
510+
511+
val evalPool = ThreadUtils.newForkJoinPool("AlterTableRecoverPartitionsCommand", 8)
512+
val partitionSpecsAndLocs: Seq[(TablePartitionSpec, Path)] =
513+
try {
514+
scanPartitions(spark, fs, pathFilter, root, Map(), table.partitionColumnNames, threshold,
515+
spark.sessionState.conf.resolver, new ForkJoinTaskSupport(evalPool)).seq
516+
} finally {
517+
evalPool.shutdown()
518+
}
513519
val total = partitionSpecsAndLocs.length
514520
logInfo(s"Found $total partitions in $root")
515521

@@ -530,8 +536,6 @@ case class AlterTableRecoverPartitionsCommand(
530536
Seq.empty[Row]
531537
}
532538

533-
@transient private lazy val evalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8))
534-
535539
private def scanPartitions(
536540
spark: SparkSession,
537541
fs: FileSystem,
@@ -540,7 +544,8 @@ case class AlterTableRecoverPartitionsCommand(
540544
spec: TablePartitionSpec,
541545
partitionNames: Seq[String],
542546
threshold: Int,
543-
resolver: Resolver): GenSeq[(TablePartitionSpec, Path)] = {
547+
resolver: Resolver,
548+
evalTaskSupport: ForkJoinTaskSupport): GenSeq[(TablePartitionSpec, Path)] = {
544549
if (partitionNames.isEmpty) {
545550
return Seq(spec -> path)
546551
}
@@ -564,7 +569,7 @@ case class AlterTableRecoverPartitionsCommand(
564569
val value = ExternalCatalogUtils.unescapePathName(ps(1))
565570
if (resolver(columnName, partitionNames.head)) {
566571
scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(partitionNames.head -> value),
567-
partitionNames.drop(1), threshold, resolver)
572+
partitionNames.drop(1), threshold, resolver, evalTaskSupport)
568573
} else {
569574
logWarning(
570575
s"expected partition column ${partitionNames.head}, but got ${ps(0)}, ignoring it")

0 commit comments

Comments
 (0)