
Commit 131ca14

MaxGekk authored and cloud-fan committed
[SPARK-24005][CORE] Remove usage of Scala’s parallel collection
## What changes were proposed in this pull request?

In this PR, I propose to replace Scala parallel collections with new `parmap()` methods. The methods use futures to transform a sequential collection by applying a lambda function to each element in parallel; the result of `parmap` is another regular (sequential) collection. The proposed `parmap` methods address the fact that Scala parallel collections cannot be interrupted, and interruptibility is needed for reliable task preemption.

## How was this patch tested?

A test was added to `ThreadUtilsSuite`.

Closes #21913 from MaxGekk/par-map.

Authored-by: Maxim Gekk <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
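For readers skimming the diff, here is a rough before/after sketch of the intended usage. This is not part of the patch: the input collection, pool name, and thread count are made up for illustration, and `ThreadUtils` is `private[spark]`, so the snippet only compiles inside Spark's own code base.

```scala
import org.apache.spark.util.ThreadUtils

object ParmapSketch {
  // Stand-in for some per-element work done on the pool.
  private def footerSize(path: String): Int = path.length

  def main(args: Array[String]): Unit = {
    val files = Seq("a.parquet", "b.parquet", "c.parquet")

    // Before: Scala parallel collections. Once started, the work cannot be
    // interrupted, which gets in the way of task preemption.
    // val sizes = files.par.map(footerSize).seq

    // After: parmap runs each element in a Future on a dedicated ForkJoinPool
    // ("demo-pool") and blocks in ThreadUtils.awaitResult, so interrupting the
    // calling thread cancels the wait, and the pool is shut down afterwards.
    val sizes: Seq[Int] = ThreadUtils.parmap(files, "demo-pool", maxThreads = 2)(footerSize)
    println(sizes)
  }
}
```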
1 parent 88e0c7b commit 131ca14

File tree

7 files changed: +142 -56 lines changed


core/src/main/scala/org/apache/spark/rdd/UnionRDD.scala

Lines changed: 8 additions & 9 deletions
@@ -20,12 +20,13 @@ package org.apache.spark.rdd
 import java.io.{IOException, ObjectOutputStream}

 import scala.collection.mutable.ArrayBuffer
-import scala.collection.parallel.ForkJoinTaskSupport
+import scala.concurrent.ExecutionContext
 import scala.concurrent.forkjoin.ForkJoinPool
 import scala.reflect.ClassTag

 import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
 import org.apache.spark.annotation.DeveloperApi
+import org.apache.spark.util.ThreadUtils.parmap
 import org.apache.spark.util.Utils

 /**
@@ -59,8 +60,7 @@ private[spark] class UnionPartition[T: ClassTag](
 }

 object UnionRDD {
-  private[spark] lazy val partitionEvalTaskSupport =
-    new ForkJoinTaskSupport(new ForkJoinPool(8))
+  private[spark] lazy val threadPool = new ForkJoinPool(8)
 }

 @DeveloperApi
@@ -74,14 +74,13 @@ class UnionRDD[T: ClassTag](
     rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

   override def getPartitions: Array[Partition] = {
-    val parRDDs = if (isPartitionListingParallel) {
-      val parArray = rdds.par
-      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
-      parArray
+    val partitionLengths = if (isPartitionListingParallel) {
+      implicit val ec = ExecutionContext.fromExecutor(UnionRDD.threadPool)
+      parmap(rdds)(_.partitions.length)
     } else {
-      rdds
+      rdds.map(_.partitions.length)
     }
-    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
+    val array = new Array[Partition](partitionLengths.sum)
     var pos = 0
     for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
       array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)

core/src/main/scala/org/apache/spark/util/ThreadUtils.scala

Lines changed: 63 additions & 1 deletion
@@ -19,8 +19,12 @@ package org.apache.spark.util

 import java.util.concurrent._

+import scala.collection.TraversableLike
+import scala.collection.generic.CanBuildFrom
+import scala.language.higherKinds
+
 import com.google.common.util.concurrent.{MoreExecutors, ThreadFactoryBuilder}
-import scala.concurrent.{Awaitable, ExecutionContext, ExecutionContextExecutor}
+import scala.concurrent.{Awaitable, ExecutionContext, ExecutionContextExecutor, Future}
 import scala.concurrent.duration.{Duration, FiniteDuration}
 import scala.concurrent.forkjoin.{ForkJoinPool => SForkJoinPool, ForkJoinWorkerThread => SForkJoinWorkerThread}
 import scala.util.control.NonFatal
@@ -254,4 +258,62 @@ private[spark] object ThreadUtils {
       executor.shutdownNow()
     }
   }
+
+  /**
+   * Transforms the input collection by applying the given function to each element in parallel.
+   * Compared to the map() method of Scala parallel collections, this method can be interrupted
+   * at any time. This is useful for canceling task execution, for example.
+   *
+   * @param in - the input collection which should be transformed in parallel.
+   * @param prefix - the prefix assigned to the underlying thread pool.
+   * @param maxThreads - maximum number of threads that can be created during execution.
+   * @param f - the lambda function that will be applied to each element of `in`.
+   * @tparam I - the type of elements in the input collection.
+   * @tparam O - the type of elements in the resulting collection.
+   * @return a new collection in which each element was obtained by applying the lambda
+   *         function `f` to an element of the input collection `in`.
+   */
+  def parmap[I, O, Col[X] <: TraversableLike[X, Col[X]]]
+      (in: Col[I], prefix: String, maxThreads: Int)
+      (f: I => O)
+      (implicit
+        cbf: CanBuildFrom[Col[I], Future[O], Col[Future[O]]], // For in.map
+        cbf2: CanBuildFrom[Col[Future[O]], O, Col[O]] // for Future.sequence
+      ): Col[O] = {
+    val pool = newForkJoinPool(prefix, maxThreads)
+    try {
+      implicit val ec = ExecutionContext.fromExecutor(pool)
+
+      parmap(in)(f)
+    } finally {
+      pool.shutdownNow()
+    }
+  }
+
+  /**
+   * Transforms the input collection by applying the given function to each element in parallel.
+   * Compared to the map() method of Scala parallel collections, this method can be interrupted
+   * at any time. This is useful for canceling task execution, for example.
+   *
+   * @param in - the input collection which should be transformed in parallel.
+   * @param f - the lambda function that will be applied to each element of `in`.
+   * @param ec - an execution context for parallel application of the given function `f`.
+   * @tparam I - the type of elements in the input collection.
+   * @tparam O - the type of elements in the resulting collection.
+   * @return a new collection in which each element was obtained by applying the lambda
+   *         function `f` to an element of the input collection `in`.
+   */
+  def parmap[I, O, Col[X] <: TraversableLike[X, Col[X]]]
+      (in: Col[I])
+      (f: I => O)
+      (implicit
+        cbf: CanBuildFrom[Col[I], Future[O], Col[Future[O]]], // For in.map
+        cbf2: CanBuildFrom[Col[Future[O]], O, Col[O]], // for Future.sequence
+        ec: ExecutionContext
+      ): Col[O] = {
+    val futures = in.map(x => Future(f(x)))
+    val futureSeq = Future.sequence(futures)
+
+    awaitResult(futureSeq, Duration.Inf)
+  }
 }
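A quick usage note, not part of the diff: the single-argument overload picks up its `ExecutionContext` implicitly, which is the pattern the updated call sites (`UnionRDD`, `ddl.scala`, `FileBasedWriteAheadLog`) follow when they manage their own pool. A minimal sketch, with an invented pool name and inputs:

```scala
import scala.concurrent.ExecutionContext

import org.apache.spark.util.ThreadUtils

object ParmapWithCallerPool {
  // Sketch only: "sketch-pool", the pool size, and the inputs are illustrative,
  // and ThreadUtils is private[spark], so this has to live inside Spark code.
  def lengths(words: Seq[String]): Seq[Int] = {
    val pool = ThreadUtils.newForkJoinPool("sketch-pool", 4)
    try {
      // The one-argument parmap takes the ExecutionContext implicitly, so the
      // caller decides which pool the per-element Futures run on.
      implicit val ec = ExecutionContext.fromExecutor(pool)
      ThreadUtils.parmap(words)(_.length)
    } finally {
      // Unlike the three-argument overload, the pool lifecycle is the caller's job.
      pool.shutdownNow()
    }
  }
}
```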

core/src/test/scala/org/apache/spark/util/ThreadUtilsSuite.scala

Lines changed: 33 additions & 0 deletions
@@ -133,4 +133,37 @@ class ThreadUtilsSuite extends SparkFunSuite {
       "stack trace contains unexpected references to ThreadUtils"
     )
   }
+
+  test("parmap should be interruptible") {
+    val t = new Thread() {
+      setDaemon(true)
+
+      override def run() {
+        try {
+          // "par" is uninterruptible. The following will keep running even if the thread is
+          // interrupted. We should prefer to use "ThreadUtils.parmap".
+          //
+          // (1 to 10).par.flatMap { i =>
+          //   Thread.sleep(100000)
+          //   1 to i
+          // }
+          //
+          ThreadUtils.parmap(1 to 10, "test", 2) { i =>
+            Thread.sleep(100000)
+            1 to i
+          }.flatten
+        } catch {
+          case _: InterruptedException => // expected
+        }
+      }
+    }
+    t.start()
+    eventually(timeout(10.seconds)) {
+      assert(t.isAlive)
+    }
+    t.interrupt()
+    eventually(timeout(10.seconds)) {
+      assert(!t.isAlive)
+    }
+  }
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/command/ddl.scala

Lines changed: 17 additions & 17 deletions
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.command
 import java.util.Locale

 import scala.collection.{GenMap, GenSeq}
-import scala.collection.parallel.ForkJoinTaskSupport
+import scala.concurrent.ExecutionContext
 import scala.util.control.NonFatal

 import org.apache.hadoop.conf.Configuration
@@ -29,7 +29,7 @@ import org.apache.hadoop.mapred.{FileInputFormat, JobConf}

 import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
 import org.apache.spark.sql.catalyst.TableIdentifier
-import org.apache.spark.sql.catalyst.analysis.{NoSuchTableException, Resolver}
+import org.apache.spark.sql.catalyst.analysis.Resolver
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
@@ -40,6 +40,7 @@ import org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter
 import org.apache.spark.sql.internal.HiveSerDe
 import org.apache.spark.sql.types._
 import org.apache.spark.util.{SerializableConfiguration, ThreadUtils}
+import org.apache.spark.util.ThreadUtils.parmap

 // Note: The definition of these commands are based on the ones described in
 // https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
@@ -621,8 +622,9 @@ case class AlterTableRecoverPartitionsCommand(
     val evalPool = ThreadUtils.newForkJoinPool("AlterTableRecoverPartitionsCommand", 8)
     val partitionSpecsAndLocs: Seq[(TablePartitionSpec, Path)] =
       try {
+        implicit val ec = ExecutionContext.fromExecutor(evalPool)
         scanPartitions(spark, fs, pathFilter, root, Map(), table.partitionColumnNames, threshold,
-          spark.sessionState.conf.resolver, new ForkJoinTaskSupport(evalPool)).seq
+          spark.sessionState.conf.resolver)
       } finally {
         evalPool.shutdown()
       }
@@ -654,23 +656,13 @@
       spec: TablePartitionSpec,
       partitionNames: Seq[String],
       threshold: Int,
-      resolver: Resolver,
-      evalTaskSupport: ForkJoinTaskSupport): GenSeq[(TablePartitionSpec, Path)] = {
+      resolver: Resolver)(implicit ec: ExecutionContext): Seq[(TablePartitionSpec, Path)] = {
     if (partitionNames.isEmpty) {
       return Seq(spec -> path)
     }

-    val statuses = fs.listStatus(path, filter)
-    val statusPar: GenSeq[FileStatus] =
-      if (partitionNames.length > 1 && statuses.length > threshold || partitionNames.length > 2) {
-        // parallelize the list of partitions here, then we can have better parallelism later.
-        val parArray = statuses.par
-        parArray.tasksupport = evalTaskSupport
-        parArray
-      } else {
-        statuses
-      }
-    statusPar.flatMap { st =>
+    val statuses = fs.listStatus(path, filter).toSeq
+    def handleStatus(st: FileStatus): Seq[(TablePartitionSpec, Path)] = {
       val name = st.getPath.getName
       if (st.isDirectory && name.contains("=")) {
         val ps = name.split("=", 2)
@@ -679,7 +671,7 @@
         val value = ExternalCatalogUtils.unescapePathName(ps(1))
         if (resolver(columnName, partitionNames.head)) {
           scanPartitions(spark, fs, filter, st.getPath, spec ++ Map(partitionNames.head -> value),
-            partitionNames.drop(1), threshold, resolver, evalTaskSupport)
+            partitionNames.drop(1), threshold, resolver)
         } else {
           logWarning(
             s"expected partition column ${partitionNames.head}, but got ${ps(0)}, ignoring it")
@@ -690,6 +682,14 @@
         Seq.empty
       }
     }
+    val result = if (partitionNames.length > 1 &&
+        statuses.length > threshold || partitionNames.length > 2) {
+      parmap(statuses)(handleStatus _)
+    } else {
+      statuses.map(handleStatus)
+    }
+
+    result.flatten
   }

   private def gatherPartitionStats(

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala

Lines changed: 16 additions & 24 deletions
@@ -22,7 +22,6 @@ import java.net.URI

 import scala.collection.JavaConverters._
 import scala.collection.mutable
-import scala.collection.parallel.ForkJoinTaskSupport
 import scala.util.{Failure, Try}

 import org.apache.hadoop.conf.Configuration
@@ -532,30 +531,23 @@ object ParquetFileFormat extends Logging {
       conf: Configuration,
       partFiles: Seq[FileStatus],
       ignoreCorruptFiles: Boolean): Seq[Footer] = {
-    val parFiles = partFiles.par
-    val pool = ThreadUtils.newForkJoinPool("readingParquetFooters", 8)
-    parFiles.tasksupport = new ForkJoinTaskSupport(pool)
-    try {
-      parFiles.flatMap { currentFile =>
-        try {
-          // Skips row group information since we only need the schema.
-          // ParquetFileReader.readFooter throws RuntimeException, instead of IOException,
-          // when it can't read the footer.
-          Some(new Footer(currentFile.getPath(),
-            ParquetFileReader.readFooter(
-              conf, currentFile, SKIP_ROW_GROUPS)))
-        } catch { case e: RuntimeException =>
-          if (ignoreCorruptFiles) {
-            logWarning(s"Skipped the footer in the corrupted file: $currentFile", e)
-            None
-          } else {
-            throw new IOException(s"Could not read footer for file: $currentFile", e)
-          }
+    ThreadUtils.parmap(partFiles, "readingParquetFooters", 8) { currentFile =>
+      try {
+        // Skips row group information since we only need the schema.
+        // ParquetFileReader.readFooter throws RuntimeException, instead of IOException,
+        // when it can't read the footer.
+        Some(new Footer(currentFile.getPath(),
+          ParquetFileReader.readFooter(
+            conf, currentFile, SKIP_ROW_GROUPS)))
+      } catch { case e: RuntimeException =>
+        if (ignoreCorruptFiles) {
+          logWarning(s"Skipped the footer in the corrupted file: $currentFile", e)
+          None
+        } else {
+          throw new IOException(s"Could not read footer for file: $currentFile", e)
         }
-      }.seq
-    } finally {
-      pool.shutdown()
-    }
+      }
+    }.flatten
   }

   /**

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormatSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -51,9 +51,9 @@ class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLCo
     }

     testReadFooters(true)
-    val exception = intercept[java.io.IOException] {
+    val exception = intercept[SparkException] {
       testReadFooters(false)
-    }
+    }.getCause
     assert(exception.getMessage().contains("Could not read footer for file"))
   }
 }
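A note on why the expected exception type changes here: `ThreadUtils.awaitResult`, which `parmap` uses to wait for its futures, rethrows non-fatal failures wrapped in a `SparkException`, so the worker's `IOException` now sits on `getCause`. A minimal standalone sketch of that behavior (illustrative only, and runnable only from Spark-internal code since `ThreadUtils` is `private[spark]`):

```scala
import scala.concurrent.ExecutionContext.Implicits.global

import org.apache.spark.SparkException
import org.apache.spark.util.ThreadUtils

object ParmapFailureSketch {
  def main(args: Array[String]): Unit = {
    try {
      // The lambda fails, so the underlying Future fails with IOException...
      ThreadUtils.parmap(Seq(1, 2, 3))(_ => throw new java.io.IOException("boom"))
    } catch {
      // ...and awaitResult surfaces it to the caller wrapped in a SparkException.
      case e: SparkException => assert(e.getCause.isInstanceOf[java.io.IOException])
    }
  }
}
```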

streaming/src/main/scala/org/apache/spark/streaming/util/FileBasedWriteAheadLog.scala

Lines changed: 3 additions & 3 deletions
@@ -312,10 +312,10 @@ private[streaming] object FileBasedWriteAheadLog {
       handler: I => Iterator[O]): Iterator[O] = {
     val taskSupport = new ExecutionContextTaskSupport(executionContext)
     val groupSize = taskSupport.parallelismLevel.max(8)
+    implicit val ec = executionContext
+
     source.grouped(groupSize).flatMap { group =>
-      val parallelCollection = group.par
-      parallelCollection.tasksupport = taskSupport
-      parallelCollection.map(handler)
+      ThreadUtils.parmap(group)(handler)
     }.flatten
   }
 }
