-
Notifications
You must be signed in to change notification settings - Fork 29k
SPARK-7237 Clean function in several RDD methods #5959
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6c124a9
6846e40
0c8d47e
8b50d93
164d3e4
d92bfcf
c2786df
55d01eb
36feb6c
f6014c0
56d7c92
f83d445
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -717,7 +717,8 @@ abstract class RDD[T: ClassTag]( | |
| def mapPartitionsWithContext[U: ClassTag]( | ||
| f: (TaskContext, Iterator[T]) => Iterator[U], | ||
| preservesPartitioning: Boolean = false): RDD[U] = withScope { | ||
| val func = (context: TaskContext, index: Int, iter: Iterator[T]) => f(context, iter) | ||
| val cleanF = sc.clean(f) | ||
| val func = (context: TaskContext, index: Int, iter: Iterator[T]) => cleanF(context, iter) | ||
| new MapPartitionsRDD(this, sc.clean(func), preservesPartitioning) | ||
| } | ||
|
|
||
|
|
@@ -741,9 +742,11 @@ abstract class RDD[T: ClassTag]( | |
| def mapWith[A, U: ClassTag] | ||
| (constructA: Int => A, preservesPartitioning: Boolean = false) | ||
| (f: (T, A) => U): RDD[U] = withScope { | ||
| val cleanF = sc.clean(f) | ||
| val cleanA = sc.clean(constructA) | ||
| mapPartitionsWithIndex((index, iter) => { | ||
| val a = constructA(index) | ||
| iter.map(t => f(t, a)) | ||
| val a = cleanA(index) | ||
| iter.map(t => cleanF(t, a)) | ||
| }, preservesPartitioning) | ||
| } | ||
|
|
||
|
|
@@ -756,9 +759,11 @@ abstract class RDD[T: ClassTag]( | |
| def flatMapWith[A, U: ClassTag] | ||
| (constructA: Int => A, preservesPartitioning: Boolean = false) | ||
| (f: (T, A) => Seq[U]): RDD[U] = withScope { | ||
| val cleanF = sc.clean(f) | ||
| val cleanA = sc.clean(constructA) | ||
| mapPartitionsWithIndex((index, iter) => { | ||
| val a = constructA(index) | ||
| iter.flatMap(t => f(t, a)) | ||
| val a = cleanA(index) | ||
| iter.flatMap(t => cleanF(t, a)) | ||
| }, preservesPartitioning) | ||
| } | ||
|
|
||
|
|
@@ -769,9 +774,11 @@ abstract class RDD[T: ClassTag]( | |
| */ | ||
| @deprecated("use mapPartitionsWithIndex and foreach", "1.0.0") | ||
| def foreachWith[A](constructA: Int => A)(f: (T, A) => Unit): Unit = withScope { | ||
| val cleanF = sc.clean(f) | ||
| val cleanA = sc.clean(constructA) | ||
| mapPartitionsWithIndex { (index, iter) => | ||
| val a = constructA(index) | ||
| iter.map(t => {f(t, a); t}) | ||
| val a = cleanA(index) | ||
| iter.map(t => {cleanF(t, a); t}) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -782,9 +789,11 @@ abstract class RDD[T: ClassTag]( | |
| */ | ||
| @deprecated("use mapPartitionsWithIndex and filter", "1.0.0") | ||
| def filterWith[A](constructA: Int => A)(p: (T, A) => Boolean): RDD[T] = withScope { | ||
| val cleanP = sc.clean(p) | ||
| val cleanA = sc.clean(constructA) | ||
| mapPartitionsWithIndex((index, iter) => { | ||
| val a = constructA(index) | ||
| iter.filter(t => p(t, a)) | ||
| val a = cleanA(index) | ||
| iter.filter(t => cleanP(t, a)) | ||
| }, preservesPartitioning = true) | ||
| } | ||
|
|
||
|
|
@@ -901,7 +910,8 @@ abstract class RDD[T: ClassTag]( | |
| * Return an RDD that contains all matching values by applying `f`. | ||
| */ | ||
| def collect[U: ClassTag](f: PartialFunction[T, U]): RDD[U] = withScope { | ||
| filter(f.isDefinedAt).map(f) | ||
| val cleanF = sc.clean(f) | ||
| filter(cleanF.isDefinedAt).map(cleanF) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe this is correct, but I'm actually not 100% sure if it's necessary. I think it is because the [referenced code — lost in page extraction]. In any case, I would recommend that we keep this change since in the worst case we clean a closure twice, which is harmless (we have tests for this). |
||
| } | ||
|
|
||
| /** | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
| package org.apache.spark.util | ||
|
|
||
| import java.io.NotSerializableException | ||
| import java.util.Random | ||
|
|
||
| import org.scalatest.FunSuite | ||
|
|
||
|
|
@@ -92,6 +93,11 @@ class ClosureCleanerSuite extends FunSuite { | |
| expectCorrectException { TestUserClosuresActuallyCleaned.testKeyBy(rdd) } | ||
| expectCorrectException { TestUserClosuresActuallyCleaned.testMapPartitions(rdd) } | ||
| expectCorrectException { TestUserClosuresActuallyCleaned.testMapPartitionsWithIndex(rdd) } | ||
| expectCorrectException { TestUserClosuresActuallyCleaned.testMapPartitionsWithContext(rdd) } | ||
| expectCorrectException { TestUserClosuresActuallyCleaned.testFlatMapWith(rdd) } | ||
| expectCorrectException { TestUserClosuresActuallyCleaned.testFilterWith(rdd) } | ||
| expectCorrectException { TestUserClosuresActuallyCleaned.testForEachWith(rdd) } | ||
| expectCorrectException { TestUserClosuresActuallyCleaned.testMapWith(rdd) } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this is missing |
||
| expectCorrectException { TestUserClosuresActuallyCleaned.testZipPartitions2(rdd) } | ||
| expectCorrectException { TestUserClosuresActuallyCleaned.testZipPartitions3(rdd) } | ||
| expectCorrectException { TestUserClosuresActuallyCleaned.testZipPartitions4(rdd) } | ||
|
|
@@ -260,6 +266,21 @@ private object TestUserClosuresActuallyCleaned { | |
| def testMapPartitionsWithIndex(rdd: RDD[Int]): Unit = { | ||
| rdd.mapPartitionsWithIndex { (_, it) => return; it }.count() | ||
| } | ||
| def testFlatMapWith(rdd: RDD[Int]): Unit = { | ||
| rdd.flatMapWith ((index: Int) => new Random(index + 42)){ (_, it) => return; Seq() }.count() | ||
| } | ||
| def testMapWith(rdd: RDD[Int]): Unit = { | ||
| rdd.mapWith ((index: Int) => new Random(index + 42)){ (_, it) => return; 0 }.count() | ||
| } | ||
| def testFilterWith(rdd: RDD[Int]): Unit = { | ||
| rdd.filterWith ((index: Int) => new Random(index + 42)){ (_, it) => return; true }.count() | ||
| } | ||
| def testForEachWith(rdd: RDD[Int]): Unit = { | ||
| rdd.foreachWith ((index: Int) => new Random(index + 42)){ (_, it) => return } | ||
| } | ||
| def testMapPartitionsWithContext(rdd: RDD[Int]): Unit = { | ||
| rdd.mapPartitionsWithContext { (_, it) => return; it }.count() | ||
| } | ||
| def testZipPartitions2(rdd: RDD[Int]): Unit = { | ||
| rdd.zipPartitions(rdd) { case (it1, it2) => return; it1 }.count() | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
my understanding is that mapPartitionsWithIndex cleans, and thus anything in that is cleaned. maybe it's an incorrect assumption. cc @andrewor14?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like Reynold is correct.
I can update the PR for collect() and undo the change for other methods touched.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think collect might've been cleaned in DAGScheduler's runJob. Double check that.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I believe that @andrewor14 added some tests for closure cleaning as part of his recent ClosureCleaner patch; we might check whether that test suite covers these methods.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The test suite does not currently cover these methods because they are deprecated, but maybe we should just add them. (@JoshRosen is referring to
ClosureCleanerSuite.) There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@rxin @ted-yu actually even though
mapPartitionsWithIndex does clean already, it cleans the whole closure but not the ones used in the closure. In this case, I believe it's actually necessary to clean f here since we won't actually clean it from mapPartitionsWithIndex. For the same reason I believe we also need to clean constructA since it's a closure provided by the user.