-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-7826][CORE] Suppress extra calling getCacheLocs. #6352
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
9a80fad
a4d944a
8248386
f87f2ec
b9c835c
6f3125c
d858b59
10b1b22
3d4d036
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -318,7 +318,7 @@ class DAGSchedulerSuite | |
| } | ||
|
|
||
| test("cache location preferences w/ dependency") { | ||
| val baseRdd = new MyRDD(sc, 1, Nil) | ||
| val baseRdd = new MyRDD(sc, 1, Nil).cache() | ||
| val finalRdd = new MyRDD(sc, 1, List(new OneToOneDependency(baseRdd))) | ||
| cacheLocations(baseRdd.id -> 0) = | ||
| Seq(makeBlockManagerId("hostA"), makeBlockManagerId("hostB")) | ||
|
|
@@ -331,7 +331,7 @@ class DAGSchedulerSuite | |
| } | ||
|
|
||
| test("regression test for getCacheLocs") { | ||
| val rdd = new MyRDD(sc, 3, Nil) | ||
| val rdd = new MyRDD(sc, 3, Nil).cache() | ||
| cacheLocations(rdd.id -> 0) = | ||
| Seq(makeBlockManagerId("hostA"), makeBlockManagerId("hostB")) | ||
| cacheLocations(rdd.id -> 1) = | ||
|
|
@@ -342,6 +342,29 @@ class DAGSchedulerSuite | |
| assert(locs === Seq(Seq("hostA", "hostB"), Seq("hostB", "hostC"), Seq("hostC", "hostD"))) | ||
| } | ||
|
|
||
| /** | ||
| * +---+ shuffle +---+ +---+ +---+ | ||
| * | A |<--------| B |<---| C |<---| D | | ||
| * +---+ +---+ +---+ +---+ | ||
| * Here, D has one-to-one dependencies on C. C is derived from A by performing a shuffle | ||
| * and then a map. If we're trying to determine which ancestor stages need to be computed in | ||
| * order to compute D, we need to figure out whether the shuffle A -> B should be performed. | ||
| * If the RDD C, which has only one ancestor via a narrow dependency, is cached, then we won't | ||
| * need to compute A, even if it has some unavailable output partitions. The same goes for B: | ||
| * if B is 100% cached, then we can avoid the shuffle on A. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Josh's comment was an awesome description of how the dependencies should be computed, but isn't quite appropriate here as the comment for the test. What about something like: This test ensures that if a particular RDD is cached, RDDs earlier in the dependency chain are not computed. It constructs the following chain of dependencies: |
||
| */ | ||
| test("SPARK-7826: getMissingParentStages should consider all ancestor RDDs' cache statuses") { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you remove "SPARK-7826" from the name of this test, since the test isn't checking for the bug described by SPARK-7826? It's great to add this test in the PR -- but having the JIRA name in the test name is something we usually only do when the test is for the issue described by that JIRA. |
||
| val rddA = new MyRDD(sc, 1, Nil) | ||
| val rddB = new MyRDD(sc, 1, List(new ShuffleDependency(rddA, null))) | ||
| val rddC = new MyRDD(sc, 1, List(new OneToOneDependency(rddB))).cache() | ||
| val rddD = new MyRDD(sc, 1, List(new OneToOneDependency(rddC))) | ||
| cacheLocations(rddC.id -> 0) = | ||
| Seq(makeBlockManagerId("hostA"), makeBlockManagerId("hostB")) | ||
| submit(rddD, Array(0)) | ||
| assert(scheduler.runningStages.size === 1) | ||
| assert(scheduler.runningStages.head.id === 1) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you actually change this to: And then add a comment saying something like "Make sure that the scheduler is running the final result stage. Because C is cached, the shuffle map stage to compute A does not need to be run."
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (I think this is more intuitive; otherwise, it's hard for someone looking at this to understand why the ID should be 1. This also makes the test more agnostic to unrelated scheduler internals, like if we change the way we assign IDs to stages) |
||
| } | ||
|
|
||
| test("avoid exponential blowup when getting preferred locs list") { | ||
| // Build up a complex dependency graph with repeated zip operations, without preferred locations | ||
| var rdd: RDD[_] = new MyRDD(sc, 1, Nil) | ||
|
|
@@ -678,9 +701,9 @@ class DAGSchedulerSuite | |
| } | ||
|
|
||
| test("cached post-shuffle") { | ||
| val shuffleOneRdd = new MyRDD(sc, 2, Nil) | ||
| val shuffleOneRdd = new MyRDD(sc, 2, Nil).cache() | ||
| val shuffleDepOne = new ShuffleDependency(shuffleOneRdd, null) | ||
| val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne)) | ||
| val shuffleTwoRdd = new MyRDD(sc, 2, List(shuffleDepOne)).cache() | ||
| val shuffleDepTwo = new ShuffleDependency(shuffleTwoRdd, null) | ||
| val finalRdd = new MyRDD(sc, 1, List(shuffleDepTwo)) | ||
| submit(finalRdd, Array(0)) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To clarify for other reviewers, I think that we need these `cache()` calls so these other tests don't fail due to the skipping of the cached locations lookups.