-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-6963][CORE]Flaky test: o.a.s.ContextCleanerSuite automatically cleanup checkpoint #5548
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -224,7 +224,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase { | |
| assert(fs.exists(path)) | ||
|
|
||
| // the checkpoint is not cleaned by default (without the configuration set) | ||
| var postGCTester = new CleanerTester(sc, Seq(rddId), Nil, Nil) | ||
| var postGCTester = new CleanerTester(sc, Seq(rddId), Nil, Nil, Nil) | ||
| rdd = null // Make RDD out of scope | ||
| runGC() | ||
| postGCTester.assertCleanup() | ||
|
|
@@ -245,7 +245,7 @@ class ContextCleanerSuite extends ContextCleanerSuiteBase { | |
| assert(fs.exists(RDDCheckpointData.rddCheckpointDataPath(sc, rddId).get)) | ||
|
|
||
| // Test that GC causes checkpoint data cleanup after dereferencing the RDD | ||
| postGCTester = new CleanerTester(sc, Seq(rddId), Nil, Nil) | ||
| postGCTester = new CleanerTester(sc, Seq(rddId), Nil, Nil, Seq(rddId)) | ||
| rdd = null // Make RDD out of scope | ||
| runGC() | ||
| postGCTester.assertCleanup() | ||
|
|
@@ -406,12 +406,14 @@ class CleanerTester( | |
| sc: SparkContext, | ||
| rddIds: Seq[Int] = Seq.empty, | ||
| shuffleIds: Seq[Int] = Seq.empty, | ||
| broadcastIds: Seq[Long] = Seq.empty) | ||
| broadcastIds: Seq[Long] = Seq.empty, | ||
| checkpointIds: Seq[Long] = Seq.empty) | ||
| extends Logging { | ||
|
|
||
| val toBeCleanedRDDIds = new HashSet[Int] with SynchronizedSet[Int] ++= rddIds | ||
| val toBeCleanedShuffleIds = new HashSet[Int] with SynchronizedSet[Int] ++= shuffleIds | ||
| val toBeCleanedBroadcstIds = new HashSet[Long] with SynchronizedSet[Long] ++= broadcastIds | ||
| val toBeCheckpointIds = new HashSet[Long] with SynchronizedSet[Long] ++= checkpointIds | ||
| val isDistributed = !sc.isLocal | ||
|
|
||
| val cleanerListener = new CleanerListener { | ||
|
|
@@ -427,12 +429,17 @@ class CleanerTester( | |
|
|
||
| def broadcastCleaned(broadcastId: Long): Unit = { | ||
| toBeCleanedBroadcstIds -= broadcastId | ||
| logInfo("Broadcast" + broadcastId + " cleaned") | ||
| logInfo("Broadcast " + broadcastId + " cleaned") | ||
| } | ||
|
|
||
| def accumCleaned(accId: Long): Unit = { | ||
| logInfo("Cleaned accId " + accId + " cleaned") | ||
| } | ||
|
|
||
| def checkpointCleaned(rddId: Long): Unit = { | ||
| toBeCheckpointIds -= rddId | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @srowen When the checkpoint is cleaned,
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Great, that's a good explanation. I see that this makes it effectively wait longer to proceed, by which time some necessary conditions are true, like that checkpoint cleanup has happened. Thank you for that detail, and to the limits of my knowledge, this looks good. |
||
| logInfo("checkpoint " + rddId + " cleaned") | ||
| } | ||
| } | ||
|
|
||
| val MAX_VALIDATION_ATTEMPTS = 10 | ||
|
|
@@ -456,7 +463,8 @@ class CleanerTester( | |
|
|
||
| /** Verify that RDDs, shuffles, etc. occupy resources */ | ||
| private def preCleanupValidate() { | ||
| assert(rddIds.nonEmpty || shuffleIds.nonEmpty || broadcastIds.nonEmpty, "Nothing to cleanup") | ||
| assert(rddIds.nonEmpty || shuffleIds.nonEmpty || broadcastIds.nonEmpty || | ||
| checkpointIds.nonEmpty, "Nothing to cleanup") | ||
|
|
||
| // Verify the RDDs have been persisted and blocks are present | ||
| rddIds.foreach { rddId => | ||
|
|
@@ -547,7 +555,8 @@ class CleanerTester( | |
| private def isAllCleanedUp = | ||
| toBeCleanedRDDIds.isEmpty && | ||
| toBeCleanedShuffleIds.isEmpty && | ||
| toBeCleanedBroadcstIds.isEmpty | ||
| toBeCleanedBroadcstIds.isEmpty && | ||
| toBeCheckpointIds.isEmpty | ||
|
|
||
| private def getRDDBlocks(rddId: Int): Seq[BlockId] = { | ||
| blockManager.master.getMatchingBlockIds( _ match { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@srowen This is just a test code bug.
`new CleanerTester(sc, Seq(rddId), Nil, Nil)` This code only ensures that the RDD is cleaned, but not the checkpoint. The RDD and the checkpoint are almost always cleaned at the same time, but there are exceptions, depending on the GC. The `checkpointCleaned` callback ensures the checkpoint is cleaned. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
How does this fix the assertion that failed? You add a new set of IDs that must now be counted down by a new callback, and you assert it ends up empty, which is good. But the assertion that failed was earlier:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The tips are more confusing.
The JIRA shows the error at ContextCleanerSuite.scala#L252.
The code in #L252 is
`assert(!fs.exists(RDDCheckpointData.rddCheckpointDataPath(sc, rddId).get))` There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah right, it is the following one. Still, how does this affect whether the checkpoint files exist?