-
Notifications
You must be signed in to change notification settings - Fork 29.2k
[SPARK-22713][CORE] ExternalAppendOnlyMap leaks when spilled during iteration #21369
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
1c4a6af
82591e6
72f6386
48224d9
536a769
589b423
d5ee172
1d1ddce
e3c61fd
bc7dc11
807032d
621bd23
a2e78e2
4366eb4
686b4d9
4e44585
0cf8913
9eb5600
11b5bb4
1bfca67
25be99b
855854a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -80,7 +80,10 @@ class ExternalAppendOnlyMap[K, V, C]( | |
| this(createCombiner, mergeValue, mergeCombiners, serializer, blockManager, TaskContext.get()) | ||
| } | ||
|
|
||
| @volatile private var currentMap = new SizeTrackingAppendOnlyMap[K, C] | ||
| /** | ||
| * Exposed for testing | ||
| */ | ||
| @volatile private[collection] var currentMap = new SizeTrackingAppendOnlyMap[K, C] | ||
| private val spilledMaps = new ArrayBuffer[DiskMapIterator] | ||
| private val sparkConf = SparkEnv.get.conf | ||
| private val diskBlockManager = blockManager.diskBlockManager | ||
|
|
@@ -114,7 +117,10 @@ class ExternalAppendOnlyMap[K, V, C]( | |
| private val keyComparator = new HashComparator[K] | ||
| private val ser = serializer.newInstance() | ||
|
|
||
| @volatile private var readingIterator: SpillableIterator = null | ||
| /** | ||
| * Exposed for testing | ||
| */ | ||
| @volatile private[collection] var readingIterator: SpillableIterator = null | ||
|
|
||
| /** | ||
| * Number of files this map has spilled so far. | ||
|
|
@@ -267,7 +273,7 @@ class ExternalAppendOnlyMap[K, V, C]( | |
| */ | ||
| def destructiveIterator(inMemoryIterator: Iterator[(K, C)]): Iterator[(K, C)] = { | ||
| readingIterator = new SpillableIterator(inMemoryIterator) | ||
| readingIterator | ||
| readingIterator.toCompletionIterator | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change the original behavior of which keep compatibility with current code, and do not introduce unnecessary function.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What behavior does it change? Your suggested code does exactly the same, but it is less streamlined and relies on an intermediate value (fortunately it's already a member variable).
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't think this little change will pay a huge influence on
The current fix leads to this, not me. And even if this variable were not a member variable, we could define a temporary local variable. It's not a big deal. |
||
| } | ||
|
|
||
| /** | ||
|
|
@@ -280,8 +286,7 @@ class ExternalAppendOnlyMap[K, V, C]( | |
| "ExternalAppendOnlyMap.iterator is destructive and should only be called once.") | ||
| } | ||
| if (spilledMaps.isEmpty) { | ||
| CompletionIterator[(K, C), Iterator[(K, C)]]( | ||
| destructiveIterator(currentMap.iterator), freeCurrentMap()) | ||
| destructiveIterator(currentMap.iterator) | ||
| } else { | ||
| new ExternalIterator() | ||
| } | ||
|
|
@@ -305,8 +310,8 @@ class ExternalAppendOnlyMap[K, V, C]( | |
|
|
||
| // Input streams are derived both from the in-memory map and spilled maps on disk | ||
| // The in-memory map is sorted in place, while the spilled maps are already in sorted order | ||
| private val sortedMap = CompletionIterator[(K, C), Iterator[(K, C)]](destructiveIterator( | ||
| currentMap.destructiveSortedIterator(keyComparator)), freeCurrentMap()) | ||
| private val sortedMap = destructiveIterator( | ||
| currentMap.destructiveSortedIterator(keyComparator)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These two lines can be merged into one line?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unfortunately no, scala-style enforces a max of 100 chars per line |
||
| private val inputStreams = (Seq(sortedMap) ++ spilledMaps).map(it => it.buffered) | ||
|
|
||
| inputStreams.foreach { it => | ||
|
|
@@ -568,13 +573,14 @@ class ExternalAppendOnlyMap[K, V, C]( | |
| context.addTaskCompletionListener(context => cleanup()) | ||
| } | ||
|
|
||
| private[this] class SpillableIterator(var upstream: Iterator[(K, C)]) | ||
| /** | ||
| * Exposed for testing | ||
| */ | ||
| private[collection] class SpillableIterator(var upstream: Iterator[(K, C)]) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ditto |
||
| extends Iterator[(K, C)] { | ||
|
|
||
| private val SPILL_LOCK = new Object() | ||
|
|
||
| private var nextUpstream: Iterator[(K, C)] = null | ||
|
|
||
| private var cur: (K, C) = readNext() | ||
|
|
||
| private var hasSpilled: Boolean = false | ||
|
|
@@ -585,17 +591,25 @@ class ExternalAppendOnlyMap[K, V, C]( | |
| } else { | ||
| logInfo(s"Task ${context.taskAttemptId} force spilling in-memory map to disk and " + | ||
| s"it will release ${org.apache.spark.util.Utils.bytesToString(getUsed())} memory") | ||
| nextUpstream = spillMemoryIteratorToDisk(upstream) | ||
| val nextUpstream = spillMemoryIteratorToDisk(upstream) | ||
| assert(!upstream.hasNext) | ||
| hasSpilled = true | ||
| upstream = nextUpstream | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does the change means we should reassign
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Basically yes; according to my understanding of the code, this should have happened on the subsequent hasNext/next call. However, according to the analysis in the JIRA, the iterator kept holding this reference. My guess: at this point the entire program started suffering lengthy GC pauses that made it behave as if it were deadlocked, effectively leaving the ref in place (just a guess).
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for fixing this issue. I think the potential solution is to change the
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @JerryLead, I'd appreciate it if you could test this. That said, I'm not sure how long a completed iterator may be 'sitting' before being discarded, so I'm not sure if this is worth fixing, especially using the thorough approach.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cloud-fan , the assumption here is that there are two references to the underlying map: the upstream iterator and the external map itself. as I wrote above, I think there's a potentially more fundamental issue with
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cloud-fan , do you think this is worth doing? I'm referring to the CompletionIterator delaying GC of the sub-iterator and cleanup function (usually a closure referring to a larger collection). |
||
| true | ||
| } | ||
| } | ||
|
|
||
| private def destroy() : Unit = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: no space before |
||
| freeCurrentMap() | ||
| upstream = Iterator.empty | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Safer, class remains usable if for some reason hasNext is called again, and this costs absolutely nothing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
| } | ||
|
|
||
| private[ExternalAppendOnlyMap] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's pretty reasonable to have this method public.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm... the class itself is private (slightly relaxed to package private to ease testing) so I'm not sure what's the benefit in making the method public,
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's weird to see a class private method. I'd suggest just remove |
||
| def toCompletionIterator: CompletionIterator[(K, C), SpillableIterator] = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd prefer private for this method
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
| CompletionIterator[(K, C), SpillableIterator](this, this.destroy ) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: no space before |
||
| } | ||
|
|
||
| def readNext(): (K, C) = SPILL_LOCK.synchronized { | ||
| if (nextUpstream != null) { | ||
| upstream = nextUpstream | ||
| nextUpstream = null | ||
| } | ||
| if (upstream.hasNext) { | ||
| upstream.next() | ||
| } else { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ import org.apache.spark._ | |
| import org.apache.spark.internal.config._ | ||
| import org.apache.spark.io.CompressionCodec | ||
| import org.apache.spark.memory.MemoryTestingUtils | ||
| import org.apache.spark.util.CompletionIterator | ||
|
|
||
| class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { | ||
| import TestUtils.{assertNotSpilled, assertSpilled} | ||
|
|
@@ -414,6 +415,99 @@ class ExternalAppendOnlyMapSuite extends SparkFunSuite with LocalSparkContext { | |
| sc.stop() | ||
| } | ||
|
|
||
| test("spill during iteration") { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand what this test wants to do. But it seems code without this PR could also pass it if everything goes normally. And I know it's a little hard to reflect the change in a unit test. So, I'd prefer to leave some comments to explain the potential memory leak in the source code above.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test was written BEFORE the actual fix, and it did fail up until the fix was in place. I do agree it's a bit clumsy and potential future changes may break the original intention of the test. I've referred to a potential testing approach (currently limited to Scala's source code) which couldn't be (easily) applied to this code base, so I made a best effort to test this. |
||
| val size = 1000 | ||
| val conf = createSparkConf(loadDefaults = true) | ||
| sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) | ||
| val map = createExternalMap[Int] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how about this
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this requires using something like scalatest's eventually, don't you think?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Anyway
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cloud-fan , I've tried the it seems this sneaky ref was generated by the following assertion: @cloud-fan , @gatorsmile , can you please confirm if and how we can import the Scala code? Otherwise, can you think of an alternative approach for testing this?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cloud-fan, how do you suggest to progress with this?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @cloud-fan , can we move on with this?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
|
||
| map.insertAll((0 until size).iterator.map(i => (i / 10, i))) | ||
| assert(map.numSpills == 0, "map was not supposed to spill") | ||
|
|
||
| val it = map.iterator | ||
| assert( it.isInstanceOf[CompletionIterator[_, _]]) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: no space after |
||
| val underlyingIt = map.readingIterator | ||
| assert( underlyingIt != null ) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| val underlyingMapIterator = underlyingIt.upstream | ||
| assert(underlyingMapIterator != null) | ||
| val underlyingMapIteratorClass = underlyingMapIterator.getClass | ||
| assert(underlyingMapIteratorClass.getEnclosingClass == classOf[AppendOnlyMap[_, _]]) | ||
|
|
||
| val underlyingMap = map.currentMap | ||
| assert(underlyingMap != null) | ||
|
|
||
| val first50Keys = for ( _ <- 0 until 50) yield { | ||
| val (k, vs) = it.next | ||
| val sortedVs = vs.sorted | ||
| assert(sortedVs.seq == (0 until 10).map(10 * k + _)) | ||
| k | ||
| } | ||
| assert( map.numSpills == 0 ) | ||
| map.spill(Long.MaxValue, null) | ||
| // these asserts try to show that we're no longer holding references to the underlying map. | ||
| // it'd be nice to use something like | ||
| // https://github.com/scala/scala/blob/2.13.x/test/junit/scala/tools/testing/AssertUtil.scala | ||
| // (lines 69-89) | ||
| assert(map.currentMap == null) | ||
| assert(underlyingIt.upstream ne underlyingMapIterator) | ||
| assert(underlyingIt.upstream.getClass != underlyingMapIteratorClass) | ||
| assert(underlyingIt.upstream.getClass.getEnclosingClass != classOf[AppendOnlyMap[_, _]]) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we want to prove we are no longer holding the reference, why do we check type here?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the underlying map's iterator is an anonymous class, this is the best I could come up with to check if the upstream iterator holds a ref to the underlying map.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we simply check
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. hmm, we can in line 508 but not in this test. in line 508, we can simply compare with Iterator.empty |
||
|
|
||
| val next50Keys = for ( _ <- 0 until 50) yield { | ||
| val (k, vs) = it.next | ||
| val sortedVs = vs.sorted | ||
| assert(sortedVs.seq == (0 until 10).map(10 * k + _)) | ||
| k | ||
| } | ||
| assert(!it.hasNext) | ||
| val keys = (first50Keys ++ next50Keys).sorted | ||
| assert(keys == (0 until 100)) | ||
| } | ||
|
|
||
| test("drop all references to the underlying map once the iterator is exhausted") { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's also put the jira number in the test name.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
| val size = 1000 | ||
| val conf = createSparkConf(loadDefaults = true) | ||
| sc = new SparkContext("local-cluster[1,1,1024]", "test", conf) | ||
| val map = createExternalMap[Int] | ||
|
|
||
| map.insertAll((0 until size).iterator.map(i => (i / 10, i))) | ||
| assert(map.numSpills == 0, "map was not supposed to spill") | ||
|
|
||
| val it = map.iterator | ||
| assert( it.isInstanceOf[CompletionIterator[_, _]]) | ||
| val underlyingIt = map.readingIterator | ||
| assert( underlyingIt != null ) | ||
| val underlyingMapIterator = underlyingIt.upstream | ||
| assert(underlyingMapIterator != null) | ||
| val underlyingMapIteratorClass = underlyingMapIterator.getClass | ||
| assert(underlyingMapIteratorClass.getEnclosingClass == classOf[AppendOnlyMap[_, _]]) | ||
|
|
||
| val underlyingMap = map.currentMap | ||
| assert(underlyingMap != null) | ||
|
|
||
| val keys = it.map{ | ||
| case (k, vs) => | ||
| val sortedVs = vs.sorted | ||
| assert(sortedVs.seq == (0 until 10).map(10 * k + _)) | ||
| k | ||
| } | ||
| .toList | ||
| .sorted | ||
|
|
||
| assert(it.isEmpty) | ||
| assert(keys == (0 until 100)) | ||
|
|
||
| assert( map.numSpills == 0 ) | ||
| // these asserts try to show that we're no longer holding references to the underlying map. | ||
| // it'd be nice to use something like | ||
| // https://github.com/scala/scala/blob/2.13.x/test/junit/scala/tools/testing/AssertUtil.scala | ||
| // (lines 69-89) | ||
| assert(map.currentMap == null) | ||
| assert(underlyingIt.upstream ne underlyingMapIterator) | ||
| assert(underlyingIt.upstream.getClass != underlyingMapIteratorClass) | ||
| assert(underlyingIt.upstream.getClass.getEnclosingClass != classOf[AppendOnlyMap[_, _]]) | ||
| } | ||
|
|
||
| test("external aggregation updates peak execution memory") { | ||
| val spillThreshold = 1000 | ||
| val conf = createSparkConf(loadDefaults = false) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not exposed in the test.