@@ -19,16 +19,20 @@ package org.apache.spark.network.yarn
 import java.io.{DataOutputStream, File, FileOutputStream}
 
 import scala.annotation.tailrec
+import scala.concurrent.duration._
 
-import org.apache.commons.io.FileUtils
+import org.apache.hadoop.fs.Path
 import org.apache.hadoop.yarn.api.records.ApplicationId
 import org.apache.hadoop.yarn.conf.YarnConfiguration
 import org.apache.hadoop.yarn.server.api.{ApplicationInitializationContext, ApplicationTerminationContext}
 import org.scalatest.{BeforeAndAfterEach, Matchers}
+import org.scalatest.concurrent.Eventually._
+import org.scalatest.concurrent.Timeouts
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.network.shuffle.ShuffleTestAccessor
 import org.apache.spark.network.shuffle.protocol.ExecutorShuffleInfo
+import org.apache.spark.util.Utils
 
 class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach {
   private[yarn] var yarnConfig: YarnConfiguration = new YarnConfiguration
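The new `scala.concurrent.duration._` and `Eventually` imports support a polling assertion in the second new test, which waits for the old recovery file to disappear after migration. A minimal sketch of that pattern, using a hypothetical file rather than the suite's real one:

```scala
import java.io.File
import scala.concurrent.duration._
import org.scalatest.concurrent.Eventually._

val movedFile = new File("/tmp/registeredExecutors.ldb") // hypothetical path
// Retries the block until it passes, for up to 10 seconds, polling every 5 ms.
eventually(timeout(10 seconds), interval(5 millis)) {
  assert(!movedFile.exists())
}
```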
@@ -40,15 +44,8 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd
     yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"),
       classOf[YarnShuffleService].getCanonicalName)
     yarnConfig.setInt("spark.shuffle.service.port", 0)
-
-    yarnConfig.get("yarn.nodemanager.local-dirs").split(",").foreach { dir =>
-      val d = new File(dir)
-      if (d.exists()) {
-        FileUtils.deleteDirectory(d)
-      }
-      FileUtils.forceMkdir(d)
-      logInfo(s"creating yarn.nodemanager.local-dirs: $d")
-    }
+    val localDir = Utils.createTempDir()
+    yarnConfig.set("yarn.nodemanager.local-dirs", localDir.getAbsolutePath)
   }
 
   var s1: YarnShuffleService = null
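Replacing the hand-rolled delete-and-recreate of `yarn.nodemanager.local-dirs` with `Utils.createTempDir()` gives every run a fresh local dir that Spark cleans up on JVM shutdown, instead of touching whatever directory the inherited configuration points at. That same setting feeds the fallback recovery path asserted in the first new test; roughly, as a sketch inferred from the test rather than the service's actual code:

```scala
import org.apache.hadoop.fs.Path
import org.apache.hadoop.yarn.conf.YarnConfiguration

// Assumed fallback: when YARN never calls setRecoveryPath, the service
// derives its recovery path from the first configured NM local dir.
def fallbackRecoveryPath(conf: YarnConfiguration): Path =
  new Path(conf.getTrimmedStrings("yarn.nodemanager.local-dirs")(0))
```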
@@ -234,7 +231,89 @@ class YarnShuffleServiceSuite extends SparkFunSuite with Matchers with BeforeAnd
     s3.initializeApplication(app2Data)
     ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver3) should be (Some(shuffleInfo2))
     s3.stop()
+  }
+
+  test("get correct recovery path") {
+    // The recovery path is set from outside the shuffle service here, to simulate
+    // the NM-recovery-enabled scenario, where the recovery path is set by YARN.
+    s1 = new YarnShuffleService
+    val recoveryPath = new Path(Utils.createTempDir().toURI)
+    s1.setRecoveryPath(recoveryPath)
+
+    s1.init(yarnConfig)
+    s1._recoveryPath should be (recoveryPath)
+    s1.stop()
 
+    // The recovery path is set inside the shuffle service; this happens when NM
+    // recovery is not enabled or not available (Hadoop 2.5 and earlier).
+    s2 = new YarnShuffleService
+    s2.init(yarnConfig)
+    s2._recoveryPath should be (
+      new Path(yarnConfig.getTrimmedStrings("yarn.nodemanager.local-dirs")(0)))
+    s2.stop()
   }
 
-}
+  test("moving recovery file from NM local dir to recovery path") {
+    // This tests that when Hadoop is upgraded to 2.5+ and NM recovery is enabled,
+    // the old recovery file is moved to the new path to keep compatibility.
+
+    // Simulate s1 running on an old version of Hadoop in which the recovery file
+    // lives in the NM local dir.
+    s1 = new YarnShuffleService
+    s1.init(yarnConfig)
+    val app1Id = ApplicationId.newInstance(0, 1)
+    val app1Data: ApplicationInitializationContext =
+      new ApplicationInitializationContext("user", app1Id, null)
+    s1.initializeApplication(app1Data)
+    val app2Id = ApplicationId.newInstance(0, 2)
+    val app2Data: ApplicationInitializationContext =
+      new ApplicationInitializationContext("user", app2Id, null)
+    s1.initializeApplication(app2Data)
+
+    val execStateFile = s1.registeredExecutorFile
+    execStateFile should not be (null)
+    val shuffleInfo1 = new ExecutorShuffleInfo(Array("/foo", "/bar"), 3, SORT_MANAGER)
+    val shuffleInfo2 = new ExecutorShuffleInfo(Array("/bippy"), 5, SORT_MANAGER)
+
+    val blockHandler = s1.blockHandler
+    val blockResolver = ShuffleTestAccessor.getBlockResolver(blockHandler)
+    ShuffleTestAccessor.registeredExecutorFile(blockResolver) should be (execStateFile)
+
+    blockResolver.registerExecutor(app1Id.toString, "exec-1", shuffleInfo1)
+    blockResolver.registerExecutor(app2Id.toString, "exec-2", shuffleInfo2)
+    ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", blockResolver) should
+      be (Some(shuffleInfo1))
+    ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", blockResolver) should
+      be (Some(shuffleInfo2))
+
+    assert(execStateFile.exists(), s"$execStateFile did not exist")
+
+    s1.stop()
+
+    // Simulate s2 running on Hadoop 2.5+ with NM recovery enabled.
+    assert(execStateFile.exists())
+    val recoveryPath = new Path(Utils.createTempDir().toURI)
+    s2 = new YarnShuffleService
+    s2.setRecoveryPath(recoveryPath)
+    s2.init(yarnConfig)
+
+    val execStateFile2 = s2.registeredExecutorFile
+    recoveryPath.toString should be (new Path(execStateFile2.getParentFile.toURI).toString)
+    eventually(timeout(10 seconds), interval(5 millis)) {
+      assert(!execStateFile.exists())
+    }
+
+    val handler2 = s2.blockHandler
+    val resolver2 = ShuffleTestAccessor.getBlockResolver(handler2)
+
+    // Now reinitialize only one of the apps, and expect YARN to tell us that app2
+    // was stopped during the restart. Since the recovery file is picked up from
+    // the old path, the previous state should have been preserved.
+    s2.initializeApplication(app1Data)
+    s2.stopApplication(new ApplicationTerminationContext(app2Id))
+    ShuffleTestAccessor.getExecutorInfo(app1Id, "exec-1", resolver2) should be (Some(shuffleInfo1))
+    ShuffleTestAccessor.getExecutorInfo(app2Id, "exec-2", resolver2) should be (None)
+
+    s2.stop()
+  }
+}
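For context, the service-side behavior these two tests pin down can be sketched as follows. This is an illustrative reconstruction: the class, the recovery file name, and the migration mechanics are assumptions based on what the tests assert, not the actual `YarnShuffleService` code.

```scala
import java.io.File
import java.nio.file.Files

import org.apache.hadoop.fs.Path
import org.apache.hadoop.yarn.conf.YarnConfiguration

class RecoveryPathSketch(conf: YarnConfiguration) {
  private val recoveryFileName = "registeredExecutors.ldb" // assumed file name
  var _recoveryPath: Path = null

  // Called by the NM before init when YARN-side recovery is enabled (Hadoop 2.5+).
  def setRecoveryPath(path: Path): Unit = {
    _recoveryPath = path
  }

  def init(): File = {
    val localDirs = conf.getTrimmedStrings("yarn.nodemanager.local-dirs")
    if (_recoveryPath == null) {
      // NM recovery disabled or unavailable (Hadoop 2.5-): fall back to the
      // first NM local dir, as the "get correct recovery path" test expects.
      _recoveryPath = new Path(localDirs(0))
    }
    val recoveryFile = new File(_recoveryPath.toUri.getPath, recoveryFileName)
    // If an older release left the recovery file in the NM local dir, move it
    // into the recovery path so previously registered executors survive the
    // upgrade, as the "moving recovery file" test expects.
    val legacyFile = new File(localDirs(0), recoveryFileName)
    if (legacyFile.exists() && legacyFile.getCanonicalPath != recoveryFile.getCanonicalPath) {
      Files.move(legacyFile.toPath, recoveryFile.toPath)
    }
    recoveryFile
  }
}
```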