Skip to content

Commit 3052f49

Browse files
Marcelo Vanzin authored and squito committed
[SPARK-4705] Handle multiple app attempts event logs, history server.
This change modifies the event logging listener to write the logs for different application attempts to different files. The attempt ID is set by the scheduler backend, so as long as the backend returns that ID to SparkContext, things should work. Currently, the YARN backend does that. The history server was also modified to model multiple attempts per application. Each attempt has its own UI and a separate row in the listing table, so that users can look at all the attempts separately. The UI "adapts" itself to avoid showing attempt-specific info when all the applications being shown have a single attempt. Author: Marcelo Vanzin <[email protected]> Author: twinkle sachdeva <[email protected]> Author: twinkle.sachdeva <[email protected]> Author: twinkle sachdeva <[email protected]> Closes apache#5432 from vanzin/SPARK-4705 and squashes the following commits: 7e289fa [Marcelo Vanzin] Review feedback. f66dcc5 [Marcelo Vanzin] Merge branch 'master' into SPARK-4705 bc885b7 [Marcelo Vanzin] Review feedback. 76a3651 [Marcelo Vanzin] Fix log cleaner, add test. 7c381ec [Marcelo Vanzin] Merge branch 'master' into SPARK-4705 1aa309d [Marcelo Vanzin] Improve sorting of app attempts. 2ad77e7 [Marcelo Vanzin] Missed a reference to the old property name. 9d59d92 [Marcelo Vanzin] Scalastyle... d5a9c37 [Marcelo Vanzin] Update JsonProtocol test, make property name consistent. ba34b69 [Marcelo Vanzin] Use Option[String] for attempt id. f1cb9b3 [Marcelo Vanzin] Merge branch 'master' into SPARK-4705 c14ec19 [Marcelo Vanzin] Merge branch 'master' into SPARK-4705 9092d39 [Marcelo Vanzin] Merge branch 'master' into SPARK-4705 86de638 [Marcelo Vanzin] Merge branch 'master' into SPARK-4705 07446c6 [Marcelo Vanzin] Disable striping for app id / name when multiple attempts exist. 9092af5 [Marcelo Vanzin] Fix HistoryServer test. 3a14503 [Marcelo Vanzin] Argh scalastyle. 657ec18 [Marcelo Vanzin] Fix yarn history URL, app links. c3e0a82 [Marcelo Vanzin] Move app name to app info, more UI fixes. 
ce5ee5d [Marcelo Vanzin] Misc UI, test, style fixes. cbe8bba [Marcelo Vanzin] Attempt ID in listener event should be an option. 88b1de8 [Marcelo Vanzin] Add a test for apps with multiple attempts. 3245aa2 [Marcelo Vanzin] Make app attempts part of the history server model. 5fd5c6f [Marcelo Vanzin] Fix my broken rebase. 318525a [twinkle.sachdeva] SPARK-4705: 1) moved from directory structure to single file, as per the master branch. 2) Added the attempt id inside the SparkListenerApplicationStart, to make the info available independent of directory structure. 3) Changes in History Server to render the UI as per the snapshot II 6b2e521 [twinkle sachdeva] SPARK-4705 Incorporating the review comments regarding formatting, will do the rest of the changes after this 4c1fc26 [twinkle sachdeva] SPARK-4705 Incorporating the review comments regarding formatting, will do the rest of the changes after this 0eb7722 [twinkle sachdeva] SPARK-4705: Doing cherry-pick of fix into master
1 parent 7fe0f3f commit 3052f49

21 files changed

+546
-201
lines changed

core/src/main/scala/org/apache/spark/SparkContext.scala

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
217217
private var _heartbeatReceiver: RpcEndpointRef = _
218218
@volatile private var _dagScheduler: DAGScheduler = _
219219
private var _applicationId: String = _
220+
private var _applicationAttemptId: Option[String] = None
220221
private var _eventLogger: Option[EventLoggingListener] = None
221222
private var _executorAllocationManager: Option[ExecutorAllocationManager] = None
222223
private var _cleaner: Option[ContextCleaner] = None
@@ -315,6 +316,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
315316
}
316317

317318
def applicationId: String = _applicationId
319+
def applicationAttemptId: Option[String] = _applicationAttemptId
318320

319321
def metricsSystem: MetricsSystem = if (_env != null) _env.metricsSystem else null
320322

@@ -472,6 +474,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
472474
_taskScheduler.start()
473475

474476
_applicationId = _taskScheduler.applicationId()
477+
_applicationAttemptId = taskScheduler.applicationAttemptId()
475478
_conf.set("spark.app.id", _applicationId)
476479
_env.blockManager.initialize(_applicationId)
477480

@@ -484,7 +487,8 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
484487
_eventLogger =
485488
if (isEventLogEnabled) {
486489
val logger =
487-
new EventLoggingListener(_applicationId, _eventLogDir.get, _conf, _hadoopConfiguration)
490+
new EventLoggingListener(_applicationId, _applicationAttemptId, _eventLogDir.get,
491+
_conf, _hadoopConfiguration)
488492
logger.start()
489493
listenerBus.addListener(logger)
490494
Some(logger)
@@ -1868,7 +1872,7 @@ class SparkContext(config: SparkConf) extends Logging with ExecutorAllocationCli
18681872
// Note: this code assumes that the task scheduler has been initialized and has contacted
18691873
// the cluster manager to get an application ID (in case the cluster manager provides one).
18701874
listenerBus.post(SparkListenerApplicationStart(appName, Some(applicationId),
1871-
startTime, sparkUser))
1875+
startTime, sparkUser, applicationAttemptId))
18721876
}
18731877

18741878
/** Post the application end event */

core/src/main/scala/org/apache/spark/deploy/history/ApplicationHistoryProvider.scala

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -19,15 +19,19 @@ package org.apache.spark.deploy.history
1919

2020
import org.apache.spark.ui.SparkUI
2121

22-
private[history] case class ApplicationHistoryInfo(
23-
id: String,
24-
name: String,
22+
private[history] case class ApplicationAttemptInfo(
23+
attemptId: Option[String],
2524
startTime: Long,
2625
endTime: Long,
2726
lastUpdated: Long,
2827
sparkUser: String,
2928
completed: Boolean = false)
3029

30+
private[history] case class ApplicationHistoryInfo(
31+
id: String,
32+
name: String,
33+
attempts: List[ApplicationAttemptInfo])
34+
3135
private[history] abstract class ApplicationHistoryProvider {
3236

3337
/**
@@ -41,9 +45,10 @@ private[history] abstract class ApplicationHistoryProvider {
4145
* Returns the Spark UI for a specific application.
4246
*
4347
* @param appId The application ID.
48+
* @param attemptId The application attempt ID (or None if there is no attempt ID).
4449
* @return The application's UI, or None if application is not found.
4550
*/
46-
def getAppUI(appId: String): Option[SparkUI]
51+
def getAppUI(appId: String, attemptId: Option[String]): Option[SparkUI]
4752

4853
/**
4954
* Called when the server is shutting down.

core/src/main/scala/org/apache/spark/deploy/history/FsHistoryProvider.scala

Lines changed: 138 additions & 73 deletions
Original file line number | Diff line number | Diff line change
@@ -32,16 +32,20 @@ import org.apache.spark.deploy.SparkHadoopUtil
3232
import org.apache.spark.io.CompressionCodec
3333
import org.apache.spark.scheduler._
3434
import org.apache.spark.ui.SparkUI
35-
import org.apache.spark.util.{ThreadUtils, Utils}
35+
import org.apache.spark.util.{Clock, SystemClock, ThreadUtils, Utils}
3636
import org.apache.spark.{Logging, SecurityManager, SparkConf}
3737

3838
/**
3939
* A class that provides application history from event logs stored in the file system.
4040
* This provider checks for new finished applications in the background periodically and
4141
* renders the history application UI by parsing the associated event logs.
4242
*/
43-
private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHistoryProvider
44-
with Logging {
43+
private[history] class FsHistoryProvider(conf: SparkConf, clock: Clock)
44+
extends ApplicationHistoryProvider with Logging {
45+
46+
def this(conf: SparkConf) = {
47+
this(conf, new SystemClock())
48+
}
4549

4650
import FsHistoryProvider._
4751

@@ -75,8 +79,8 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
7579
@volatile private var applications: mutable.LinkedHashMap[String, FsApplicationHistoryInfo]
7680
= new mutable.LinkedHashMap()
7781

78-
// List of applications to be deleted by event log cleaner.
79-
private var appsToClean = new mutable.ListBuffer[FsApplicationHistoryInfo]
82+
// List of application logs to be deleted by event log cleaner.
83+
private var attemptsToClean = new mutable.ListBuffer[FsApplicationAttemptInfo]
8084

8185
// Constants used to parse Spark 1.0.0 log directories.
8286
private[history] val LOG_PREFIX = "EVENT_LOG_"
@@ -138,31 +142,33 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
138142

139143
override def getListing(): Iterable[FsApplicationHistoryInfo] = applications.values
140144

141-
override def getAppUI(appId: String): Option[SparkUI] = {
145+
override def getAppUI(appId: String, attemptId: Option[String]): Option[SparkUI] = {
142146
try {
143-
applications.get(appId).map { info =>
144-
val replayBus = new ReplayListenerBus()
145-
val ui = {
146-
val conf = this.conf.clone()
147-
val appSecManager = new SecurityManager(conf)
148-
SparkUI.createHistoryUI(conf, replayBus, appSecManager, appId,
149-
s"${HistoryServer.UI_PATH_PREFIX}/$appId")
150-
// Do not call ui.bind() to avoid creating a new server for each application
151-
}
147+
applications.get(appId).flatMap { appInfo =>
148+
appInfo.attempts.find(_.attemptId == attemptId).map { attempt =>
149+
val replayBus = new ReplayListenerBus()
150+
val ui = {
151+
val conf = this.conf.clone()
152+
val appSecManager = new SecurityManager(conf)
153+
SparkUI.createHistoryUI(conf, replayBus, appSecManager, appId,
154+
HistoryServer.getAttemptURI(appId, attempt.attemptId))
155+
// Do not call ui.bind() to avoid creating a new server for each application
156+
}
152157

153-
val appListener = new ApplicationEventListener()
154-
replayBus.addListener(appListener)
155-
val appInfo = replay(fs.getFileStatus(new Path(logDir, info.logPath)), replayBus)
158+
val appListener = new ApplicationEventListener()
159+
replayBus.addListener(appListener)
160+
val appInfo = replay(fs.getFileStatus(new Path(logDir, attempt.logPath)), replayBus)
156161

157-
ui.setAppName(s"${appInfo.name} ($appId)")
162+
ui.setAppName(s"${appInfo.name} ($appId)")
158163

159-
val uiAclsEnabled = conf.getBoolean("spark.history.ui.acls.enable", false)
160-
ui.getSecurityManager.setAcls(uiAclsEnabled)
161-
// make sure to set admin acls before view acls so they are properly picked up
162-
ui.getSecurityManager.setAdminAcls(appListener.adminAcls.getOrElse(""))
163-
ui.getSecurityManager.setViewAcls(appInfo.sparkUser,
164-
appListener.viewAcls.getOrElse(""))
165-
ui
164+
val uiAclsEnabled = conf.getBoolean("spark.history.ui.acls.enable", false)
165+
ui.getSecurityManager.setAcls(uiAclsEnabled)
166+
// make sure to set admin acls before view acls so they are properly picked up
167+
ui.getSecurityManager.setAdminAcls(appListener.adminAcls.getOrElse(""))
168+
ui.getSecurityManager.setViewAcls(attempt.sparkUser,
169+
appListener.viewAcls.getOrElse(""))
170+
ui
171+
}
166172
}
167173
} catch {
168174
case e: FileNotFoundException => None
@@ -220,7 +226,7 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
220226
*/
221227
private def mergeApplicationListing(logs: Seq[FileStatus]): Unit = {
222228
val bus = new ReplayListenerBus()
223-
val newApps = logs.flatMap { fileStatus =>
229+
val newAttempts = logs.flatMap { fileStatus =>
224230
try {
225231
val res = replay(fileStatus, bus)
226232
logInfo(s"Application log ${res.logPath} loaded successfully.")
@@ -232,76 +238,104 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
232238
e)
233239
None
234240
}
235-
}.toSeq.sortWith(compareAppInfo)
236-
237-
// When there are new logs, merge the new list with the existing one, maintaining
238-
// the expected ordering (descending end time). Maintaining the order is important
239-
// to avoid having to sort the list every time there is a request for the log list.
240-
if (newApps.nonEmpty) {
241-
val mergedApps = new mutable.LinkedHashMap[String, FsApplicationHistoryInfo]()
242-
def addIfAbsent(info: FsApplicationHistoryInfo): Unit = {
243-
if (!mergedApps.contains(info.id) ||
244-
mergedApps(info.id).logPath.endsWith(EventLoggingListener.IN_PROGRESS) &&
245-
!info.logPath.endsWith(EventLoggingListener.IN_PROGRESS)) {
246-
mergedApps += (info.id -> info)
247-
}
248-
}
241+
}
249242

250-
val newIterator = newApps.iterator.buffered
251-
val oldIterator = applications.values.iterator.buffered
252-
while (newIterator.hasNext && oldIterator.hasNext) {
253-
if (compareAppInfo(newIterator.head, oldIterator.head)) {
254-
addIfAbsent(newIterator.next())
255-
} else {
256-
addIfAbsent(oldIterator.next())
243+
if (newAttempts.isEmpty) {
244+
return
245+
}
246+
247+
// Build a map containing all apps that contain new attempts. The app information in this map
248+
// contains both the new app attempt, and those that were already loaded in the existing apps
249+
// map. If an attempt has been updated, it replaces the old attempt in the list.
250+
val newAppMap = new mutable.HashMap[String, FsApplicationHistoryInfo]()
251+
newAttempts.foreach { attempt =>
252+
val appInfo = newAppMap.get(attempt.appId)
253+
.orElse(applications.get(attempt.appId))
254+
.map { app =>
255+
val attempts =
256+
app.attempts.filter(_.attemptId != attempt.attemptId).toList ++ List(attempt)
257+
new FsApplicationHistoryInfo(attempt.appId, attempt.name,
258+
attempts.sortWith(compareAttemptInfo))
257259
}
260+
.getOrElse(new FsApplicationHistoryInfo(attempt.appId, attempt.name, List(attempt)))
261+
newAppMap(attempt.appId) = appInfo
262+
}
263+
264+
// Merge the new app list with the existing one, maintaining the expected ordering (descending
265+
// end time). Maintaining the order is important to avoid having to sort the list every time
266+
// there is a request for the log list.
267+
val newApps = newAppMap.values.toSeq.sortWith(compareAppInfo)
268+
val mergedApps = new mutable.LinkedHashMap[String, FsApplicationHistoryInfo]()
269+
def addIfAbsent(info: FsApplicationHistoryInfo): Unit = {
270+
if (!mergedApps.contains(info.id)) {
271+
mergedApps += (info.id -> info)
258272
}
259-
newIterator.foreach(addIfAbsent)
260-
oldIterator.foreach(addIfAbsent)
273+
}
261274

262-
applications = mergedApps
275+
val newIterator = newApps.iterator.buffered
276+
val oldIterator = applications.values.iterator.buffered
277+
while (newIterator.hasNext && oldIterator.hasNext) {
278+
if (newAppMap.contains(oldIterator.head.id)) {
279+
oldIterator.next()
280+
} else if (compareAppInfo(newIterator.head, oldIterator.head)) {
281+
addIfAbsent(newIterator.next())
282+
} else {
283+
addIfAbsent(oldIterator.next())
284+
}
263285
}
286+
newIterator.foreach(addIfAbsent)
287+
oldIterator.foreach(addIfAbsent)
288+
289+
applications = mergedApps
264290
}
265291

266292
/**
267293
* Delete event logs from the log directory according to the clean policy defined by the user.
268294
*/
269-
private def cleanLogs(): Unit = {
295+
private[history] def cleanLogs(): Unit = {
270296
try {
271297
val maxAge = conf.getTimeAsSeconds("spark.history.fs.cleaner.maxAge", "7d") * 1000
272298

273-
val now = System.currentTimeMillis()
299+
val now = clock.getTimeMillis()
274300
val appsToRetain = new mutable.LinkedHashMap[String, FsApplicationHistoryInfo]()
275301

302+
def shouldClean(attempt: FsApplicationAttemptInfo): Boolean = {
303+
now - attempt.lastUpdated > maxAge && attempt.completed
304+
}
305+
276306
// Scan all logs from the log directory.
277307
// Only completed applications older than the specified max age will be deleted.
278-
applications.values.foreach { info =>
279-
if (now - info.lastUpdated <= maxAge || !info.completed) {
280-
appsToRetain += (info.id -> info)
281-
} else {
282-
appsToClean += info
308+
applications.values.foreach { app =>
309+
val (toClean, toRetain) = app.attempts.partition(shouldClean)
310+
attemptsToClean ++= toClean
311+
312+
if (toClean.isEmpty) {
313+
appsToRetain += (app.id -> app)
314+
} else if (toRetain.nonEmpty) {
315+
appsToRetain += (app.id ->
316+
new FsApplicationHistoryInfo(app.id, app.name, toRetain.toList))
283317
}
284318
}
285319

286320
applications = appsToRetain
287321

288-
val leftToClean = new mutable.ListBuffer[FsApplicationHistoryInfo]
289-
appsToClean.foreach { info =>
322+
val leftToClean = new mutable.ListBuffer[FsApplicationAttemptInfo]
323+
attemptsToClean.foreach { attempt =>
290324
try {
291-
val path = new Path(logDir, info.logPath)
325+
val path = new Path(logDir, attempt.logPath)
292326
if (fs.exists(path)) {
293327
fs.delete(path, true)
294328
}
295329
} catch {
296330
case e: AccessControlException =>
297-
logInfo(s"No permission to delete ${info.logPath}, ignoring.")
331+
logInfo(s"No permission to delete ${attempt.logPath}, ignoring.")
298332
case t: IOException =>
299-
logError(s"IOException in cleaning logs of ${info.logPath}", t)
300-
leftToClean += info
333+
logError(s"IOException in cleaning ${attempt.logPath}", t)
334+
leftToClean += attempt
301335
}
302336
}
303337

304-
appsToClean = leftToClean
338+
attemptsToClean = leftToClean
305339
} catch {
306340
case t: Exception => logError("Exception in cleaning logs", t)
307341
}
@@ -315,14 +349,36 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
315349
private def compareAppInfo(
316350
i1: FsApplicationHistoryInfo,
317351
i2: FsApplicationHistoryInfo): Boolean = {
318-
if (i1.endTime != i2.endTime) i1.endTime >= i2.endTime else i1.startTime >= i2.startTime
352+
val a1 = i1.attempts.head
353+
val a2 = i2.attempts.head
354+
if (a1.endTime != a2.endTime) a1.endTime >= a2.endTime else a1.startTime >= a2.startTime
355+
}
356+
357+
/**
358+
* Comparison function that defines the sort order for application attempts within the same
359+
* application. Order is: running attempts before complete attempts, running attempts sorted
360+
* by start time, completed attempts sorted by end time.
361+
*
362+
* Normally applications should have a single running attempt; but failure to call sc.stop()
363+
* may cause multiple running attempts to show up.
364+
*
365+
* @return Whether `a1` should precede `a2`.
366+
*/
367+
private def compareAttemptInfo(
368+
a1: FsApplicationAttemptInfo,
369+
a2: FsApplicationAttemptInfo): Boolean = {
370+
if (a1.completed == a2.completed) {
371+
if (a1.completed) a1.endTime >= a2.endTime else a1.startTime >= a2.startTime
372+
} else {
373+
!a1.completed
374+
}
319375
}
320376

321377
/**
322378
* Replays the events in the specified log file and returns information about the associated
323379
* application.
324380
*/
325-
private def replay(eventLog: FileStatus, bus: ReplayListenerBus): FsApplicationHistoryInfo = {
381+
private def replay(eventLog: FileStatus, bus: ReplayListenerBus): FsApplicationAttemptInfo = {
326382
val logPath = eventLog.getPath()
327383
logInfo(s"Replaying log path: $logPath")
328384
val logInput =
@@ -336,10 +392,11 @@ private[history] class FsHistoryProvider(conf: SparkConf) extends ApplicationHis
336392
val appCompleted = isApplicationCompleted(eventLog)
337393
bus.addListener(appListener)
338394
bus.replay(logInput, logPath.toString, !appCompleted)
339-
new FsApplicationHistoryInfo(
395+
new FsApplicationAttemptInfo(
340396
logPath.getName(),
341-
appListener.appId.getOrElse(logPath.getName()),
342397
appListener.appName.getOrElse(NOT_STARTED),
398+
appListener.appId.getOrElse(logPath.getName()),
399+
appListener.appAttemptId,
343400
appListener.startTime.getOrElse(-1L),
344401
appListener.endTime.getOrElse(-1L),
345402
getModificationTime(eventLog).get,
@@ -425,13 +482,21 @@ private object FsHistoryProvider {
425482
val DEFAULT_LOG_DIR = "file:/tmp/spark-events"
426483
}
427484

428-
private class FsApplicationHistoryInfo(
485+
private class FsApplicationAttemptInfo(
429486
val logPath: String,
430-
id: String,
431-
name: String,
487+
val name: String,
488+
val appId: String,
489+
attemptId: Option[String],
432490
startTime: Long,
433491
endTime: Long,
434492
lastUpdated: Long,
435493
sparkUser: String,
436494
completed: Boolean = true)
437-
extends ApplicationHistoryInfo(id, name, startTime, endTime, lastUpdated, sparkUser, completed)
495+
extends ApplicationAttemptInfo(
496+
attemptId, startTime, endTime, lastUpdated, sparkUser, completed)
497+
498+
private class FsApplicationHistoryInfo(
499+
id: String,
500+
override val name: String,
501+
override val attempts: List[FsApplicationAttemptInfo])
502+
extends ApplicationHistoryInfo(id, name, attempts)

0 commit comments

Comments
 (0)