Skip to content
Closed
19 changes: 19 additions & 0 deletions core/src/main/scala/org/apache/spark/internal/config/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ package org.apache.spark.internal
import java.util.concurrent.TimeUnit

import org.apache.spark.launcher.SparkLauncher
import org.apache.spark.metrics.GarbageCollectionMetrics
import org.apache.spark.network.util.ByteUnit
import org.apache.spark.scheduler.{EventLoggingListener, SchedulingMode}
import org.apache.spark.storage.{DefaultTopologyMapper, RandomBlockReplicationPolicy}
Expand Down Expand Up @@ -114,6 +115,24 @@ package object config {
.booleanConf
.createWithDefault(false)

private[spark] val EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS =
ConfigBuilder("spark.eventLog.gcMetrics.youngGenerationGarbageCollectors")
.doc("Names of supported young generation garbage collector. A name usually is " +
" the return of GarbageCollectorMXBean.getName. The built-in young generation garbage " +
s"collectors are ${GarbageCollectionMetrics.YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS}")
.stringConf
.toSequence
.createWithDefault(GarbageCollectionMetrics.YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS)

private[spark] val EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS =
ConfigBuilder("spark.eventLog.gcMetrics.oldGenerationGarbageCollectors")
.doc("Names of supported old generation garbage collector. A name usually is " +
"the return of GarbageCollectorMXBean.getName. The built-in old generation garbage " +
s"collectors are ${GarbageCollectionMetrics.OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS}")
.stringConf
.toSequence
.createWithDefault(GarbageCollectionMetrics.OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS)

private[spark] val EVENT_LOG_OVERWRITE =
ConfigBuilder("spark.eventLog.overwrite").booleanConf.createWithDefault(false)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,12 @@ package org.apache.spark.metrics
import java.lang.management.{BufferPoolMXBean, ManagementFactory}
import javax.management.ObjectName

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.apache.spark.SparkEnv
import org.apache.spark.executor.ProcfsMetricsGetter
import org.apache.spark.internal.{config, Logging}
import org.apache.spark.memory.MemoryManager

/**
Expand Down Expand Up @@ -99,6 +102,56 @@ case object ProcessTreeMetrics extends ExecutorMetricType {
}
}

case object GarbageCollectionMetrics extends ExecutorMetricType with Logging {
override val names = Seq(
"MinorGCCount",
"MinorGCTime",
"MajorGCCount",
"MajorGCTime"
)

/* We builtin some common GC collectors which categorized as young generation and old */
private[spark] val YOUNG_GENERATION_BUILTIN_GARBAGE_COLLECTORS = Seq(
"Copy",
"PS Scavenge",
"ParNew",
"G1 Young Generation"
)

private[spark] val OLD_GENERATION_BUILTIN_GARBAGE_COLLECTORS = Seq(
"MarkSweepCompact",
"PS MarkSweep",
"ConcurrentMarkSweep",
"G1 Old Generation"
)

private lazy val youngGenerationGarbageCollector: Seq[String] = {
SparkEnv.get.conf.get(config.EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS)
}

private lazy val oldGenerationGarbageCollector: Seq[String] = {
SparkEnv.get.conf.get(config.EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS)
}

override private[spark] def getMetricValues(memoryManager: MemoryManager): Array[Long] = {
val gcMetrics = new Array[Long](names.length) // minorCount, minorTime, majorCount, majorTime
ManagementFactory.getGarbageCollectorMXBeans.asScala.foreach { mxBean =>
if (youngGenerationGarbageCollector.contains(mxBean.getName)) {
gcMetrics(0) = mxBean.getCollectionCount
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is it ever possible that multiple collectors are considered young generation (or old generation)? If so, you'd want to use += here.

I'm not sure if there is a configuration where that might happen, but I'm also not super familiar with all variants here. @kiszk maybe you know?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK, one collector per generation is supported only.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for being late. I agree with one collector per generation.

gcMetrics(1) = mxBean.getCollectionTime
} else if (oldGenerationGarbageCollector.contains(mxBean.getName)) {
gcMetrics(2) = mxBean.getCollectionCount
gcMetrics(3) = mxBean.getCollectionTime
} else {
logDebug(s"${mxBean.getName} is an unsupported garbage collector." +
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this message shown every time when getMetricValue is called with new parameters (i.e. not contained). I am curious whether the message output cause performance degradation and increase of log size.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kiszk This message is shown only in DEBUG level. In production, I think no performance degradation since the log level is INFO by default. User should know DEBUG level always impact performance more or less. And this isn't a high frequency calling due to heartbeat intervals.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry one more thing -- with SPARK-26329 #23767 this may show up a lot more. Can you update this so the msg only gets printed once? should be pretty simple.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@squito thanks for this useful comment, I've updated.

s"Add it to ${config.EVENT_LOG_GC_METRICS_YOUNG_GENERATION_GARBAGE_COLLECTORS.key} " +
s"or ${config.EVENT_LOG_GC_METRICS_OLD_GENERATION_GARBAGE_COLLECTORS.key} to enable.")
}
}
gcMetrics
}
}

case object OnHeapExecutionMemory extends MemoryManagerExecutorMetricType(
_.onHeapExecutionMemoryUsed)

Expand Down Expand Up @@ -137,7 +190,8 @@ private[spark] object ExecutorMetricType {
OffHeapUnifiedMemory,
DirectPoolMemory,
MappedPoolMemory,
ProcessTreeMetrics
ProcessTreeMetrics,
GarbageCollectionMetrics
)


Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
[ {
"id": "application_1536831636016_59384",
"name": "Spark Pi",
"attempts": [
{
"attemptId": "1",
"startTime": "2019-01-08T04:33:43.607GMT",
"endTime": "2019-01-08T04:33:58.745GMT",
"lastUpdated": "",
"duration": 15138,
"sparkUser": "lajin",
"completed": true,
"appSparkVersion": "3.0.0-SNAPSHOT",
"lastUpdatedEpoch": 0,
"startTimeEpoch": 1546922023607,
"endTimeEpoch": 1546922038745
}
]
}, {
"id" : "application_1538416563558_0014",
"name" : "PythonBisectingKMeansExample",
"attempts" : [ {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,22 @@
[ {
"id": "application_1536831636016_59384",
"name": "Spark Pi",
"attempts": [
{
"attemptId": "1",
"startTime": "2019-01-08T04:33:43.607GMT",
"endTime": "2019-01-08T04:33:58.745GMT",
"lastUpdated": "",
"duration": 15138,
"sparkUser": "lajin",
"completed": true,
"appSparkVersion": "3.0.0-SNAPSHOT",
"lastUpdatedEpoch": 0,
"startTimeEpoch": 1546922023607,
"endTimeEpoch": 1546922038745
}
]
}, {
"id" : "application_1538416563558_0014",
"name" : "PythonBisectingKMeansExample",
"attempts" : [ {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
[ {
"id" : "driver",
"hostPort" : "047.company.com:42509",
"isActive" : true,
"rddBlocks" : 0,
"memoryUsed" : 0,
"diskUsed" : 0,
"totalCores" : 0,
"maxTasks" : 0,
"activeTasks" : 0,
"failedTasks" : 0,
"completedTasks" : 0,
"totalTasks" : 0,
"totalDuration" : 0,
"totalGCTime" : 0,
"totalInputBytes" : 0,
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : false,
"maxMemory" : 100977868,
"addTime" : "2019-01-08T04:33:44.502GMT",
"executorLogs" : {
"stdout" : "https://047.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000001/lajin/stdout?start=-4096",
"stderr" : "https://047.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000001/lajin/stderr?start=-4096"
},
"memoryMetrics" : {
"usedOnHeapStorageMemory" : 0,
"usedOffHeapStorageMemory" : 0,
"totalOnHeapStorageMemory" : 100977868,
"totalOffHeapStorageMemory" : 0
},
"blacklistedInStages" : [ ],
"peakMemoryMetrics" : {
"JVMHeapMemory" : 211171816,
"JVMOffHeapMemory" : 90237256,
"OnHeapExecutionMemory" : 0,
"OffHeapExecutionMemory" : 0,
"OnHeapStorageMemory" : 4876,
"OffHeapStorageMemory" : 0,
"OnHeapUnifiedMemory" : 4876,
"OffHeapUnifiedMemory" : 0,
"DirectPoolMemory" : 806275,
"MappedPoolMemory" : 0,
"ProcessTreeJVMVMemory" : 2646888448,
"ProcessTreeJVMRSSMemory" : 520900608,
"ProcessTreePythonVMemory" : 0,
"ProcessTreePythonRSSMemory" : 0,
"ProcessTreeOtherVMemory" : 0,
"ProcessTreeOtherRSSMemory" : 0,
"MinorGCCount" : 8,
"MinorGCTime" : 374,
"MajorGCCount" : 3,
"MajorGCTime" : 170
},
"attributes" : { }
}, {
"id" : "2",
"hostPort" : "028.company.com:46325",
"isActive" : true,
"rddBlocks" : 0,
"memoryUsed" : 0,
"diskUsed" : 0,
"totalCores" : 4,
"maxTasks" : 4,
"activeTasks" : 0,
"failedTasks" : 0,
"completedTasks" : 52,
"totalTasks" : 52,
"totalDuration" : 8879,
"totalGCTime" : 420,
"totalInputBytes" : 0,
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : false,
"maxMemory" : 97832140,
"addTime" : "2019-01-08T04:33:54.270GMT",
"executorLogs" : {
"stdout" : "https://028.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000003/lajin/stdout?start=-4096",
"stderr" : "https://028.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000003/lajin/stderr?start=-4096"
},
"memoryMetrics" : {
"usedOnHeapStorageMemory" : 0,
"usedOffHeapStorageMemory" : 0,
"totalOnHeapStorageMemory" : 97832140,
"totalOffHeapStorageMemory" : 0
},
"blacklistedInStages" : [ ],
"attributes" : { }
}, {
"id" : "1",
"hostPort" : "036.company.com:35126",
"isActive" : true,
"rddBlocks" : 0,
"memoryUsed" : 0,
"diskUsed" : 0,
"totalCores" : 4,
"maxTasks" : 4,
"activeTasks" : 0,
"failedTasks" : 0,
"completedTasks" : 48,
"totalTasks" : 48,
"totalDuration" : 8837,
"totalGCTime" : 1192,
"totalInputBytes" : 0,
"totalShuffleRead" : 0,
"totalShuffleWrite" : 0,
"isBlacklisted" : false,
"maxMemory" : 97832140,
"addTime" : "2019-01-08T04:33:55.929GMT",
"executorLogs" : {
"stdout" : "https://036.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000002/lajin/stdout?start=-4096",
"stderr" : "https://036.company.com:50060/node/containerlogs/container_e136_1536831636016_59384_01_000002/lajin/stderr?start=-4096"
},
"memoryMetrics" : {
"usedOnHeapStorageMemory" : 0,
"usedOffHeapStorageMemory" : 0,
"totalOnHeapStorageMemory" : 97832140,
"totalOffHeapStorageMemory" : 0
},
"blacklistedInStages" : [ ],
"attributes" : { }
} ]
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,11 @@
"ProcessTreePythonVMemory": 0,
"ProcessTreePythonRSSMemory": 0,
"ProcessTreeOtherVMemory": 0,
"ProcessTreeOtherRSSMemory": 0
"ProcessTreeOtherRSSMemory": 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
}, {
Expand Down Expand Up @@ -193,7 +197,11 @@
"ProcessTreePythonVMemory": 0,
"ProcessTreePythonRSSMemory": 0,
"ProcessTreeOtherVMemory": 0,
"ProcessTreeOtherRSSMemory": 0
"ProcessTreeOtherRSSMemory": 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
}, {
Expand Down Expand Up @@ -244,7 +252,11 @@
"ProcessTreePythonVMemory": 0,
"ProcessTreePythonRSSMemory": 0,
"ProcessTreeOtherVMemory": 0,
"ProcessTreeOtherRSSMemory": 0
"ProcessTreeOtherRSSMemory": 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
}, {
Expand Down Expand Up @@ -295,7 +307,11 @@
"ProcessTreePythonVMemory": 0,
"ProcessTreePythonRSSMemory": 0,
"ProcessTreeOtherVMemory": 0,
"ProcessTreeOtherRSSMemory": 0
"ProcessTreeOtherRSSMemory": 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
}, {
Expand Down Expand Up @@ -346,7 +362,11 @@
"ProcessTreePythonVMemory": 0,
"ProcessTreePythonRSSMemory": 0,
"ProcessTreeOtherVMemory": 0,
"ProcessTreeOtherRSSMemory": 0
"ProcessTreeOtherRSSMemory": 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
} ]
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,11 @@
"ProcessTreePythonVMemory" : 408375296,
"ProcessTreePythonRSSMemory" : 40284160,
"ProcessTreeOtherVMemory" : 0,
"ProcessTreeOtherRSSMemory" : 0
"ProcessTreeOtherRSSMemory" : 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
}, {
Expand Down Expand Up @@ -94,7 +98,11 @@
"ProcessTreePythonVMemory" : 625926144,
"ProcessTreePythonRSSMemory" : 69013504,
"ProcessTreeOtherVMemory" : 0,
"ProcessTreeOtherRSSMemory" : 0
"ProcessTreeOtherRSSMemory" : 0,
"MinorGCCount": 0,
"MinorGCTime": 0,
"MajorGCCount": 0,
"MajorGCTime": 0
},
"attributes" : { }
} ]
Loading