apache · edwinalu · Mar 9, 2018 · Apr 2, 2018 · Apr 22, 2018 · May 15, 2018
diff --git a/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala b/core/src/main/scala/org/apache/spark/HeartbeatReceiver.scala
@@ -22,6 +22,7 @@ import java.util.concurrent.{ScheduledFuture, TimeUnit}
 import scala.collection.mutable
 import scala.concurrent.Future
 
+import org.apache.spark.executor.ExecutorMetrics
 import org.apache.spark.internal.Logging
 import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
 import org.apache.spark.scheduler._
@@ -37,7 +38,8 @@ import org.apache.spark.util._
 private[spark] case class Heartbeat(
     executorId: String,
     accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])], // taskId -> accumulator updates
-    blockManagerId: BlockManagerId)
+    blockManagerId: BlockManagerId,
+    executorUpdates: ExecutorMetrics) // executor level metrics collected for this heartbeat
 
 /**
  * An event that SparkContext uses to notify HeartbeatReceiver that SparkContext.taskScheduler is
@@ -120,14 +122,14 @@ private[spark] class HeartbeatReceiver(sc: SparkContext, clock: Clock)
       context.reply(true)
 
     // Messages received from executors
-    case heartbeat @ Heartbeat(executorId, accumUpdates, blockManagerId) =>
+    case heartbeat @ Heartbeat(executorId, accumUpdates, blockManagerId, executorMetrics) =>
       if (scheduler != null) {
         if (executorLastSeen.contains(executorId)) {
           executorLastSeen(executorId) = clock.getTimeMillis()
           eventLoopThread.submit(new Runnable {
             override def run(): Unit = Utils.tryLogNonFatalError {
               val unknownExecutor = !scheduler.executorHeartbeatReceived(
-                executorId, accumUpdates, blockManagerId)
+                executorId, accumUpdates, blockManagerId, executorMetrics)
               val response = HeartbeatResponse(reregisterBlockManager = unknownExecutor)
               context.reply(response)
             }

diff --git a/core/src/main/scala/org/apache/spark/Heartbeater.scala b/core/src/main/scala/org/apache/spark/Heartbeater.scala
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark
+
+import java.util.concurrent.TimeUnit
+
+import org.apache.spark.util.{ThreadUtils, Utils}
+
+/**
+ * Creates a heartbeat thread which will call the specified reportHeartbeat function at
+ * intervals of intervalMs.
+ *
+ * @param reportHeartbeat the heartbeat reporting function to call.
+ * @param name the thread name for the heartbeater.
+ * @param intervalMs the interval between heartbeats, in milliseconds.
+ */
+private[spark] class Heartbeater(reportHeartbeat: () => Unit, name: String, intervalMs: Long) {
+  // Scheduled executor, created under the given name, on which the heartbeat task runs
+  private val heartbeater = ThreadUtils.newDaemonSingleThreadScheduledExecutor(name)
+
+  /** Schedules reportHeartbeat to run every intervalMs, after a randomized initial delay. */
+  private[spark] def start(): Unit = {
+    // Wait a random interval so the heartbeats don't end up in sync. The jitter is kept as a
+    // Long: narrowing it to Int would clamp it for intervals above Int.MaxValue milliseconds.
+    val initialDelay = intervalMs + (math.random * intervalMs).toLong
+    val heartbeatTask = new Runnable() {
+      override def run(): Unit = Utils.logUncaughtExceptions(reportHeartbeat())
+    }
+    heartbeater.scheduleAtFixedRate(heartbeatTask, initialDelay, intervalMs, TimeUnit.MILLISECONDS)
+  }
+
+  /** Shuts the heartbeat executor down and waits up to 10 seconds for it to terminate. */
+  private[spark] def stop(): Unit = {
+    heartbeater.shutdown()
+    heartbeater.awaitTermination(10, TimeUnit.SECONDS)
+  }
+}
+
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -19,12 +19,13 @@ package org.apache.spark.executor
 
 import java.io.{File, NotSerializableException}
 import java.lang.Thread.UncaughtExceptionHandler
-import java.lang.management.ManagementFactory
+import java.lang.management.{BufferPoolMXBean, ManagementFactory}
 import java.net.{URI, URL}
 import java.nio.ByteBuffer
 import java.util.Properties
 import java.util.concurrent._
 import javax.annotation.concurrent.GuardedBy
+import javax.management.ObjectName
 
 import scala.collection.JavaConverters._
 import scala.collection.mutable.{ArrayBuffer, HashMap, Map}
@@ -36,7 +37,7 @@ import org.apache.spark._
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
-import org.apache.spark.memory.{SparkOutOfMemoryError, TaskMemoryManager}
+import org.apache.spark.memory.{MemoryManager, SparkOutOfMemoryError, TaskMemoryManager}
 import org.apache.spark.rpc.RpcTimeout
 import org.apache.spark.scheduler.{DirectTaskResult, IndirectTaskResult, Task, TaskDescription}
 import org.apache.spark.shuffle.FetchFailedException
@@ -71,6 +72,12 @@ private[spark] class Executor(
 
   private val conf = env.conf
 
+  // BufferPoolMXBean for direct memory
+  private val directBufferPool = Executor.getBufferPool(Executor.DIRECT_BUFFER_POOL_NAME)
+
+  // BufferPoolMXBean for mapped memory
+  private val mappedBufferPool = Executor.getBufferPool(Executor.MAPPED_BUFFER_POOL_NAME)
+
   // No ip or host:port - just hostname
   Utils.checkHost(executorHostname)
   // must not have port specified.
@@ -148,7 +155,8 @@ private[spark] class Executor(
   private val runningTasks = new ConcurrentHashMap[Long, TaskRunner]
 
   // Executor for the heartbeat task.
-  private val heartbeater = ThreadUtils.newDaemonSingleThreadScheduledExecutor("driver-heartbeater")
+  private val heartbeater = new Heartbeater(reportHeartBeat, "executor-heartbeater",
+    conf.getTimeAsMs("spark.executor.heartbeatInterval", "10s"))
 
   // must be initialized before running startDriverHeartbeat()
   private val heartbeatReceiverRef =
@@ -167,7 +175,7 @@ private[spark] class Executor(
    */
   private var heartbeatFailures = 0
 
-  startDriverHeartbeater()
+  heartbeater.start()
 
   private[executor] def numRunningTasks: Int = runningTasks.size()
 
@@ -216,8 +224,7 @@ private[spark] class Executor(
 
   def stop(): Unit = {
     env.metricsSystem.report()
-    heartbeater.shutdown()
-    heartbeater.awaitTermination(10, TimeUnit.SECONDS)
+    heartbeater.stop()
     threadPool.shutdown()
     if (!isLocal) {
       env.stop()
@@ -772,6 +779,10 @@ private[spark] class Executor(
     val accumUpdates = new ArrayBuffer[(Long, Seq[AccumulatorV2[_, _]])]()
     val curGCTime = computeTotalGcTime()
 
+    // get executor level memory metrics
+    val executorUpdates = Executor.getCurrentExecutorMetrics(env.memoryManager,
+      directBufferPool, mappedBufferPool)
+
     for (taskRunner <- runningTasks.values().asScala) {
       if (taskRunner.task != null) {
         taskRunner.task.metrics.mergeShuffleReadMetrics()
@@ -780,7 +791,8 @@ private[spark] class Executor(
       }
     }
 
-    val message = Heartbeat(executorId, accumUpdates.toArray, env.blockManager.blockManagerId)
+    val message = Heartbeat(executorId, accumUpdates.toArray, env.blockManager.blockManagerId,
+      executorUpdates)
     try {
       val response = heartbeatReceiverRef.askSync[HeartbeatResponse](
           message, RpcTimeout(conf, "spark.executor.heartbeatInterval", "10s"))
@@ -800,26 +812,50 @@ private[spark] class Executor(
         }
     }
   }
-
-  /**
-   * Schedules a task to report heartbeat and partial metrics for active tasks to driver.
-   */
-  private def startDriverHeartbeater(): Unit = {
-    val intervalMs = conf.getTimeAsMs("spark.executor.heartbeatInterval", "10s")
-
-    // Wait a random interval so the heartbeats don't end up in sync
-    val initialDelay = intervalMs + (math.random * intervalMs).asInstanceOf[Int]
-
-    val heartbeatTask = new Runnable() {
-      override def run(): Unit = Utils.logUncaughtExceptions(reportHeartBeat())
-    }
-    heartbeater.scheduleAtFixedRate(heartbeatTask, initialDelay, intervalMs, TimeUnit.MILLISECONDS)
-  }
 }
 
 private[spark] object Executor {
   // This is reserved for internal use by components that need to read task properties before a
   // task is fully deserialized. When possible, the TaskContext.getLocalProperty call should be
   // used instead.
   val taskDeserializationProps: ThreadLocal[Properties] = new ThreadLocal[Properties]
+
+  val DIRECT_BUFFER_POOL_NAME = "direct"
+  val MAPPED_BUFFER_POOL_NAME = "mapped"
+
+  /** Builds an MXBean proxy for the platform buffer pool with the given name. */
+  def getBufferPool(pool: String): BufferPoolMXBean = {
+    val beanName = new ObjectName(s"java.nio:type=BufferPool,name=$pool").toString
+    val server = ManagementFactory.getPlatformMBeanServer
+    ManagementFactory.newPlatformMXBeanProxy(server, beanName, classOf[BufferPoolMXBean])
+  }
+
+  /**
+   * Takes a snapshot of the current executor level memory metrics.
+   *
+   * @param memoryManager source of the execution and storage memory usage
+   * @param direct buffer pool bean read for direct memory usage
+   * @param mapped buffer pool bean read for mapped memory usage
+   * @return a new ExecutorMetrics stamped with the current wall-clock time
+   */
+  def getCurrentExecutorMetrics(
+      memoryManager: MemoryManager,
+      direct: BufferPoolMXBean,
+      mapped: BufferPoolMXBean): ExecutorMetrics = {
+    val execOnHeap = memoryManager.onHeapExecutionMemoryUsed
+    val execOffHeap = memoryManager.offHeapExecutionMemoryUsed
+    val storageOnHeap = memoryManager.onHeapStorageMemoryUsed
+    val storageOffHeap = memoryManager.offHeapStorageMemoryUsed
+    val jvmMemory = ManagementFactory.getMemoryMXBean
+    new ExecutorMetrics(
+      System.currentTimeMillis(),
+      jvmMemory.getHeapMemoryUsage.getUsed,
+      jvmMemory.getNonHeapMemoryUsage.getUsed,
+      execOnHeap, execOffHeap,
+      storageOnHeap, storageOffHeap,
+      execOnHeap + storageOnHeap, // on heap unified memory
+      execOffHeap + storageOffHeap, // off heap unified memory
+      direct.getMemoryUsed,
+      mapped.getMemoryUsed)
+  }
 }
diff --git a/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala b/core/src/main/scala/org/apache/spark/executor/ExecutorMetrics.scala
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.executor
+
+import org.apache.spark.annotation.DeveloperApi
+
+/**
+ * :: DeveloperApi ::
+ * Executor level metrics.
+ *
+ * This is sent to the driver periodically (on executor heartbeat), to provide
+ * information about each executor's memory usage at the time of collection.
+ *
+ * @param timestamp the time the metrics were collected (epoch milliseconds), or -1 for
+ *                  Spark history log events which are logged when a stage has completed
+ * @param jvmUsedHeapMemory the amount of JVM used heap memory for the executor
+ * @param jvmUsedNonHeapMemory the amount of JVM used non-heap memory for the executor
+ * @param onHeapExecutionMemory the amount of on heap execution memory used, in bytes
+ * @param offHeapExecutionMemory the amount of off heap execution memory used, in bytes
+ * @param onHeapStorageMemory the amount of on heap storage memory used, in bytes
+ * @param offHeapStorageMemory the amount of off heap storage memory used, in bytes
+ * @param onHeapUnifiedMemory on heap execution memory plus on heap storage memory used
+ * @param offHeapUnifiedMemory off heap execution memory plus off heap storage memory used
+ * @param directMemory the amount of memory used by the JVM direct buffer pool
+ * @param mappedMemory the amount of memory used by the JVM mapped buffer pool
+ */
+@DeveloperApi
+class ExecutorMetrics private[spark] (
+    val timestamp: Long,
+    val jvmUsedHeapMemory: Long,
+    val jvmUsedNonHeapMemory: Long,
+    val onHeapExecutionMemory: Long,
+    val offHeapExecutionMemory: Long,
+    val onHeapStorageMemory: Long,
+    val offHeapStorageMemory: Long,
+    val onHeapUnifiedMemory: Long,
+    val offHeapUnifiedMemory: Long,
+    val directMemory: Long,
+    val mappedMemory: Long) extends Serializable
diff --git a/core/src/main/scala/org/apache/spark/internal/config/package.scala b/core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -69,6 +69,11 @@ package object config {
     .bytesConf(ByteUnit.KiB)
     .createWithDefaultString("100k")
 
+  private[spark] val EVENT_LOG_EXECUTOR_METRICS_UPDATES =
+    ConfigBuilder("spark.eventLog.logExecutorMetricsUpdates.enabled")
+      .booleanConf
+      .createWithDefault(true)
+
   private[spark] val EVENT_LOG_OVERWRITE =
     ConfigBuilder("spark.eventLog.overwrite").booleanConf.createWithDefault(false)
 

diff --git a/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala b/core/src/main/scala/org/apache/spark/memory/MemoryManager.scala
@@ -180,6 +180,26 @@ private[spark] abstract class MemoryManager(
     onHeapStorageMemoryPool.memoryUsed + offHeapStorageMemoryPool.memoryUsed
   }
 
+  /**
+   * Execution memory currently in use in the on heap execution pool, in bytes.
+   */
+  final def onHeapExecutionMemoryUsed: Long = onHeapExecutionMemoryPool.memoryUsed
+
+  /**
+   * Execution memory currently in use in the off heap execution pool, in bytes.
+   */
+  final def offHeapExecutionMemoryUsed: Long = offHeapExecutionMemoryPool.memoryUsed
+
+  /**
+   * Storage memory currently in use in the on heap storage pool, in bytes.
+   */
+  final def onHeapStorageMemoryUsed: Long = onHeapStorageMemoryPool.memoryUsed
+
+  /**
+   * Storage memory currently in use in the off heap storage pool, in bytes.
+   */
+  final def offHeapStorageMemoryUsed: Long = offHeapStorageMemoryPool.memoryUsed
+
   /**
    * Returns the execution memory consumption, in bytes, for the given task.
    */