apache
diff --git a/‎core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala‎
Lines changed: 39 additions & 7 deletions b/‎core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala‎
Lines changed: 39 additions & 7 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala‎
Lines changed: 18 additions & 0 deletions b/‎core/src/main/scala/org/apache/spark/executor/TaskMetrics.scala‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala‎
Lines changed: 52 additions & 3 deletions b/‎core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala‎
Lines changed: 52 additions & 3 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala‎
Lines changed: 6 additions & 1 deletion b/‎core/src/main/scala/org/apache/spark/scheduler/JobLogger.scala‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎core/src/main/scala/org/apache/spark/ui/ToolTips.scala‎
Lines changed: 2 additions & 0 deletions b/‎core/src/main/scala/org/apache/spark/ui/ToolTips.scala‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala‎
Lines changed: 5 additions & 0 deletions b/‎core/src/main/scala/org/apache/spark/ui/exec/ExecutorsTab.scala‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala‎
Lines changed: 3 additions & 0 deletions b/‎core/src/main/scala/org/apache/spark/ui/jobs/ExecutorTable.scala‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala‎
Lines changed: 6 additions & 0 deletions b/‎core/src/main/scala/org/apache/spark/ui/jobs/JobProgressListener.scala‎
Lines changed: 6 additions & 0 deletions
@@ -17,6 +17,7 @@
 
 package org.apache.spark.deploy
 
+import java.lang.reflect.Method
 import java.security.PrivilegedExceptionAction
 
 import org.apache.hadoop.conf.Configuration
@@ -133,14 +134,9 @@ class SparkHadoopUtil extends Logging {
    */
   private[spark] def getFSBytesReadOnThreadCallback(path: Path, conf: Configuration)
     : Option[() => Long] = {
-    val qualifiedPath = path.getFileSystem(conf).makeQualified(path)
-    val scheme = qualifiedPath.toUri().getScheme()
-    val stats = FileSystem.getAllStatistics().filter(_.getScheme().equals(scheme))
     try {
-      val threadStats = stats.map(Utils.invoke(classOf[Statistics], _, "getThreadStatistics"))
-      val statisticsDataClass =
-        Class.forName("org.apache.hadoop.fs.FileSystem$Statistics$StatisticsData")
-      val getBytesReadMethod = statisticsDataClass.getDeclaredMethod("getBytesRead")
+      val threadStats = getFileSystemThreadStatistics(path, conf)
+      val getBytesReadMethod = getFileSystemThreadStatisticsMethod("getBytesRead")
       val f = () => threadStats.map(getBytesReadMethod.invoke(_).asInstanceOf[Long]).sum
       val baselineBytesRead = f()
       Some(() => f() - baselineBytesRead)
@@ -151,6 +147,42 @@ class SparkHadoopUtil extends Logging {
       }
     }
   }
+
+  /**
+   * Returns a function that can be called to find Hadoop FileSystem bytes written. If
+   * getFSBytesWrittenOnThreadCallback is called from thread r at time t, the returned callback will
+   * return the bytes written on r since t.  Reflection is required because thread-level FileSystem
+   * statistics are only available as of Hadoop 2.5 (see HADOOP-10688).
+   * Returns None if the required class or method can't be found.
+   */
+  private[spark] def getFSBytesWrittenOnThreadCallback(path: Path, conf: Configuration)
+    : Option[() => Long] = {
+    try {
+      val threadStats = getFileSystemThreadStatistics(path, conf)
+      val getBytesWrittenMethod = getFileSystemThreadStatisticsMethod("getBytesWritten")
+      val f = () => threadStats.map(getBytesWrittenMethod.invoke(_).asInstanceOf[Long]).sum
+      val baselineBytesWritten = f()
+      Some(() => f() - baselineBytesWritten)
+    } catch {
+      case e @ (_: NoSuchMethodException | _: ClassNotFoundException) => {
+        logDebug("Couldn't find method for retrieving thread-level FileSystem output data", e)
+        None
+      }
+    }
+  }
+
+  /**
+   * Collects one thread-statistics object for every registered FileSystem.Statistics entry whose
+   * scheme equals the scheme of the qualified form of `path`. The "getThreadStatistics" method is
+   * invoked by name through Utils.invoke rather than called directly.
+   */
+  private def getFileSystemThreadStatistics(path: Path, conf: Configuration): Seq[AnyRef] = {
+    val fsScheme = path.getFileSystem(conf).makeQualified(path).toUri().getScheme()
+    FileSystem.getAllStatistics()
+      .filter(fsStats => fsStats.getScheme().equals(fsScheme))
+      .map(fsStats => Utils.invoke(classOf[Statistics], fsStats, "getThreadStatistics"))
+  }
+
+  /**
+   * Looks up the method named `methodName` declared on Hadoop's
+   * FileSystem.Statistics.StatisticsData class. The class is loaded by its binary name, so no
+   * compile-time reference to it is needed.
+   */
+  private def getFileSystemThreadStatisticsMethod(methodName: String): Method = {
+    Class
+      .forName("org.apache.hadoop.fs.FileSystem$Statistics$StatisticsData")
+      .getDeclaredMethod(methodName)
+  }
 }
 
 object SparkHadoopUtil {
 
@@ -82,6 +82,12 @@ class TaskMetrics extends Serializable {
    */
   var inputMetrics: Option[InputMetrics] = None
 
+  /**
+   * If this task writes data externally (e.g. to a distributed filesystem), metrics on how much
+   * data was written are stored here.
+   */
+  var outputMetrics: Option[OutputMetrics] = None
+
   /**
    * If this task reads from shuffle output, metrics on getting shuffle data will be collected here.
    * This includes read metrics aggregated over all the task's shuffle dependencies.
@@ -169,6 +175,18 @@ case class InputMetrics(readMethod: DataReadMethod.Value) {
   var bytesRead: Long = 0L
 }
 
+/**
+ * :: DeveloperApi ::
+ * Metrics about writing output data.
+ */
+@DeveloperApi
+case class OutputMetrics() {
+  /**
+   * Total bytes written. Starts at 0 and is mutable so that it can be reassigned as a write
+   * progresses.
+   */
+  var bytesWritten: Long = 0L
+}
+
 /**
  * :: DeveloperApi ::
  * Metrics pertaining to shuffle data read in a given task.
 
@@ -28,7 +28,7 @@ import scala.reflect.ClassTag
 
 import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus
 import org.apache.hadoop.conf.{Configurable, Configuration}
-import org.apache.hadoop.fs.FileSystem
+import org.apache.hadoop.fs.{Path, FileSystem}
 import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.io.compress.CompressionCodec
 import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf, OutputFormat}
@@ -40,6 +40,7 @@ import org.apache.spark.Partitioner.defaultPartitioner
 import org.apache.spark.SparkContext._
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.deploy.SparkHadoopUtil
+import org.apache.spark.executor.OutputMetrics
 import org.apache.spark.partial.{BoundedDouble, PartialResult}
 import org.apache.spark.serializer.Serializer
 import org.apache.spark.util.Utils
@@ -961,30 +962,52 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     }
 
     val writeShard = (context: TaskContext, iter: Iterator[(K,V)]) => {
+      val config = wrappedConf.value
       // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
       // around by taking a mod. We expect that no task will be attempted 2 billion times.
       val attemptNumber = (context.attemptId % Int.MaxValue).toInt
       /* "reduce task" <split #> <attempt # = spark task #> */
       val attemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = false, context.partitionId,
         attemptNumber)
-      val hadoopContext = newTaskAttemptContext(wrappedConf.value, attemptId)
+      val hadoopContext = newTaskAttemptContext(config, attemptId)
       val format = outfmt.newInstance
       format match {
-        case c: Configurable => c.setConf(wrappedConf.value)
+        case c: Configurable => c.setConf(config)
         case _ => ()
       }
       val committer = format.getOutputCommitter(hadoopContext)
       committer.setupTask(hadoopContext)
+
+      val bytesWrittenCallback = Option(config.get("mapreduce.output.fileoutputformat.outputdir"))
+        .map(new Path(_))
+        .flatMap(SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback(_, config))
+      val outputMetrics = new OutputMetrics()
+      if (bytesWrittenCallback.isDefined) {
+        context.taskMetrics.outputMetrics = Some(outputMetrics)
+      }
+
       val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K,V]]
       try {
+        var recordsSinceMetricsUpdate = 0
         while (iter.hasNext) {
           val pair = iter.next()
           writer.write(pair._1, pair._2)
+
+          // Update bytes written metric every few records
+          if (recordsSinceMetricsUpdate ==
+              PairRDDFunctions.RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES
+              && bytesWrittenCallback.isDefined) {
+            recordsSinceMetricsUpdate = 0
+            bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
+          } else {
+            recordsSinceMetricsUpdate += 1
+          }
         }
       } finally {
         writer.close(hadoopContext)
       }
       committer.commitTask(hadoopContext)
+      bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
       1
     } : Int
 
@@ -1005,6 +1028,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
   def saveAsHadoopDataset(conf: JobConf) {
     // Rename this as hadoopConf internally to avoid shadowing (see SPARK-2038).
     val hadoopConf = conf
+    val wrappedConf = new SerializableWritable(hadoopConf)
     val outputFormatInstance = hadoopConf.getOutputFormat
     val keyClass = hadoopConf.getOutputKeyClass
     val valueClass = hadoopConf.getOutputValueClass
@@ -1032,21 +1056,42 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     writer.preSetup()
 
     val writeToFile = (context: TaskContext, iter: Iterator[(K, V)]) => {
+      val config = wrappedConf.value
       // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
       // around by taking a mod. We expect that no task will be attempted 2 billion times.
       val attemptNumber = (context.attemptId % Int.MaxValue).toInt
 
+      val bytesWrittenCallback = Option(config.get("mapreduce.output.fileoutputformat.outputdir"))
+        .map(new Path(_))
+        .flatMap(SparkHadoopUtil.get.getFSBytesWrittenOnThreadCallback(_, config))
+      val outputMetrics = new OutputMetrics()
+      if (bytesWrittenCallback.isDefined) {
+        context.taskMetrics.outputMetrics = Some(outputMetrics)
+      }
+
       writer.setup(context.stageId, context.partitionId, attemptNumber)
       writer.open()
       try {
+        var recordsSinceMetricsUpdate = 0
         while (iter.hasNext) {
           val record = iter.next()
           writer.write(record._1.asInstanceOf[AnyRef], record._2.asInstanceOf[AnyRef])
+
+          // Update bytes written metric every few records
+          if (recordsSinceMetricsUpdate ==
+              PairRDDFunctions.RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES
+              && bytesWrittenCallback.isDefined) {
+            recordsSinceMetricsUpdate = 0
+            bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
+          } else {
+            recordsSinceMetricsUpdate += 1
+          }
         }
       } finally {
         writer.close()
       }
       writer.commit()
+      bytesWrittenCallback.foreach { fn => outputMetrics.bytesWritten = fn() }
     }
 
     self.context.runJob(self, writeToFile)
@@ -1069,3 +1114,7 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
 
   private[spark] def keyOrdering: Option[Ordering[K]] = Option(ord)
 }
+
+private[spark] object PairRDDFunctions {
+  // Threshold that the per-task record counter in the save loops must reach before the
+  // bytes-written output metric is refreshed and the counter is reset.
+  val RECORDS_BETWEEN_BYTES_WRITTEN_METRIC_UPDATES = 256
+}
@@ -158,6 +158,11 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener
         " INPUT_BYTES=" + metrics.bytesRead
       case None => ""
     }
+    val outputMetrics = taskMetrics.outputMetrics match {
+      case Some(metrics) =>
+        " OUTPUT_BYTES=" + metrics.bytesWritten
+      case None => ""
+    }
     val shuffleReadMetrics = taskMetrics.shuffleReadMetrics match {
       case Some(metrics) =>
         " BLOCK_FETCHED_TOTAL=" + metrics.totalBlocksFetched +
@@ -173,7 +178,7 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener
         " SHUFFLE_WRITE_TIME=" + metrics.shuffleWriteTime
       case None => ""
     }
-    stageLogInfo(stageId, status + info + executorRunTime + gcTime + inputMetrics +
+    stageLogInfo(stageId, status + info + executorRunTime + gcTime + inputMetrics + outputMetrics +
       shuffleReadMetrics + writeMetrics)
   }
 
 
@@ -29,6 +29,8 @@ private[spark] object ToolTips {
 
   val INPUT = "Bytes read from Hadoop or from Spark storage."
 
+  val OUTPUT = "Bytes written to Hadoop."
+
   val SHUFFLE_WRITE = "Bytes written to disk in order to be read by a shuffle in a future stage."
 
   val SHUFFLE_READ =
 
@@ -48,6 +48,7 @@ class ExecutorsListener(storageStatusListener: StorageStatusListener) extends Sp
   val executorToTasksFailed = HashMap[String, Int]()
   val executorToDuration = HashMap[String, Long]()
   val executorToInputBytes = HashMap[String, Long]()
+  val executorToOutputBytes = HashMap[String, Long]()
   val executorToShuffleRead = HashMap[String, Long]()
   val executorToShuffleWrite = HashMap[String, Long]()
 
@@ -78,6 +79,10 @@ class ExecutorsListener(storageStatusListener: StorageStatusListener) extends Sp
           executorToInputBytes(eid) =
             executorToInputBytes.getOrElse(eid, 0L) + inputMetrics.bytesRead
         }
+        metrics.outputMetrics.foreach { outputMetrics =>
+          executorToOutputBytes(eid) =
+            executorToOutputBytes.getOrElse(eid, 0L) + outputMetrics.bytesWritten
+        }
         metrics.shuffleReadMetrics.foreach { shuffleRead =>
           executorToShuffleRead(eid) =
             executorToShuffleRead.getOrElse(eid, 0L) + shuffleRead.remoteBytesRead
 
@@ -45,6 +45,7 @@ private[ui] class ExecutorTable(stageId: Int, stageAttemptId: Int, parent: JobPr
         <th>Failed Tasks</th>
         <th>Succeeded Tasks</th>
         <th><span data-toggle="tooltip" title={ToolTips.INPUT}>Input</span></th>
+        <th><span data-toggle="tooltip" title={ToolTips.OUTPUT}>Output</span></th>
         <th><span data-toggle="tooltip" title={ToolTips.SHUFFLE_READ}>Shuffle Read</span></th>
         <th><span data-toggle="tooltip" title={ToolTips.SHUFFLE_WRITE}>Shuffle Write</span></th>
         <th>Shuffle Spill (Memory)</th>
@@ -77,6 +78,8 @@ private[ui] class ExecutorTable(stageId: Int, stageAttemptId: Int, parent: JobPr
             <td>{v.succeededTasks}</td>
             <td sorttable_customkey={v.inputBytes.toString}>
               {Utils.bytesToString(v.inputBytes)}</td>
+            <td sorttable_customkey={v.outputBytes.toString}>
+              {Utils.bytesToString(v.outputBytes)}</td>
             <td sorttable_customkey={v.shuffleRead.toString}>
               {Utils.bytesToString(v.shuffleRead)}</td>
             <td sorttable_customkey={v.shuffleWrite.toString}>
 
@@ -259,6 +259,12 @@ class JobProgressListener(conf: SparkConf) extends SparkListener with Logging {
     stageData.inputBytes += inputBytesDelta
     execSummary.inputBytes += inputBytesDelta
 
+    val outputBytesDelta =
+      (taskMetrics.outputMetrics.map(_.bytesWritten).getOrElse(0L)
+        - oldMetrics.flatMap(_.outputMetrics).map(_.bytesWritten).getOrElse(0L))
+    stageData.outputBytes += outputBytesDelta
+    execSummary.outputBytes += outputBytesDelta
+
     val diskSpillDelta =
       taskMetrics.diskBytesSpilled - oldMetrics.map(_.diskBytesSpilled).getOrElse(0L)
     stageData.diskBytesSpilled += diskSpillDelta
Original file line number	Diff line number	Diff line change
`@@ -158,6 +158,11 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener`
`158`	`158`	`" INPUT_BYTES=" + metrics.bytesRead`
`159`	`159`	`case None => ""`
`160`	`160`	`}`
	`161`	`+ val outputMetrics = taskMetrics.outputMetrics match {`
	`162`	`+ case Some(metrics) =>`
	`163`	`+ " OUTPUT_BYTES=" + metrics.bytesWritten`
	`164`	`+ case None => ""`
	`165`	`+ }`
`161`	`166`	`val shuffleReadMetrics = taskMetrics.shuffleReadMetrics match {`
`162`	`167`	`case Some(metrics) =>`
`163`	`168`	`" BLOCK_FETCHED_TOTAL=" + metrics.totalBlocksFetched +`
`@@ -173,7 +178,7 @@ class JobLogger(val user: String, val logDirName: String) extends SparkListener`
`173`	`178`	`" SHUFFLE_WRITE_TIME=" + metrics.shuffleWriteTime`
`174`	`179`	`case None => ""`
`175`	`180`	`}`
`176`		`- stageLogInfo(stageId, status + info + executorRunTime + gcTime + inputMetrics +`
	`181`	`+ stageLogInfo(stageId, status + info + executorRunTime + gcTime + inputMetrics + outputMetrics +`
`177`	`182`	`shuffleReadMetrics + writeMetrics)`
`178`	`183`	`}`
`179`	`184`