Example of using named accumulators for custom RDD metrics.

pwendell · pwendell · commit ad85076f621d · 2014-07-06T20:41:51.000+09:00
diff --git a/core/src/main/scala/org/apache/spark/CacheManager.scala b/core/src/main/scala/org/apache/spark/CacheManager.scala
@@ -45,6 +45,9 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
       case Some(blockResult) =>
         // Partition is already materialized, so just return its values
         context.taskMetrics.inputMetrics = Some(blockResult.inputMetrics)
+        if (blockResult.inputMetrics.bytesRead > 0) {
+          rdd.inputBytes += blockResult.inputMetrics.bytesRead
+        }
         new InterruptibleIterator(context, blockResult.data.asInstanceOf[Iterator[T]])
 
       case None =>
diff --git a/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala b/core/src/main/scala/org/apache/spark/rdd/HadoopRDD.scala
@@ -119,6 +119,8 @@ class HadoopRDD[K, V](
       minPartitions)
   }
 
+  val hadoopInputBytes = sc.accumulator(0L, s"rdd-$id.input.bytes.hadoop")(SparkContext.LongAccumulatorParam)
+
   protected val jobConfCacheKey = "rdd_%d_job_conf".format(id)
 
   protected val inputFormatCacheKey = "rdd_%d_input_format".format(id)
@@ -205,6 +207,7 @@ class HadoopRDD[K, V](
          * always at record boundaries, so tasks may need to read into other splits to complete
          * a record. */
         inputMetrics.bytesRead = split.inputSplit.value.getLength()
+        hadoopInputBytes += split.inputSplit.value.getLength()
       } catch {
         case e: java.io.IOException =>
           logWarning("Unable to get input size to set InputMetrics for task", e)
diff --git a/core/src/main/scala/org/apache/spark/rdd/RDD.scala b/core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1270,4 +1270,10 @@ abstract class RDD[T: ClassTag](
   def toJavaRDD() : JavaRDD[T] = {
     new JavaRDD(this)(elementClassTag)
   }
+
+  // =======================================================================
+  // Common metrics
+  // =======================================================================
+  // Input bytes if this RDD was read from persisted data or a filesystem
+  val inputBytes = sc.accumulator(0L, s"rdd-$id.input.bytes.persisted")
 }

Original file line number	Diff line number	Diff line change
`@@ -1270,4 +1270,10 @@ abstract class RDD[T: ClassTag](`
`1270`	`1270`	`def toJavaRDD() : JavaRDD[T] = {`
`1271`	`1271`	`new JavaRDD(this)(elementClassTag)`
`1272`	`1272`	`}`
	`1273`	`+`
	`1274`	`+ // =======================================================================`
	`1275`	`+ // Common metrics`
	`1276`	`+ // =======================================================================`
	`1277`	`+ // Input bytes if this RDD was read from persisted data or a filesystem`
	`1278`	`+ val inputBytes = sc.accumulator(0L, s"rdd-$id.input.bytes.persisted")`
`1273`	`1279`	`}`