[SPARK-30306][CORE][PYTHON][WIP] Instrument Python UDF execution time and throughput metrics using Spark Metrics system #26953
@@ -0,0 +1,101 @@ (new file: PythonMetrics.scala)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.api.python

import java.util.concurrent.atomic.AtomicLong

private[spark] object PythonMetrics {

  // General metrics on JVM-to-Python serialization and deserialization.
  private val toWorkerWriteTime = new AtomicLong(0L)
  private val toWorkerBatchCount = new AtomicLong(0L)
  private val toWorkerBytesWritten = new AtomicLong(0L)
  private val fromWorkerReadTime = new AtomicLong(0L)
  private val fromWorkerBatchCount = new AtomicLong(0L)
  private val fromWorkerBytesRead = new AtomicLong(0L)

  // Metrics specific to Pandas UDF execution.
  private val pandasUDFReadRowCount = new AtomicLong(0L)
  private val pandasUDFWriteRowCount = new AtomicLong(0L)

  def incToWorkerWriteTime(delta: Long): Unit = {
    toWorkerWriteTime.getAndAdd(delta)
  }

  def getToWorkerWriteTime: Long = {
    toWorkerWriteTime.get
  }

  def incToWorkerBytesWritten(delta: Long): Unit = {
    toWorkerBytesWritten.getAndAdd(delta)
  }

  def getToWorkerBytesWritten: Long = {
    toWorkerBytesWritten.get
  }

  def incToWorkerBatchCount(delta: Long): Unit = {
    toWorkerBatchCount.getAndAdd(delta)
  }

  def getToWorkerBatchCount: Long = {
    toWorkerBatchCount.get
  }

  def incFromWorkerReadTime(delta: Long): Unit = {
    fromWorkerReadTime.getAndAdd(delta)
  }

  def getFromWorkerReadTime: Long = {
    fromWorkerReadTime.get
  }

  def incFromWorkerBatchCount(delta: Long): Unit = {
    fromWorkerBatchCount.getAndAdd(delta)
  }

  def getFromWorkerBatchCount: Long = {
    fromWorkerBatchCount.get
  }

  def incFromWorkerBytesRead(delta: Long): Unit = {
    fromWorkerBytesRead.getAndAdd(delta)
  }

  def getFromWorkerBytesRead: Long = {
    fromWorkerBytesRead.get
  }

  // Pandas UDF
  def incPandasUDFReadRowCount(step: Long): Unit = {
    pandasUDFReadRowCount.getAndAdd(step)
  }

  def getPandasUDFReadRowCount: Long = {
    pandasUDFReadRowCount.get
  }

  def incPandasUDFWriteRowCount(step: Long): Unit = {
    pandasUDFWriteRowCount.getAndAdd(step)
  }

  def getPandasUDFWriteRowCount: Long = {
    pandasUDFWriteRowCount.get
  }
}
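
For reference, a minimal sketch of the intended call-site pattern for these counters (the timing wrapper below is illustrative, not part of this file; the actual call sites are in the `BasePythonRunner` changes that follow):

```scala
// Illustrative call site: time one batch written to a Python worker and
// fold the elapsed nanoseconds and batch count into the shared counters.
val startTime = System.nanoTime()
// ... serialize and write one batch to the worker ...
PythonMetrics.incToWorkerWriteTime(System.nanoTime() - startTime)
PythonMetrics.incToWorkerBatchCount(1L)
```
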
@@ -229,6 +229,9 @@ private[spark] abstract class BasePythonRunner[IN, OUT](

    override def run(): Unit = Utils.logUncaughtExceptions {
      try {
        // time instrumentation
        val startTime = System.nanoTime()

        TaskContext.setTaskContext(context)
        val stream = new BufferedOutputStream(worker.getOutputStream, bufferSize)
        val dataOut = new DataOutputStream(stream)

@@ -397,6 +400,12 @@ private[spark] abstract class BasePythonRunner[IN, OUT](

        dataOut.writeInt(SpecialLengths.END_OF_STREAM)
        dataOut.flush()

        val deltaTime = System.nanoTime() - startTime
        val deltaBytes = dataOut.size()
        PythonMetrics.incToWorkerWriteTime(deltaTime)
        PythonMetrics.incToWorkerBytesWritten(deltaBytes)
      } catch {
        case t: Throwable if (NonFatal(t) || t.isInstanceOf[Exception]) =>
          if (context.isCompleted || context.isInterrupted) {

@@ -466,7 +475,10 @@ private[spark] abstract class BasePythonRunner[IN, OUT](

      override def hasNext: Boolean = nextObj != null || {
        if (!eos) {
          val startTime = System.nanoTime()
          nextObj = read()
          val deltaTime = System.nanoTime() - startTime
          PythonMetrics.incFromWorkerReadTime(deltaTime)
          hasNext
        } else {
          false

@@ -477,6 +489,7 @@ private[spark] abstract class BasePythonRunner[IN, OUT](

        if (hasNext) {
          val obj = nextObj
          nextObj = null.asInstanceOf[OUT]
          PythonMetrics.incFromWorkerBatchCount(1L)
          obj
        } else {
          Iterator.empty.next()

@@ -642,6 +655,7 @@ private[spark] class PythonRunner(funcs: Seq[ChainedPythonFunctions])

        case length if length > 0 =>
          val obj = new Array[Byte](length)
          stream.readFully(obj)
          PythonMetrics.incFromWorkerBytesRead(length)
          obj
        case 0 => Array.emptyByteArray
        case SpecialLengths.TIMING_DATA =>
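
Since the counters are cumulative, throughput itself is left to the consumer to derive; a small sketch of the arithmetic, assuming the getters above (variable names are illustrative):

```scala
// Illustrative derivation: average JVM-to-Python serialization throughput
// from the cumulative byte counter and the cumulative time counter (ns).
val bytes = PythonMetrics.getToWorkerBytesWritten
val nanos = PythonMetrics.getToWorkerWriteTime
// bytes/ns * 1e9 = bytes/s; divide by 1e6 for MB/s.
val mbPerSec = if (nanos > 0) bytes.toDouble * 1e9 / nanos / 1e6 else 0.0
```
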
|
@@ -0,0 +1,79 @@ (new file: PythonMetricsSource.scala)

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.metrics.source

import com.codahale.metrics.{Gauge, MetricRegistry}

import org.apache.spark.api.python.PythonMetrics

private[spark] class PythonMetricsSource extends Source {

  override val metricRegistry = new MetricRegistry()
  override val sourceName = "PythonMetrics"

  // This instruments the time spent writing/sending serialized data to Python workers.
  // Includes operations for MapPartition, PythonUDF and PandasUDF.
  // Time is measured in nanoseconds.
  metricRegistry.register(MetricRegistry.name("WriteTimeToWorkers"), new Gauge[Long] {
    override def getValue: Long = PythonMetrics.getToWorkerWriteTime
  })

  // This instruments the number of data batches sent to Python workers.
  // Includes operations for MapPartition, PythonUDF and PandasUDF.
  metricRegistry.register(MetricRegistry.name("NumBatchesToWorkers"), new Gauge[Long] {
    override def getValue: Long = PythonMetrics.getToWorkerBatchCount
  })

  // This instruments the number of bytes sent to Python workers.
  // Includes operations for MapPartition, PythonUDF and PandasUDF.
  metricRegistry.register(MetricRegistry.name("BytesSentToWorkers"), new Gauge[Long] {
    override def getValue: Long = PythonMetrics.getToWorkerBytesWritten
  })

  // This instruments the number of bytes received from Python workers.
  // Includes operations for MapPartition, PythonUDF and PandasUDF.
  metricRegistry.register(MetricRegistry.name("BytesReceivedFromWorkers"), new Gauge[Long] {
    override def getValue: Long = PythonMetrics.getFromWorkerBytesRead
  })

  // This instruments the time spent reading/receiving data back from Python workers.
  // Includes read operations for MapPartition, PythonUDF and PandasUDF.
  // Time is measured in nanoseconds.
  metricRegistry.register(MetricRegistry.name("FetchResultsTimeFromWorkers"), new Gauge[Long] {
    override def getValue: Long = PythonMetrics.getFromWorkerReadTime
  })

  // This instruments the number of data batches received back from Python workers.
  // Includes operations for MapPartition, PythonUDF and PandasUDF.
  metricRegistry.register(MetricRegistry.name("NumBatchesFromWorkers"), new Gauge[Long] {
    override def getValue: Long = PythonMetrics.getFromWorkerBatchCount
  })

  // This instruments the number of rows received back from Python workers,
  // for Pandas UDF operations.
  metricRegistry.register(MetricRegistry.name("PandasUDFReceivedNumRows"), new Gauge[Long] {
    override def getValue: Long = PythonMetrics.getPandasUDFReadRowCount
  })

  // This instruments the number of rows sent to Python workers,
  // for Pandas UDF operations.
  metricRegistry.register(MetricRegistry.name("PandasUDFSentNumRows"), new Gauge[Long] {
    override def getValue: Long = PythonMetrics.getPandasUDFWriteRowCount
  })
}
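
For context, a Dropwizard `Source` like this only becomes visible once it is registered with the executor's `MetricsSystem`. The exact hook used by this PR is not shown in the diff, so treat the call site below as an assumption; a minimal sketch:

```scala
import org.apache.spark.SparkEnv
import org.apache.spark.metrics.source.PythonMetricsSource

// Hypothetical registration point (Spark-internal API): once the source is
// registered, all configured sinks start polling the gauges defined above.
SparkEnv.get.metricsSystem.registerSource(new PythonMetricsSource)
```
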
So, these metrics are cumulative across all JVM-to-Python exchanges and Python UDFs in a single Spark app?
Correct. The metric values are cumulative, measured and reported per executor, in the same spirit as other metrics in the metrics system such as `cpuTime.count`, `runTime.count`, etc.
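
For example, once the source is registered, the gauges can be exported through any sink configured in `metrics.properties`; a sketch using the standard `CsvSink` (the directory and poll period are illustrative):

```properties
# Illustrative sink configuration: poll all sources every 10 seconds and
# write per-metric CSV files, which will include the PythonMetrics gauges.
*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink
*.sink.csv.period=10
*.sink.csv.unit=seconds
*.sink.csv.directory=/tmp/spark-metrics
```
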