Skip to content

Commit

Permalink
Add GIL contention metric to Prometheus (#7651)
Browse files Browse the repository at this point in the history
Co-authored-by: Gabe Joseph <[email protected]>
Co-authored-by: Hendrik Makait <[email protected]>
  • Loading branch information
3 people authored Apr 11, 2023
1 parent 61d7881 commit e1944ec
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 14 deletions.
2 changes: 1 addition & 1 deletion continuous_integration/environment-3.10.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,4 @@ dependencies:
- git+https://github.com/dask/zict
- git+https://github.com/fsspec/filesystem_spec
- keras
- gilknocker>=0.3.0
- gilknocker>=0.4.0
2 changes: 1 addition & 1 deletion continuous_integration/environment-3.11.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,4 @@ dependencies:
- git+https://github.com/dask/zict
- git+https://github.com/fsspec/filesystem_spec
- keras
- gilknocker>=0.3.0
- gilknocker>=0.4.0
2 changes: 1 addition & 1 deletion continuous_integration/environment-3.8.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,4 +48,4 @@ dependencies:
- git+https://github.com/dask/dask
- git+https://github.com/jcrist/crick # Only tested here
- keras
- gilknocker>=0.3.0
- gilknocker>=0.4.0
2 changes: 1 addition & 1 deletion continuous_integration/environment-3.9.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ dependencies:
- pip:
- git+https://github.com/dask/dask
- keras
- gilknocker>=0.3.0
- gilknocker>=0.4.0
7 changes: 7 additions & 0 deletions distributed/http/scheduler/prometheus/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ def collect(self) -> Iterator[GaugeMetricFamily | CounterMetricFamily]:
)
yield worker_states

if self.server.monitor.monitor_gil_contention:
yield CounterMetricFamily(
self.build_name("gil_contention"),
"GIL contention metric",
value=self.server.monitor._cumulative_gil_contention,
)

tasks = GaugeMetricFamily(
self.build_name("tasks"),
"Number of tasks known by scheduler",
Expand Down
35 changes: 25 additions & 10 deletions distributed/http/tests/test_core.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import pathlib
from unittest import mock

import pytest
from tornado.httpclient import AsyncHTTPClient
Expand All @@ -16,13 +17,26 @@ async def test_scheduler(c, s, a, b):
assert response.code == 200


@gen_cluster(client=True, nthreads=[("", 1)])
async def test_prometheus_api_doc(c, s, a):
@mock.patch("warnings.warn", return_value=None)
@gen_cluster(
client=True,
nthreads=[("", 1)],
config={"distributed.admin.system-monitor.gil.enabled": True},
)
async def test_prometheus_api_doc(c, s, a, _):
"""Test that the Sphinx documentation of Prometheus endpoints matches the
implementation.
"""
pytest.importorskip("prometheus_client")

documented = set()
root_dir = pathlib.Path(__file__).parent.parent.parent.parent
with open(root_dir / "docs" / "source" / "prometheus.rst") as fh:
for row in fh:
row = row.strip()
if row.startswith("dask_"):
documented.add(row)

# Some metrics only appear if there are tasks on the cluster
fut = c.submit(inc, 1)
await fut
Expand Down Expand Up @@ -53,14 +67,15 @@ async def test_prometheus_api_doc(c, s, a):
"dask_worker_transfer_bandwidth_median_bytes",
}

implemented = scheduler_metrics | worker_metrics | crick_metrics
try:
import gilknocker # noqa: F401

documented = set()
root_dir = pathlib.Path(__file__).parent.parent.parent.parent
with open(root_dir / "docs" / "source" / "prometheus.rst") as fh:
for row in fh:
row = row.strip()
if row.startswith("dask_"):
documented.add(row)
gil_metrics = set() # Already in worker_metrics
except ImportError:
gil_metrics = {
"dask_scheduler_gil_contention_total",
"dask_worker_gil_contention_total",
}

implemented = scheduler_metrics | worker_metrics | crick_metrics | gil_metrics
assert documented == implemented
7 changes: 7 additions & 0 deletions distributed/http/worker/prometheus/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ def collect(self) -> Iterator[Metric]:
value=ws.transfer_incoming_count,
)

if self.server.monitor.monitor_gil_contention:
yield CounterMetricFamily(
self.build_name("gil_contention"),
"GIL contention metric",
value=self.server.monitor._cumulative_gil_contention,
)

yield GaugeMetricFamily(
self.build_name("threads"),
"Number of worker threads",
Expand Down
4 changes: 4 additions & 0 deletions distributed/system_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ class SystemMonitor:
_last_host_cpu_counters: Any # dynamically-defined psutil namedtuple
_last_gil_contention: float # 0-1 value

_cumulative_gil_contention: float

gpu_name: str | None
gpu_memory_total: int

Expand Down Expand Up @@ -108,6 +110,7 @@ def __init__(
self.monitor_gil_contention = False
else:
self.quantities["gil_contention"] = deque(maxlen=maxlen)
self._cumulative_gil_contention = 0.0
raw_interval = dask.config.get(
"distributed.admin.system-monitor.gil.interval",
)
Expand Down Expand Up @@ -191,6 +194,7 @@ def update(self) -> dict[str, Any]:

if self.monitor_gil_contention:
self._last_gil_contention = self._gilknocker.contention_metric
self._cumulative_gil_contention += self._last_gil_contention
result["gil_contention"] = self._last_gil_contention
self._gilknocker.reset_contention_metric()

Expand Down
18 changes: 18 additions & 0 deletions docs/source/prometheus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@ dask_scheduler_clients
Number of clients connected
dask_scheduler_desired_workers
Number of workers scheduler needs for task graph
dask_scheduler_gil_contention_total
Value representing cumulative total of GIL contention,
in the form of summed percentages.

.. note::
Requires ``gilknocker`` to be installed, and
``distributed.admin.system-monitor.gil.enabled``
configuration to be set.

dask_scheduler_workers
Number of workers known by scheduler
dask_scheduler_tasks
Expand Down Expand Up @@ -117,6 +126,15 @@ dask_worker_tasks
Number of tasks at worker
dask_worker_threads
Number of worker threads
dask_worker_gil_contention_total
Value representing cumulative total GIL contention on worker,
in the form of summed percentages.

.. note::
Requires ``gilknocker`` to be installed, and
``distributed.admin.system-monitor.gil.enabled``
configuration to be set.

dask_worker_latency_seconds
Latency of worker connection
dask_worker_memory_bytes
Expand Down

0 comments on commit e1944ec

Please sign in to comment.