diff --git a/continuous_integration/environment-3.10.yaml b/continuous_integration/environment-3.10.yaml index aa5b9dc67d4..48172dd6c3d 100644 --- a/continuous_integration/environment-3.10.yaml +++ b/continuous_integration/environment-3.10.yaml @@ -47,4 +47,4 @@ dependencies: - git+https://github.com/dask/zict - git+https://github.com/fsspec/filesystem_spec - keras - - gilknocker>=0.3.0 + - gilknocker>=0.4.0 diff --git a/continuous_integration/environment-3.11.yaml b/continuous_integration/environment-3.11.yaml index 5ec7e6b92dd..379f4bba494 100644 --- a/continuous_integration/environment-3.11.yaml +++ b/continuous_integration/environment-3.11.yaml @@ -47,4 +47,4 @@ dependencies: - git+https://github.com/dask/zict - git+https://github.com/fsspec/filesystem_spec - keras - - gilknocker>=0.3.0 + - gilknocker>=0.4.0 diff --git a/continuous_integration/environment-3.8.yaml b/continuous_integration/environment-3.8.yaml index 62b303a5a35..5a41cef53e3 100644 --- a/continuous_integration/environment-3.8.yaml +++ b/continuous_integration/environment-3.8.yaml @@ -48,4 +48,4 @@ dependencies: - git+https://github.com/dask/dask - git+https://github.com/jcrist/crick # Only tested here - keras - - gilknocker>=0.3.0 + - gilknocker>=0.4.0 diff --git a/continuous_integration/environment-3.9.yaml b/continuous_integration/environment-3.9.yaml index f3040ba735d..a43f24cdbae 100644 --- a/continuous_integration/environment-3.9.yaml +++ b/continuous_integration/environment-3.9.yaml @@ -50,4 +50,4 @@ dependencies: - pip: - git+https://github.com/dask/dask - keras - - gilknocker>=0.3.0 + - gilknocker>=0.4.0 diff --git a/distributed/http/scheduler/prometheus/core.py b/distributed/http/scheduler/prometheus/core.py index 5f2e78faad6..660675636e6 100644 --- a/distributed/http/scheduler/prometheus/core.py +++ b/distributed/http/scheduler/prometheus/core.py @@ -52,6 +52,13 @@ def collect(self) -> Iterator[GaugeMetricFamily | CounterMetricFamily]: ) yield worker_states + if 
self.server.monitor.monitor_gil_contention: + yield CounterMetricFamily( + self.build_name("gil_contention"), + "GIL contention metric", + value=self.server.monitor._cumulative_gil_contention, + ) + tasks = GaugeMetricFamily( self.build_name("tasks"), "Number of tasks known by scheduler", diff --git a/distributed/http/tests/test_core.py b/distributed/http/tests/test_core.py index 4cf85bba33a..7f43d581156 100644 --- a/distributed/http/tests/test_core.py +++ b/distributed/http/tests/test_core.py @@ -1,6 +1,7 @@ from __future__ import annotations import pathlib +from unittest import mock import pytest from tornado.httpclient import AsyncHTTPClient @@ -16,13 +17,26 @@ async def test_scheduler(c, s, a, b): assert response.code == 200 -@gen_cluster(client=True, nthreads=[("", 1)]) -async def test_prometheus_api_doc(c, s, a): +@mock.patch("warnings.warn", return_value=None) +@gen_cluster( + client=True, + nthreads=[("", 1)], + config={"distributed.admin.system-monitor.gil.enabled": True}, +) +async def test_prometheus_api_doc(c, s, a, _): """Test that the Sphinx documentation of Prometheus endpoints matches the implementation. 
""" pytest.importorskip("prometheus_client") + documented = set() + root_dir = pathlib.Path(__file__).parent.parent.parent.parent + with open(root_dir / "docs" / "source" / "prometheus.rst") as fh: + for row in fh: + row = row.strip() + if row.startswith("dask_"): + documented.add(row) + # Some metrics only appear if there are tasks on the cluster fut = c.submit(inc, 1) await fut @@ -53,14 +67,15 @@ async def test_prometheus_api_doc(c, s, a): "dask_worker_transfer_bandwidth_median_bytes", } - implemented = scheduler_metrics | worker_metrics | crick_metrics + try: + import gilknocker # noqa: F401 - documented = set() - root_dir = pathlib.Path(__file__).parent.parent.parent.parent - with open(root_dir / "docs" / "source" / "prometheus.rst") as fh: - for row in fh: - row = row.strip() - if row.startswith("dask_"): - documented.add(row) + gil_metrics = set() # Already in worker_metrics + except ImportError: + gil_metrics = { + "dask_scheduler_gil_contention_total", + "dask_worker_gil_contention_total", + } + implemented = scheduler_metrics | worker_metrics | crick_metrics | gil_metrics assert documented == implemented diff --git a/distributed/http/worker/prometheus/core.py b/distributed/http/worker/prometheus/core.py index b8a60775bf3..23aef2d9f61 100644 --- a/distributed/http/worker/prometheus/core.py +++ b/distributed/http/worker/prometheus/core.py @@ -59,6 +59,13 @@ def collect(self) -> Iterator[Metric]: value=ws.transfer_incoming_count, ) + if self.server.monitor.monitor_gil_contention: + yield CounterMetricFamily( + self.build_name("gil_contention"), + "GIL contention metric", + value=self.server.monitor._cumulative_gil_contention, + ) + yield GaugeMetricFamily( self.build_name("threads"), "Number of worker threads", diff --git a/distributed/system_monitor.py b/distributed/system_monitor.py index 94c9a012e17..85bdae0d423 100644 --- a/distributed/system_monitor.py +++ b/distributed/system_monitor.py @@ -30,6 +30,8 @@ class SystemMonitor: _last_host_cpu_counters: 
Any # dynamically-defined psutil namedtuple _last_gil_contention: float # 0-1 value + _cumulative_gil_contention: float + gpu_name: str | None gpu_memory_total: int @@ -108,6 +110,7 @@ def __init__( self.monitor_gil_contention = False else: self.quantities["gil_contention"] = deque(maxlen=maxlen) + self._cumulative_gil_contention = 0.0 raw_interval = dask.config.get( "distributed.admin.system-monitor.gil.interval", ) @@ -191,6 +194,7 @@ def update(self) -> dict[str, Any]: if self.monitor_gil_contention: self._last_gil_contention = self._gilknocker.contention_metric + self._cumulative_gil_contention += self._last_gil_contention result["gil_contention"] = self._last_gil_contention self._gilknocker.reset_contention_metric() diff --git a/docs/source/prometheus.rst b/docs/source/prometheus.rst index d2707c2503c..3c9225e737c 100644 --- a/docs/source/prometheus.rst +++ b/docs/source/prometheus.rst @@ -26,6 +26,15 @@ dask_scheduler_clients Number of clients connected dask_scheduler_desired_workers Number of workers scheduler needs for task graph +dask_scheduler_gil_contention_total + Cumulative total of GIL contention on the scheduler, summed over + system-monitor updates, each contributing a value between 0 and 1. + + .. note:: + Requires ``gilknocker`` to be installed, and + ``distributed.admin.system-monitor.gil.enabled`` + configuration to be set to ``True``. + dask_scheduler_workers Number of workers known by scheduler dask_scheduler_tasks @@ -117,6 +126,15 @@ dask_worker_tasks Number of tasks at worker dask_worker_threads Number of worker threads +dask_worker_gil_contention_total + Cumulative total of GIL contention on the worker, summed over + system-monitor updates, each contributing a value between 0 and 1. + + .. note:: + Requires ``gilknocker`` to be installed, and + ``distributed.admin.system-monitor.gil.enabled`` + configuration to be set to ``True``. + dask_worker_latency_seconds Latency of worker connection dask_worker_memory_bytes