From 1090c116191e01fe2dfaf7c110978f184bbb7a97 Mon Sep 17 00:00:00 2001
From: Jan Hilgard
Date: Mon, 16 Feb 2026 09:30:27 +0100
Subject: [PATCH 1/2] Fix Metal resource leak under high concurrency (#91)

Addresses a Metal buffer leak where batch.tokens grows via mx.concatenate()
each generation step without evaluation, causing computation graph nodes to
hold AGXAllocation handles indefinitely.

Changes:
- Add mx.async_eval(*batch.tokens) after each generation step to eagerly
  evaluate accumulated token concatenations and release Metal buffers
- Make cache clear interval adaptive: scales inversely with active sequence
  count (min interval 8) to prevent Metal resource handle exhaustion under
  high-concurrency workloads
- Add explicit mx.eval(*tokens) during periodic cache clear to collapse any
  remaining lazy concatenation chains before clearing

Co-Authored-By: Claude Opus 4.6
---
 vllm_mlx/scheduler.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/vllm_mlx/scheduler.py b/vllm_mlx/scheduler.py
index 233b31092..fce99c5eb 100644
--- a/vllm_mlx/scheduler.py
+++ b/vllm_mlx/scheduler.py
@@ -200,6 +200,10 @@ def _generation_step(self=batch_gen):
                 batch.tokens,
             )
             mx.async_eval(batch.y, batch.logprobs)
+            # Evaluate accumulated tokens to prevent Metal buffer buildup
+            # from lazy mx.concatenate() chains holding AGXAllocation handles
+            if batch.tokens:
+                mx.async_eval(*batch.tokens)
             y = y.tolist()
 
             self._stats.generation_time += _time.perf_counter() - tic_gen
@@ -2232,9 +2236,24 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
         old_finished = self.finished_req_ids
         self.finished_req_ids = set()
 
-        # Periodically clear Metal cache to prevent memory accumulation
+        # Adaptive interval: scale inversely with concurrency to prevent
+        # Metal resource handle exhaustion under high-concurrency workloads.
+        active_seqs = len(self.running)
+        effective_interval = max(
+            8, self._clear_cache_interval // max(1, active_seqs // 8)
+        )
+
         self._step_count += 1
-        if self._step_count % self._clear_cache_interval == 0:
+        if self._step_count % effective_interval == 0:
+            # Evaluate batch tokens to collapse lazy concatenation chains
+            if (
+                self.batch_generator is not None
+                and self.batch_generator.active_batch is not None
+                and hasattr(self.batch_generator.active_batch, "tokens")
+            ):
+                tokens = self.batch_generator.active_batch.tokens
+                if tokens:
+                    mx.eval(*tokens)
             mx.clear_cache()
 
         # Periodically log memory stats for monitoring

From 30c5ad47471739ed1359c0ccae68d236f7f080f5 Mon Sep 17 00:00:00 2001
From: Wayner Barrios
Date: Thu, 12 Mar 2026 15:25:38 -0500
Subject: [PATCH 2/2] fix: remove redundant async_eval and derive cache
 interval floor from config

---
 vllm_mlx/scheduler.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/vllm_mlx/scheduler.py b/vllm_mlx/scheduler.py
index fce99c5eb..88d144cb7 100644
--- a/vllm_mlx/scheduler.py
+++ b/vllm_mlx/scheduler.py
@@ -200,10 +200,6 @@ def _generation_step(self=batch_gen):
                 batch.tokens,
             )
             mx.async_eval(batch.y, batch.logprobs)
-            # Evaluate accumulated tokens to prevent Metal buffer buildup
-            # from lazy mx.concatenate() chains holding AGXAllocation handles
-            if batch.tokens:
-                mx.async_eval(*batch.tokens)
             y = y.tolist()
 
             self._stats.generation_time += _time.perf_counter() - tic_gen
@@ -2239,8 +2235,9 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
         # Adaptive interval: scale inversely with concurrency to prevent
         # Metal resource handle exhaustion under high-concurrency workloads.
         active_seqs = len(self.running)
+        min_interval = max(4, self._clear_cache_interval // 4)
         effective_interval = max(
-            8, self._clear_cache_interval // max(1, active_seqs // 8)
+            min_interval, self._clear_cache_interval // max(1, active_seqs // 8)
         )
 
         self._step_count += 1