From 1090c116191e01fe2dfaf7c110978f184bbb7a97 Mon Sep 17 00:00:00 2001
From: Jan Hilgard
Date: Mon, 16 Feb 2026 09:30:27 +0100
Subject: [PATCH 1/2] Fix Metal resource leak under high concurrency (#91)

Addresses a Metal buffer leak where batch.tokens grows via mx.concatenate()
each generation step without evaluation, causing computation graph nodes to
hold AGXAllocation handles indefinitely.

Changes:
- Add mx.async_eval(*batch.tokens) after each generation step to eagerly
  evaluate accumulated token concatenations and release Metal buffers
- Make cache clear interval adaptive: scales inversely with active sequence
  count (min interval 8) to prevent Metal resource handle exhaustion under
  high-concurrency workloads
- Add explicit mx.eval(*tokens) during periodic cache clear to collapse any
  remaining lazy concatenation chains before clearing

Co-Authored-By: Claude Opus 4.6
---
 vllm_mlx/scheduler.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/vllm_mlx/scheduler.py b/vllm_mlx/scheduler.py
index 233b31092..fce99c5eb 100644
--- a/vllm_mlx/scheduler.py
+++ b/vllm_mlx/scheduler.py
@@ -200,6 +200,10 @@ def _generation_step(self=batch_gen):
                 batch.tokens,
             )
             mx.async_eval(batch.y, batch.logprobs)
+            # Evaluate accumulated tokens to prevent Metal buffer buildup
+            # from lazy mx.concatenate() chains holding AGXAllocation handles
+            if batch.tokens:
+                mx.async_eval(*batch.tokens)
             y = y.tolist()
 
             self._stats.generation_time += _time.perf_counter() - tic_gen
@@ -2232,9 +2236,24 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
         old_finished = self.finished_req_ids
         self.finished_req_ids = set()
 
-        # Periodically clear Metal cache to prevent memory accumulation
+        # Adaptive interval: scale inversely with concurrency to prevent
+        # Metal resource handle exhaustion under high-concurrency workloads.
+        active_seqs = len(self.running)
+        effective_interval = max(
+            8, self._clear_cache_interval // max(1, active_seqs // 8)
+        )
+
         self._step_count += 1
-        if self._step_count % self._clear_cache_interval == 0:
+        if self._step_count % effective_interval == 0:
+            # Evaluate batch tokens to collapse lazy concatenation chains
+            if (
+                self.batch_generator is not None
+                and self.batch_generator.active_batch is not None
+                and hasattr(self.batch_generator.active_batch, "tokens")
+            ):
+                tokens = self.batch_generator.active_batch.tokens
+                if tokens:
+                    mx.eval(*tokens)
             mx.clear_cache()
 
         # Periodically log memory stats for monitoring

From 30c5ad47471739ed1359c0ccae68d236f7f080f5 Mon Sep 17 00:00:00 2001
From: Wayner Barrios
Date: Thu, 12 Mar 2026 15:25:38 -0500
Subject: [PATCH 2/2] fix: remove redundant async_eval and derive cache
 interval floor from config

---
 vllm_mlx/scheduler.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/vllm_mlx/scheduler.py b/vllm_mlx/scheduler.py
index fce99c5eb..88d144cb7 100644
--- a/vllm_mlx/scheduler.py
+++ b/vllm_mlx/scheduler.py
@@ -200,10 +200,6 @@ def _generation_step(self=batch_gen):
                 batch.tokens,
             )
             mx.async_eval(batch.y, batch.logprobs)
-            # Evaluate accumulated tokens to prevent Metal buffer buildup
-            # from lazy mx.concatenate() chains holding AGXAllocation handles
-            if batch.tokens:
-                mx.async_eval(*batch.tokens)
             y = y.tolist()
 
             self._stats.generation_time += _time.perf_counter() - tic_gen
@@ -2239,8 +2235,9 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
         # Adaptive interval: scale inversely with concurrency to prevent
         # Metal resource handle exhaustion under high-concurrency workloads.
         active_seqs = len(self.running)
+        min_interval = max(4, self._clear_cache_interval // 4)
         effective_interval = max(
-            8, self._clear_cache_interval // max(1, active_seqs // 8)
+            min_interval, self._clear_cache_interval // max(1, active_seqs // 8)
         )
 
         self._step_count += 1