waybarrios · waybarrios · Mar 12, 2026 · Feb 16, 2026 · Mar 12, 2026
diff --git a/vllm_mlx/scheduler.py b/vllm_mlx/scheduler.py
@@ -2232,9 +2232,25 @@ def step(self, max_retries: int = 1) -> SchedulerOutput:
         old_finished = self.finished_req_ids
         self.finished_req_ids = set()
 
-        # Periodically clear Metal cache to prevent memory accumulation
+        # Adaptive interval: scale inversely with concurrency to prevent
+        # Metal resource handle exhaustion under high-concurrency workloads.
+        active_seqs = len(self.running)
+        min_interval = max(4, self._clear_cache_interval // 4)
+        effective_interval = max(
+            min_interval, self._clear_cache_interval // max(1, active_seqs // 8)
+        )
+
         self._step_count += 1
-        if self._step_count % self._clear_cache_interval == 0:
+        if self._step_count % effective_interval == 0:
+            # Evaluate batch tokens to collapse lazy concatenation chains
+            if (
+                self.batch_generator is not None
+                and self.batch_generator.active_batch is not None
+                and hasattr(self.batch_generator.active_batch, "tokens")
+            ):
+                tokens = self.batch_generator.active_batch.tokens
+                if tokens:
+                    mx.eval(*tokens)
             mx.clear_cache()
 
         # Periodically log memory stats for monitoring