diff --git a/vllm_mlx/scheduler.py b/vllm_mlx/scheduler.py index 233b3109..88d144cb 100644 --- a/vllm_mlx/scheduler.py +++ b/vllm_mlx/scheduler.py @@ -2232,9 +2232,25 @@ def step(self, max_retries: int = 1) -> SchedulerOutput: old_finished = self.finished_req_ids self.finished_req_ids = set() - # Periodically clear Metal cache to prevent memory accumulation + # Adaptive interval: scale inversely with concurrency to prevent + # Metal resource handle exhaustion under high-concurrency workloads. + active_seqs = len(self.running) + min_interval = max(4, self._clear_cache_interval // 4) + effective_interval = max( + min_interval, self._clear_cache_interval // max(1, active_seqs // 8) + ) + self._step_count += 1 - if self._step_count % self._clear_cache_interval == 0: + if self._step_count % effective_interval == 0: + # Evaluate batch tokens to collapse lazy concatenation chains + if ( + self.batch_generator is not None + and self.batch_generator.active_batch is not None + and hasattr(self.batch_generator.active_batch, "tokens") + ): + tokens = self.batch_generator.active_batch.tokens + if tokens: + mx.eval(*tokens) mx.clear_cache() # Periodically log memory stats for monitoring