diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py
index 23459963f090..2031a8d6688d 100644
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -176,3 +176,46 @@ def test_models_distributed(
         distributed_executor_backend=distributed_executor_backend,
         enforce_eager=False,
     )
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
+def test_encoder_cache_cleanup(
+    vllm_runner,
+    model: str,
+    input_audios,
+    monkeypatch,
+) -> None:
+    """Test that the encoder cache is properly cleaned up after requests complete.
+
+    This is a regression test for a bug where encoder cache entries were freed
+    in the same scheduling step they were allocated, before the model could use
+    them.
+    """
+    # Run in single-process mode so the model runner's encoder cache is accessible.
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    check_model_available(model)
+
+    with vllm_runner(
+        model,
+        dtype="half",
+        max_model_len=448,
+        tensor_parallel_size=1,
+        limit_mm_per_prompt={"audio": 2},
+        enforce_eager=True,
+    ) as vllm_model:
+        engine_core = vllm_model.llm.llm_engine.engine_core.engine_core
+        model_runner = engine_core.model_executor.driver_worker.worker.model_runner
+        encoder_cache = model_runner.encoder_cache
+
+        # Run multiple sequential requests to ensure the cache is properly managed.
+        for vllm_prompts, _, audios in input_audios:
+            vllm_model.generate_greedy(vllm_prompts, max_tokens=50, audios=audios)
+
+        # After all requests complete, the encoder cache should be empty.
+        cache_size = len(encoder_cache)
+        assert cache_size == 0, (
+            f"Encoder cache should be empty after all requests complete, "
+            f"but has {cache_size} entries. This indicates encoder cache "
+            f"entries are not being properly freed."
+        )
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
index d73c05d2cf80..43b44fdaf665 100644
--- a/vllm/v1/core/encoder_cache_manager.py
+++ b/vllm/v1/core/encoder_cache_manager.py
@@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
     def __init__(self, cache_size: int):
         self.cache_size = cache_size
         self.num_free_slots = cache_size
-        self.freed: list[str] = []
+        self.allocated: list[str] = []
+        self.to_free: list[str] = []
 
     def check_and_update_cache(self, request: Request, input_id: int) -> bool:
         return False
@@ -383,7 +384,7 @@ def allocate(self, request: Request, input_id: int) -> None:
         self.num_free_slots -= num_encoder_embeds
 
         mm_hash = request.mm_features[input_id].identifier
-        self.freed.append(mm_hash)
+        self.allocated.append(mm_hash)
 
     def free(self, request: Request) -> None:
         for input_id in range(len(request.mm_features)):
@@ -393,9 +394,14 @@ def get_cached_input_ids(self, request: Request) -> set[int]:
         return set(range(len(request.mm_features)))
 
     def get_freed_mm_hashes(self) -> list[str]:
-        freed = self.freed
-        self.freed = []
-        return freed
+        # The encoder cache is not actually used for enc-dec models, so entries
+        # could be freed immediately. However, the runner frees the returned hashes
+        # *before* the model executes, so `to_free` buffers entries for one step,
+        # mimicking the state transition of `EncoderCacheManager`.
+        to_free = self.to_free
+        self.to_free = self.allocated
+        self.allocated = []
+        return to_free
 
     def free_encoder_input(self, request: Request, input_id: int) -> None:
         num_encoder_embeds = request.get_num_encoder_embeds(input_id)
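
A minimal, self-contained sketch of the deferred-free behaviour introduced above (the class and names below are illustrative stand-ins, not the actual vLLM classes): hashes allocated in step N are only reported as freed when `get_freed_mm_hashes()` is called in step N+1, so the runner cannot evict an entry before the model has consumed it.

```python
class DeferredFreeBuffer:
    """Hypothetical stand-in for EncoderDecoderCacheManager's new bookkeeping."""

    def __init__(self) -> None:
        self.allocated: list[str] = []  # hashes allocated in the current step
        self.to_free: list[str] = []    # hashes allocated last step, safe to free now

    def allocate(self, mm_hash: str) -> None:
        self.allocated.append(mm_hash)

    def get_freed_mm_hashes(self) -> list[str]:
        # Return last step's allocations; current allocations wait one more step.
        to_free = self.to_free
        self.to_free = self.allocated
        self.allocated = []
        return to_free


if __name__ == "__main__":
    buf = DeferredFreeBuffer()
    buf.allocate("audio-hash-1")
    assert buf.get_freed_mm_hashes() == []                 # step 1: nothing freed yet
    assert buf.get_freed_mm_hashes() == ["audio-hash-1"]   # step 2: freed after use
```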