Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions tests/models/multimodal/generation/test_whisper.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,46 @@ def test_models_distributed(
distributed_executor_backend=distributed_executor_backend,
enforce_eager=False,
)


@pytest.mark.core_model
@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"])
def test_encoder_cache_cleanup(
    vllm_runner,
    model: str,
    input_audios,
    monkeypatch,
) -> None:
    """Regression test: encoder cache entries must survive the step that
    allocated them.

    Guards against a bug where encoder cache entries were freed in the same
    scheduling step in which they were allocated, i.e. before the model could
    consume them. Drives several sequential requests, then checks that the
    runner's encoder cache has been fully drained.
    """
    # Disable engine multiprocessing so the model runner (and its encoder
    # cache) lives in this process and can be inspected directly.
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
    check_model_available(model)

    runner_kwargs = dict(
        dtype="half",
        max_model_len=448,
        tensor_parallel_size=1,
        limit_mm_per_prompt={"audio": 2},
        enforce_eager=True,
    )
    with vllm_runner(model, **runner_kwargs) as vllm_model:
        core = vllm_model.llm.llm_engine.engine_core.engine_core
        runner = core.model_executor.driver_worker.worker.model_runner
        cache = runner.encoder_cache

        # Issue the requests back to back so that cache allocation and
        # freeing are exercised across multiple scheduling cycles.
        for prompts, _, audios in input_audios:
            vllm_model.generate_greedy(prompts, max_tokens=50, audios=audios)

        # Every cache entry must have been released once all requests finish.
        cache_size = len(cache)
        assert cache_size == 0, (
            f"Encoder cache should be empty after all requests complete, "
            f"but has {cache_size} entries. This indicates encoder cache "
            f"entries are not being properly freed."
        )
Comment on lines +181 to +221
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The addition of test_encoder_cache_cleanup is an excellent and necessary regression test. It directly targets the memory leak issue by verifying that the encoder cache is empty after multiple sequential requests, providing strong assurance that the fix is effective and robust.

16 changes: 11 additions & 5 deletions vllm/v1/core/encoder_cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,8 @@ class EncoderDecoderCacheManager(EncoderCacheManager):
def __init__(self, cache_size: int):
    """Initialize the encoder-decoder cache manager.

    Args:
        cache_size: Total number of encoder-embedding slots available.
    """
    self.cache_size = cache_size
    self.num_free_slots = cache_size
    # Hashes of multimodal inputs allocated during the current scheduling
    # step. They are moved to `to_free` on the next get_freed_mm_hashes()
    # call, so each entry survives one full step before being freed
    # (double-buffering; prevents freeing before the model consumes them).
    self.allocated: list[str] = []
    # Hashes whose entries will be reported as freed on the next
    # get_freed_mm_hashes() call.
    self.to_free: list[str] = []
Comment on lines +360 to +361
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The change from a single freed list to allocated and to_free lists is crucial for implementing the double-buffering mechanism. This ensures that entries are not prematurely marked for freeing before the model has a chance to use them, directly addressing the memory leak.


def check_and_update_cache(self, request: Request, input_id: int) -> bool:
    # Always reports a cache miss: for encoder-decoder models, encoder
    # outputs are presumably never reused across requests, so there is no
    # cached entry to hit. NOTE(review): confirm against the base
    # EncoderCacheManager's contract for this method.
    return False
Expand All @@ -383,7 +384,7 @@ def allocate(self, request: Request, input_id: int) -> None:
self.num_free_slots -= num_encoder_embeds

mm_hash = request.mm_features[input_id].identifier
self.freed.append(mm_hash)
self.allocated.append(mm_hash)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Updating the allocate method to append to self.allocated correctly marks newly allocated entries for processing in the next scheduling step, aligning with the double-buffering strategy to prevent premature freeing.


def free(self, request: Request) -> None:
for input_id in range(len(request.mm_features)):
Expand All @@ -393,9 +394,14 @@ def get_cached_input_ids(self, request: Request) -> set[int]:
return set(range(len(request.mm_features)))

def get_freed_mm_hashes(self) -> list[str]:
freed = self.freed
self.freed = []
return freed
    # As the encoder cache is not reused for enc-dec models, entries could in
    # principle be freed from here. However, the actual free happens in the
    # runner *before* the model is executed, so `to_free` acts as a buffer
    # that releases entries only after the model has executed, mimicking the
    # state transition of `EncoderCacheManager`.
to_free = self.to_free
self.to_free = self.allocated
self.allocated = []
Comment on lines +397 to +403
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This new logic for get_freed_mm_hashes is the core of the fix. By moving self.allocated to self.to_free and then returning the previous self.to_free list, the system ensures that cache entries are only freed after a full scheduling cycle. This effectively prevents the race condition that caused the memory leak by delaying the deallocation until the encoder outputs have been consumed by the model.

return to_free

def free_encoder_input(self, request: Request, input_id: int) -> None:
num_encoder_embeds = request.get_num_encoder_embeds(input_id)
Expand Down
Loading