diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a7c2a8800e7a..0e2e381f282d 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -4787,6 +4787,7 @@ def _dummy_run( pad_attn = cudagraph_runtime_mode == CUDAGraphMode.FULL attn_metadata, _ = self._build_attention_metadata( num_tokens=num_tokens_unpadded, + num_tokens_padded=num_tokens_padded if pad_attn else None, num_reqs=num_reqs_padded, max_query_len=max_query_len, ubatch_slices=ubatch_slices_padded if pad_attn else ubatch_slices,