Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions tests/v1/attention/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,4 +340,11 @@ class BackendConfig:
"cudagraph_mode": "FULL_AND_PIECEWISE",
},
),
"RocmAttn": BackendConfig(
name="RocmAttn",
env_vars={"VLLM_V1_USE_PREFILL_DECODE_ATTENTION": "1"},
comp_config={
"cudagraph_mode": "FULL",
},
),
}
62 changes: 42 additions & 20 deletions tests/v1/cudagraph/test_cudagraph_mode.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,22 @@ def temporary_environ(env_vars):

# test attention backend and cudagraph_mode combo
# (backend_name, cudagraph_mode, supported)
combo_cases_1 = [
("FA3", "FULL", True),
("FA3", "FULL_AND_PIECEWISE", True),
("FA2", "FULL", True), # Should fallback to FULL_AND_PIECEWISE
("FA2", "FULL_AND_PIECEWISE", True),
("FlashInfer", "FULL", True), # Should fallback to FULL_AND_PIECEWISE
("FlashInfer", "FULL_AND_PIECEWISE", True),
]
# Backend / cudagraph_mode combinations to exercise; every listed pair is
# expected to be supported (some fall back internally, e.g. FA2 and
# FlashInfer downgrade FULL to FULL_AND_PIECEWISE).
if current_platform.is_rocm():
    combo_cases_1 = [
        (backend, mode, True)
        for backend in ("RocmAttn", "TritonAttn")
        for mode in ("FULL", "FULL_AND_PIECEWISE")
    ]
else:
    combo_cases_1 = [
        (backend, mode, True)
        for backend in ("FA3", "FA2", "FlashInfer")
        for mode in ("FULL", "FULL_AND_PIECEWISE")
    ]


@pytest.mark.parametrize("backend_name, cudagraph_mode, supported", combo_cases_1)
Expand Down Expand Up @@ -92,18 +100,32 @@ def test_backend_and_cudagraph_mode_combo(backend_name, cudagraph_mode, supporte

# test cudagraph_mode with different compilation mode.
# (backend_name, cudagraph_mode, compilation_mode, supported)
combo_cases_2 = [
("FA2", "FULL", CompilationMode.NONE, True),
("FA2", "FULL", CompilationMode.VLLM_COMPILE, True),
("FA2", "PIECEWISE", CompilationMode.NONE, False),
("FA2", "PIECEWISE", CompilationMode.VLLM_COMPILE, True),
("FA2", "FULL_AND_PIECEWISE", CompilationMode.NONE, False),
("FA2", "FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
("FA2", "FULL_DECODE_ONLY", CompilationMode.NONE, True),
("FA2", "FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
("FA2", "NONE", CompilationMode.NONE, True),
("FA2", "NONE", CompilationMode.VLLM_COMPILE, True),
]
# The per-platform branches differ only in which attention backend is used;
# the (cudagraph_mode, compilation_mode, supported) table is shared.
_combo_backend_2 = "RocmAttn" if current_platform.is_rocm() else "FA2"
_mode_support_2 = [
    ("FULL", CompilationMode.NONE, True),
    ("FULL", CompilationMode.VLLM_COMPILE, True),
    ("PIECEWISE", CompilationMode.NONE, False),
    ("PIECEWISE", CompilationMode.VLLM_COMPILE, True),
    ("FULL_AND_PIECEWISE", CompilationMode.NONE, False),
    ("FULL_AND_PIECEWISE", CompilationMode.VLLM_COMPILE, True),
    ("FULL_DECODE_ONLY", CompilationMode.NONE, True),
    ("FULL_DECODE_ONLY", CompilationMode.VLLM_COMPILE, True),
    ("NONE", CompilationMode.NONE, True),
    ("NONE", CompilationMode.VLLM_COMPILE, True),
]
combo_cases_2 = [
    (_combo_backend_2, mode, comp_mode, supported)
    for mode, comp_mode, supported in _mode_support_2
]


@pytest.mark.parametrize(
Expand Down
4 changes: 2 additions & 2 deletions vllm/platforms/rocm.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,8 +317,8 @@ def get_attn_backend_cls(
return AttentionBackendEnum.TRITON_ATTN.get_path()

raise RuntimeError(
"V0 attention backends have been removed. Set VLLM_USE_V1=1 "
"to select a supported backend."
f"Attention backend {selected_backend.name} is not supported on "
"ROCm. Note that V0 attention backends have been removed."
)

@classmethod
Expand Down