Merged

54 commits
230246d
try to enable new fusion pass test for ROCm
tjtanaa Feb 9, 2026
1c9552a
fix silu-mul-groupquant fuion test
vllmellm Feb 9, 2026
bffe181
fix full graph test
vllmellm Feb 10, 2026
28ed03f
clearer test case for silu mul and group quant test
vllmellm Feb 10, 2026
5628eb9
fix e2e fusion tests
vllmellm Feb 10, 2026
223fe34
Merge branch 'fusionpassci' of https://github.com/EmbeddedLLM/vllm in…
vllmellm Feb 10, 2026
119b4b0
fix tests in fusion silu_mul and tidy up kite
vllmellm Feb 10, 2026
218fcfb
remove unnecessary change
vllmellm Feb 10, 2026
befaba1
remove duplicate
tjtanaa Feb 10, 2026
ca801a1
need to add quote
tjtanaa Feb 10, 2026
0b65174
fix syntax
tjtanaa Feb 10, 2026
be40a22
fix Fusion E2E TP2 (MI325) path
tjtanaa Feb 10, 2026
d8d0712
fix test-amd syntax
tjtanaa Feb 11, 2026
a03b94d
Merge remote-tracking branch 'origin/main' into fusionpassci
tjtanaa Feb 11, 2026
f58033a
revert pytorch tests
tjtanaa Feb 11, 2026
eabee32
fix agent pool
tjtanaa Feb 11, 2026
56ac061
add fix test_full_graph
tjtanaa Feb 11, 2026
b8c0bcd
remove unrelated comment
tjtanaa Feb 11, 2026
9ef71e4
reduce test and compute resource
tjtanaa Feb 11, 2026
158ea2f
skip kvcache tests and reverted the changes in test_full_graph
tjtanaa Feb 11, 2026
0997661
remove tj marker
tjtanaa Feb 11, 2026
f432148
syn main
tjtanaa Feb 11, 2026
6891c60
fix syntax
tjtanaa Feb 11, 2026
1e8fe87
add skip marker
tjtanaa Feb 11, 2026
b81b0f9
revert test
tjtanaa Feb 11, 2026
0326f76
fix the test case, amd cannot run nvidia model
tjtanaa Feb 11, 2026
9001be5
remove sequence parallel test
tjtanaa Feb 11, 2026
ca222af
skip sequence parallel on non-cuda
tjtanaa Feb 11, 2026
06b0aca
fix the test_config_generation.py
tjtanaa Feb 11, 2026
676184e
fix test_configuration
tjtanaa Feb 12, 2026
b566461
fix the qwen3 e2e fusion pass on ROCm + AITER
tjtanaa Feb 12, 2026
a7dd03f
Merge remote-tracking branch 'origin/main' into fusionpasscionly
tjtanaa Feb 12, 2026
24a142d
fix pytest command
tjtanaa Feb 12, 2026
7e2cca4
fix pre-commit
tjtanaa Feb 12, 2026
089969c
fix the model configi
tjtanaa Feb 12, 2026
5d05398
remove experimental flag
tjtanaa Feb 13, 2026
3af7195
Merge branch 'main' into fusionpasscionly
tjtanaa Feb 13, 2026
4922c7a
test suggestion
tjtanaa Feb 13, 2026
84f0847
Merge branch 'fusionpasscionly' of https://github.com/EmbeddedLLM/vll…
tjtanaa Feb 13, 2026
1aace95
revert pytorch test
tjtanaa Feb 14, 2026
3f0e188
remove mla related bugfix
tjtanaa Feb 14, 2026
64d3b63
convert condition to pytest.param
tjtanaa Feb 14, 2026
c186a19
apply suggestion
tjtanaa Feb 14, 2026
050544d
fix error from pytest.param
tjtanaa Feb 14, 2026
059205d
Merge branch 'main' into fusionpasscionly
tjtanaa Feb 15, 2026
97102c3
remove rocm branching in model defination
vllmellm Feb 24, 2026
071bdb7
sync main
tjtanaa Feb 24, 2026
0a42a79
remove todo
tjtanaa Feb 26, 2026
3204c5c
remove unnecessary test_tp1_quant.py
tjtanaa Feb 26, 2026
de42cfb
apply reviewer feedback
tjtanaa Feb 26, 2026
a168f7b
remove comment
tjtanaa Mar 2, 2026
53d253d
fix SiluMulGroupQaunt
vllmellm Mar 3, 2026
8374509
comment out redundant tests
tjtanaa Mar 3, 2026
4b0cd59
Merge remote-tracking branch 'origin/main' into fusionpasscionly
tjtanaa Mar 3, 2026
147 changes: 107 additions & 40 deletions .buildkite/test-amd.yaml
@@ -610,6 +610,8 @@ steps:
--ignore=lora/test_qwen3moe_tp.py
parallelism: 4

##### .buildkite/test_areas/pytorch.yaml #####
# corresponds to .buildkite/test_areas/pytorch.yaml
- label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction]
@@ -627,6 +629,20 @@ steps:
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"

# corresponds to .buildkite/test_areas/pytorch.yaml
- label: PyTorch Compilation Passes Unit Tests
timeout_in_minutes: 20
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
source_file_dependencies:
- vllm/
- tests/compile/passes
commands:
# TODO: clean up this comment if not needed. It is used to
# keep track of the tests changes during vLLM IR Ops refactoring.
# Use `find` to launch multiple instances of pytest.
- "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"

- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction]
@@ -1211,41 +1227,6 @@ steps:
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/passes/test_fusion_attn.py
- tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/passes/test_fusion_attn.py
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py

# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# # Wrap with quotes to escape yaml
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
@@ -1371,7 +1352,6 @@ steps:
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s compile/correctness_e2e/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -1601,16 +1581,16 @@ steps:
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
- pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
Comment on lines 1582 to 1583
Collaborator

These are actually passing? I'm surprised.

Collaborator Author

Yes, the tests still pass, but the logs are not useful: the fused ops just call torch.ops.symm_mem, which exists on ROCm even though those ops don't actually work there.

tests/compile/fusions_e2e/test_tp2_async_tp.py also passes, but that doesn't mean the feature works on ROCm.

- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
# TODO: this test is not supported on ROCm, there are aiter kernels for this.
Collaborator

Can you quote this issue (and let's make a sub-issue?): #25179

Collaborator Author

Created

# - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
# - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
- pytest -v -s tests/v1/distributed/test_dbo.py
# this test is not supported on ROCm
# - pytest -v -s tests/v1/distributed/test_dbo.py

##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1721,6 +1701,93 @@ steps:
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

##### .buildkite/test_areas/compile.yaml #####
# Slowly setting up the tests so that it is also easier for the
# CI team to review and upstream to the pipelinev2.
# The following tests are important for vLLM IR Ops refactoring,
# which affects fusion passes on ROCm. So we have to
# enable them as soon as possible.

## TODO: Enable the test in this group
# # corresponds to .buildkite/test_areas/compile.yaml
# - label: Fusion and Compile Unit Tests (2xMI325 GPUs)
# timeout_in_minutes: 20
# working_dir: "/vllm-workspace/"
# mirror_hardwares: [amdexperimental, amdproduction, tj]
# agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs
# source_file_dependencies:
# - csrc/quantization/fp4/
# - vllm/model_executor/layers/quantization/
# - vllm/model_executor/layers/layernorm.py
# - vllm/model_executor/layers/activation.py
# - vllm/model_executor/layers/attention/attention.py
# - vllm/v1/attention/backends/flashinfer.py
# - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
# - tests/compile/test_fusion_attn.py
# - tests/compile/test_silu_mul_quant_fusion.py
# - tests/compile/distributed/test_fusion_all_reduce.py
# - tests/compile/fullgraph/test_full_graph.py
# commands:
# - rocm-smi
# # we run all backend tests on ROCm
# # These two tests are covered in "PyTorch Compilation Passes Unit Tests"
# # - "pytest -v -s tests/compile/passes/test_fusion_attn.py"
# # - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py"
# # TODO: this test is not supported on ROCm, there are aiter kernels for this.
# # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
# # TODO: find out more details
# # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

# corresponds to .buildkite/test_areas/compile.yaml
- label: Fusion E2E Quick (MI325)
timeout_in_minutes: 15
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- rocm-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
# Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER
- "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'"

# corresponds to .buildkite/test_areas/compile.yaml
- label: Fusion E2E Config Sweep (MI325)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- rocm-smi
# Run just llama3 (fp8) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"

## There are no ops on ROCm for these tests.
## The test still passes but the logs are not useful.
## fused ops just call torch.ops.symm_mem which
## exists in ROCm even though they don't work
# - label: AsyncTP Correctness Tests (2xMI325 GPUs)
# - label: Fusion E2E TP2 Quick (MI325)
# - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325)
# - label: Fusion E2E TP2 (MI325)
# - label: Sequence Parallel Correctness Tests (2xMI325 GPUs)


#####################################################################################################################################
4 changes: 4 additions & 0 deletions tests/compile/fusions_e2e/common.py
@@ -13,6 +13,7 @@

class Matches(NamedTuple):
# simple pointwise
aiter_rms_quant_fusion: int = 0
Collaborator

Is this ever a different number from rms_quant_fusion?

Collaborator Author

It will never differ from rms_quant_fusion. I am introducing the new aiter_rms_quant_fusion key because of the grep logic that counts how many ops have been replaced: the new key selects a different log pattern. Once we unify the implementation in rocm_aiter_fusion.py into the existing fusion op categories, we should be able to remove this aiter_rms_quant_fusion key.

Collaborator

Sounds good, thanks!

rms_quant_fusion: int = 0
act_quant_fusion: int = 0
norm_rope_fusion: int = 0
@@ -82,6 +83,9 @@ def has_cuda_graph_wrapper_metadata() -> bool:
]

FUSION_LOG_PATTERNS: dict[str, re.Pattern] = {
"aiter_rms_quant_fusion": re.compile(
r"RocmAiterRMSNormQuantFusionPass Replaced (\d+) patterns"
),
"rms_quant_fusion": re.compile(r"rms_quant_fusion.py:\d+] Replaced (\d+) patterns"),
"act_quant_fusion": re.compile(r"act_quant_fusion.py:\d+] Replaced (\d+) patterns"),
"norm_rope_fusion": re.compile(
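A minimal illustrative sketch (not part of this PR) of how the FUSION_LOG_PATTERNS regexes above can be used to count the "Replaced N patterns" lines each fusion pass emits; on ROCm with AITER the test swaps the "rms_quant_fusion" key for "aiter_rms_quant_fusion", so a different pattern is matched against the same logs. The count_fusion_matches helper and the example log line are hypothetical; only the two regexes are taken from the diff.

import re

FUSION_LOG_PATTERNS: dict[str, re.Pattern] = {
    "aiter_rms_quant_fusion": re.compile(
        r"RocmAiterRMSNormQuantFusionPass Replaced (\d+) patterns"
    ),
    "rms_quant_fusion": re.compile(
        r"rms_quant_fusion.py:\d+] Replaced (\d+) patterns"
    ),
}


def count_fusion_matches(log_text: str, keys: list[str]) -> dict[str, int]:
    # Sum every "Replaced N patterns" occurrence reported for each requested key.
    return {
        key: sum(int(n) for n in FUSION_LOG_PATTERNS[key].findall(log_text))
        for key in keys
    }


# Hypothetical usage: a captured log line from an AITER run.
example_log = "RocmAiterRMSNormQuantFusionPass Replaced 6 patterns"
print(count_fusion_matches(example_log, ["aiter_rms_quant_fusion"]))
# -> {'aiter_rms_quant_fusion': 6}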
5 changes: 5 additions & 0 deletions tests/compile/fusions_e2e/conftest.py
@@ -63,9 +63,14 @@ def run(
compilation_config: dict,
matches_check: list[str],
use_deepgemm: bool = False,
use_aiter: bool = False,
tp_size: int = 1,
):
monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1" if use_deepgemm else "0")
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1" if use_aiter else "0")
from vllm._aiter_ops import rocm_aiter_ops

rocm_aiter_ops.refresh_env_variables()

# Disable compile cache to make sure custom passes run.
# Otherwise, we can't verify fusion happened through the logs.
22 changes: 20 additions & 2 deletions tests/compile/fusions_e2e/models.py
@@ -2,6 +2,8 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

from vllm._aiter_ops import is_aiter_found_and_supported
from vllm.platforms import current_platform
from vllm.utils.flashinfer import has_flashinfer
from vllm.v1.attention.backends.registry import AttentionBackendEnum

@@ -24,6 +26,24 @@
AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN"
)

ROCM_ATTN = pytest.param(
AttentionBackendCase(backend=AttentionBackendEnum.ROCM_ATTN),
id="ROCM_ATTN",
marks=pytest.mark.skipif(
not current_platform.is_rocm(),
reason="ROCm attention only for AMD",
),
)

ROCM_AITER_UNIFIED_ATTN = pytest.param(
AttentionBackendCase(backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN),
id="ROCM_AITER_UNIFIED_ATTN",
marks=pytest.mark.skipif(
not is_aiter_found_and_supported(),
reason="ROCM_AITER_UNIFIED_ATTN only for AMD when AITER is installed",
),
)

# Models
llama3_8b = ModelFusionInfo(
model_name="meta-llama/Llama-3.1-8B-Instruct",
@@ -49,7 +69,6 @@
llama3_8b_fp4 = ModelFusionInfo(
model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
matches=lambda n_layers: Matches(
rms_quant_fusion=0,
act_quant_fusion=n_layers,
attn_quant_fusion=n_layers,
ar_rms_fusion=n_layers * 2 + 1,
@@ -79,7 +98,6 @@
model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4",
hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
matches=lambda n_layers: Matches(
rms_quant_fusion=0,
attn_quant_fusion=n_layers,
ar_rms_fusion=n_layers * 2,
sequence_parallel=n_layers * 2,
42 changes: 39 additions & 3 deletions tests/compile/fusions_e2e/test_tp1_quant.py
@@ -5,6 +5,7 @@
import pytest

from vllm.config import PassConfig
from vllm.platforms import current_platform
from vllm.utils.flashinfer import is_flashinfer_fp8_blockscale_gemm_supported

from .common import (
@@ -16,6 +17,8 @@
)
from .models import (
FLASHINFER_ATTN,
ROCM_AITER_UNIFIED_ATTN,
ROCM_ATTN,
TRITON_ATTN,
llama3_8b_fp4,
llama3_8b_fp8,
@@ -29,12 +32,33 @@
"model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm",
[
(*llama3_8b_fp8, False),
(*llama4_scout_fp8, False),
(*qwen3_a3b_fp8, False),
(*qwen3_a3b_fp8, True),
pytest.param(
*llama4_scout_fp8,
False,
marks=pytest.mark.skipif(
not current_platform.is_cuda(),
reason="Llama4 Scout FP8 only supported on CUDA",
),
),
pytest.param(
*qwen3_a3b_fp8,
True,
marks=pytest.mark.skipif(
not current_platform.is_cuda(), reason="DeepGemm only supported on CUDA"
),
),
],
)
@pytest.mark.parametrize(
"attn_backend",
[
TRITON_ATTN,
FLASHINFER_ATTN,
ROCM_ATTN,
ROCM_AITER_UNIFIED_ATTN,
],
)
@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
@pytest.mark.parametrize("n_layers", [6])
@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
@@ -81,13 +105,24 @@ def test_tp1_fp8_fusions(
),
)

use_aiter = current_platform.is_rocm() and ("qwen" in model_name.lower())

matches_check = [
"rms_quant_fusion",
"act_quant_fusion",
"norm_rope_fusion",
"attn_quant_fusion",
]

if use_aiter:
matches_check[0] = "aiter_rms_quant_fusion"

matches = matches._replace(aiter_rms_quant_fusion=matches.rms_quant_fusion)
# TODO: enable the `norm_rope_fusion` test,
# On ROCm norm_rope_fusion is only supported without
# enabling AITER.
matches_check.remove("norm_rope_fusion")

run_e2e_fusion_test(
model_name,
matches,
@@ -96,6 +131,7 @@
compilation_config,
matches_check,
use_deepgemm=use_deepgemm,
use_aiter=use_aiter,
)


3 changes: 3 additions & 0 deletions tests/compile/fusions_e2e/test_tp2_ar_rms.py
@@ -5,6 +5,7 @@
import pytest

from vllm.config import PassConfig
from vllm.platforms import current_platform

from ...utils import multi_gpu_test
from .common import (
@@ -26,6 +27,8 @@
qwen3_a3b_fp8,
)

pytestmark = pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
3 changes: 3 additions & 0 deletions tests/compile/fusions_e2e/test_tp2_async_tp.py
@@ -5,6 +5,7 @@
import pytest

from vllm.config import PassConfig
from vllm.platforms import current_platform

from ...utils import multi_gpu_test
from .common import (
@@ -23,6 +24,8 @@
qwen3_a3b,
)

pytestmark = pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA")


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(