From 230246d9349db65878aee63f7c4c8b4a920fa821 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Mon, 9 Feb 2026 08:28:20 +0000 Subject: [PATCH 01/45] try to enable new fusion pass test for ROCm Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 174 +++++++++++++----- tests/compile/fullgraph/test_full_graph.py | 34 +++- tests/compile/fusions_e2e/models.py | 83 +++++++-- tests/compile/fusions_e2e/test_tp1_quant.py | 15 +- tests/compile/passes/test_fusion_attn.py | 2 +- .../passes/test_silu_mul_quant_fusion.py | 15 +- .../layers/quantization/input_quant_fp8.py | 1 + 7 files changed, 245 insertions(+), 79 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 503b3a76f941..986708e37641 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -570,9 +570,11 @@ steps: --ignore=lora/test_qwen3moe_tp.py parallelism: 4 +##### .buildkite/test_areas/pytorch.yaml ##### +# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -586,10 +588,14 @@ steps: # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # TODO: clean up this comment if not needed. It is used to + # keep track of the tests changes during vLLM IR Ops refactoring. 
+ - pytest -s -v compile/passes --ignore compile/passes/distributed +# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -603,9 +609,10 @@ steps: # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] +# corresponds to .buildkite/test_areas/pytorch.yaml +- label: PyTorch Fullgraph # 27min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -1176,41 +1183,6 @@ steps: - pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py -- label: Blackwell Fusion and Compile Tests # 30 min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/passes/test_fusion_attn.py - - tests/compile/passes/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py - commands: - - nvidia-smi - - pytest -v -s tests/compile/passes/test_fusion_attn.py - - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py - 
# this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - - # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # # Wrap with quotes to escape yaml - # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" @@ -1334,7 +1306,6 @@ steps: - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s distributed/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py @@ -1558,17 +1529,20 @@ steps: num_gpus: 2 commands: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py + # ================= 24 passed, 11 warnings in 192.85s (0:03:12) ================== - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py + # ================== 48 passed, 8 warnings in 386.41s (0:06:26) ================== - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + # ======================== 8 skipped, 9 warnings in 2.08s ======================== #- pytest -v -s 
tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
-
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
  - pytest -v -s tests/distributed/test_context_parallel.py
+  # ======================== 4 passed, 3 warnings in 30.45s ========================
  - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
  - pytest -v -s tests/v1/distributed/test_dbo.py
+  # ======================== 2 skipped, 3 warnings in 1.97s ========================

##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1692,3 +1666,115 @@ steps:
   working_dir: "/vllm-workspace"
   commands:
     - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
+
+##### .buildkite/test_areas/compile.yaml #####
+# Slowly setting up the tests so that it is also easier for the
+# CI team to review and upstream to the pipelinev2.
+# The following tests are important for vLLM IR Ops refactoring,
+# which affects fusion passes on ROCm. So we have to
+# enable them as soon as possible. 
+ +# corresponds to .buildkite/test_areas/compile.yaml +- label: Sequence Parallel Correctness Tests (2xMI325 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_2 + num_devices: 2 + source_file_dependencies: + - vllm/model_executor/layers/ + - vllm/compilation/ + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - tests/compile/correctness_e2e/test_sequence_parallel.py + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion and Compile Unit Tests (2xMI325 GPUs) + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/ + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/fullgraph/test_full_graph.py + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/ + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes + - tests/compile/passes/test_fusion_attn.py + - tests/compile/passes/test_silu_mul_quant_fusion.py + - tests/compile/passes/distributed/test_fusion_all_reduce.py + - tests/compile/fullgraph/test_full_graph.py + commands: + - rocm-smi + # we run all 
backend tests on ROCm + - pytest -v -s tests/compile/passes/test_fusion_attn.py + - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py + # TODO: this test is not supported on ROCm, there are aiter kernels for this. + # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + # TODO: this test is not supported on ROCm + # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E Quick (MI325) + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_1 + num_devices: 1 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E Config Sweep (MI325) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_1 + num_devices: 1 + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run just 
llama3 (fp8) for all config combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" + +# corresponds to .buildkite/test_areas/kernels.yaml +# Skip the following tests as they are not supported on ROCm +# - label: Fusion E2E TP2 AR-RMS Config Sweep (H100) +# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100) +# - label: Fusion E2E TP2 (B200) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index ed4c92d90ff7..733ec22c98d6 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -194,13 +194,31 @@ def test_custom_compile_config( ) @pytest.mark.parametrize( "model, backend", - [ - ("Qwen/Qwen2-0.5B", None), # Standard attention model - ( - "deepseek-ai/DeepSeek-V2-Lite", - AttentionBackendEnum.FLASHINFER_MLA, - ), # MLA (Multi-head Latent Attention) model - ], + ( + [ + ("Qwen/Qwen2-0.5B", None), # Standard attention model + ( + "deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.FLASHINFER_MLA, + ), # MLA (Multi-head Latent Attention) model + ] + if current_platform.is_cuda() + else [ + # ("Qwen/Qwen2-0.5B", None), # Standard attention model + # ( + # "deepseek-ai/DeepSeek-V2-Lite", + # AttentionBackendEnum.TRITON_MLA, + # ), # MLA (Multi-head Latent Attention) model + ( + "deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.ROCM_AITER_MLA, + ), # MLA (Multi-head Latent Attention) model + ( + "deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.ROCM_AITER_TRITON_MLA, + ), # MLA (Multi-head Latent Attention) model + ] + ), ) def test_fp8_kv_scale_compile( compilation_mode: int, @@ -209,7 +227,7 @@ def test_fp8_kv_scale_compile( ): model_kwargs = { "quantization": "fp8", - "kv_cache_dtype": "fp8_e4m3", + "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", "calculate_kv_scales": True, 
"max_model_len": 512, } diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index f54f617c64d4..525ed1b515bc 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer from vllm.v1.attention.backends.registry import AttentionBackendEnum @@ -24,37 +25,83 @@ AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN" ) +# ROCm backends +ROCM_ATTN = pytest.param( + AttentionBackendCase(backend=AttentionBackendEnum.ROCM_ATTN), id="ROCM_ATTN" +) + +ROCM_AITER_UNIFIED_ATTN = pytest.param( + AttentionBackendCase(backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN), + id="ROCM_AITER_UNIFIED_ATTN", +) + # Models llama3_8b = ModelFusionInfo( model_name="meta-llama/Llama-3.1-8B-Instruct", - matches=lambda n_layers: Matches( - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + matches=( + lambda n_layers: Matches( + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ), + ) + if current_platform.is_cuda() + else ( # ROCm matches + lambda n_layers: Matches( + ar_rms_fusion=0, + sequence_parallel=0, + async_tp=0, + ), ), ) llama3_8b_fp8 = ModelFusionInfo( model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - matches=lambda n_layers: Matches( - rms_quant_fusion=n_layers * 2, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=n_layers * 2, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) + ) + if current_platform.is_cuda() + else ( # ROCm 
matches + lambda n_layers: Matches( + rms_quant_fusion=n_layers * 2, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=0, + async_tp=0, + ), ), ) llama3_8b_fp4 = ModelFusionInfo( model_name="nvidia/Llama-3.1-8B-Instruct-FP4", - matches=lambda n_layers: Matches( - rms_quant_fusion=0, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=0, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) + ) + if current_platform.is_cuda() + else ( # ROCm matches + lambda n_layers: Matches( + rms_quant_fusion=0, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=0, + async_tp=0, + ), ), ) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 03f102794f85..25a607051a89 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -5,6 +5,7 @@ import pytest from vllm.config import PassConfig +from vllm.platforms import current_platform from .common import ( INDUCTOR_GRAPH_PARTITION, @@ -15,11 +16,12 @@ ) from .models import ( FLASHINFER_ATTN, + ROCM_AITER_UNIFIED_ATTN, + ROCM_ATTN, TRITON_ATTN, llama3_8b_fp4, llama3_8b_fp8, llama4_scout_fp4, - llama4_scout_fp8, qwen3_a3b_fp8, ) @@ -28,12 +30,17 @@ "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm", [ (*llama3_8b_fp8, False), - (*llama4_scout_fp8, False), + # (*llama4_scout_fp8, False), (*qwen3_a3b_fp8, False), - (*qwen3_a3b_fp8, True), + # (*qwen3_a3b_fp8, True), ], ) -@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) +@pytest.mark.parametrize( + "attn_backend", + [TRITON_ATTN, FLASHINFER_ATTN] + if current_platform.is_cuda() + else 
[TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], +) @pytest.mark.parametrize("n_layers", [6]) @pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) diff --git a/tests/compile/passes/test_fusion_attn.py b/tests/compile/passes/test_fusion_attn.py index 75d5c42f0731..a35db7bb21ff 100644 --- a/tests/compile/passes/test_fusion_attn.py +++ b/tests/compile/passes/test_fusion_attn.py @@ -267,7 +267,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor): PATTERN_TEST_MODELS_FP8 = [ ("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel) ] - BACKENDS = [ + BACKENDS_FP8 = [ AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN, AttentionBackendEnum.ROCM_ATTN, AttentionBackendEnum.TRITON_ATTN, diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index c5ef015015ce..64aad53525a5 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools +from typing import Any import pytest import torch @@ -148,6 +149,9 @@ def __init__(self, hidden_size: int, **kwargs): weight_group_shape=GroupShape(128, 128), act_quant_group_shape=GroupShape(1, 128), cutlass_block_fp8_supported=False, + # this parameter cannot always be True, + # it depends on the VLLM_ROCM_USE_AITER + # and VLLM_ROCM_USE_AITER_LINEAR environment variables use_aiter_and_is_supported=True, ) self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() @@ -181,6 +185,12 @@ def ops_in_model_after(self): ] TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS +EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [ + (TestSiluMulGroupFp8QuantModel, False, None), +] +if current_platform.is_cuda(): 
+    EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None))
+

 @pytest.mark.parametrize("num_tokens", [32, 64])
 @pytest.mark.parametrize("hidden_size", [128, 256])
@@ -189,10 +199,7 @@ def ops_in_model_after(self):
 @pytest.mark.parametrize(
     "model_class, enable_quant_fp8_custom_op, force_kernel",
     list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS))
-    + [
-        (TestSiluMulNvfp4QuantModel, False, None),
-        (TestSiluMulGroupFp8QuantModel, False, None),
-    ],
+    + EXTENDED_TESTCASES,
 )
 @pytest.mark.skipif(
     envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm"
diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py
index 5bc78afa43b0..ed3b981cf183 100644
--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -171,6 +171,7 @@ def forward_native(
         x: torch.Tensor,
         scale: torch.Tensor | None = None,
         scale_ub: torch.Tensor | None = None,
+        **kwargs,
     ):
         if self.is_group_quant and not self.static:
             assert scale is None, "Dynamic group quantization does not use scale"

From 1c9552affe7f37454cde6b496a64b26e0d859ce0 Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Mon, 9 Feb 2026 13:21:08 +0000
Subject: [PATCH 02/45] fix silu-mul-groupquant fusion test

Signed-off-by: vllmellm
---
 .../passes/test_silu_mul_quant_fusion.py           | 10 +++++++---
 .../passes/fusion/rocm_aiter_fusion.py             | 18 +++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py
index 64aad53525a5..687bb9aa6bfe 100644
--- a/tests/compile/passes/test_silu_mul_quant_fusion.py
+++ b/tests/compile/passes/test_silu_mul_quant_fusion.py
@@ -10,7 +10,7 @@
 from tests.compile.backend import TestBackend
 from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
 from tests.utils import 
TestFP8Layer -from vllm._aiter_ops import IS_AITER_FOUND +from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.compilation.passes.fusion.act_quant_fusion import ( FUSED_OPS, @@ -186,7 +186,7 @@ def ops_in_model_after(self): TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [ - (TestSiluMulGroupFp8QuantModel, False, None), + (TestSiluMulGroupFp8QuantModel, True, None), ] if current_platform.is_cuda(): EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None)) @@ -216,6 +216,7 @@ def test_fusion_silu_and_mul_quant( enable_silu_mul_custom_op: bool, enable_quant_fp8_custom_op: bool, force_kernel: FP8ScaledMMLinearKernel | None, + monkeypatch: pytest.MonkeyPatch, ): if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported(): pytest.skip("NVFP4 is not supported on this GPU.") @@ -242,13 +243,16 @@ def test_fusion_silu_and_mul_quant( ), ) - with set_current_vllm_config(config): + with set_current_vllm_config(config), monkeypatch.context() as m: fusion_passes = [ActivationQuantFusionPass(config)] if IS_AITER_FOUND: from vllm.compilation.passes.fusion.rocm_aiter_fusion import ( RocmAiterSiluMulFp8GroupQuantFusionPass, ) + m.setenv("VLLM_ROCM_USE_AITER", "1") + rocm_aiter_ops.refresh_env_variables() + fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)] diff --git a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py index d8131ce952d2..99278365c5db 100644 --- a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py +++ b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py @@ -5,7 +5,6 @@ import torch._inductor.pattern_matcher as pm from torch import fx from torch._inductor.pattern_matcher import PatternMatcherPass -from torch._ops import 
OpOverload import vllm.model_executor.layers.quantization.utils.fp8_utils # noqa: F401 from vllm._aiter_ops import rocm_aiter_ops @@ -15,6 +14,7 @@ GroupShape, QuantKey, ScaleDesc, + kFp8Static128BlockSym, ) from vllm.platforms import current_platform @@ -332,9 +332,11 @@ class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern): FUSED_SILU_MUL_QUANT_OP = rocm_aiter_ops.get_act_mul_fused_fp8_group_quant_op() - def __init__(self, quant_op: OpOverload) -> None: + def __init__(self) -> None: self.silu_and_mul_matcher = MatcherSiluAndMul() - self.quant_op = quant_op + self.quant_matcher = MatcherQuantFP8( + quant_key=kFp8Static128BlockSym, match_rocm_aiter=True + ) def get_inputs(self) -> list[torch.Tensor]: return [ @@ -346,7 +348,7 @@ def pattern( input: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: at1 = self.silu_and_mul_matcher(input) - at2 = self.quant_op(at1, 128) + at2 = self.quant_matcher(at1, 128) return at2[0], at2[1] def replacement( @@ -370,11 +372,6 @@ class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass): https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 """ - AITER_GROUP_FP8_QUANT_OP = rocm_aiter_ops.get_group_quant_op() - TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default - - QUANT_OPS = [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP] - @enable_fake_mode def __init__(self, config: VllmConfig) -> None: super().__init__(config) @@ -383,8 +380,7 @@ def __init__(self, config: VllmConfig) -> None: pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass" ) - for quant_op in self.QUANT_OPS: - AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns) + AiterSiluMulFp8GroupQuantPattern().register(self.patterns) self.dump_patterns(config, self.patterns) From bffe1814354d8ed1055c6a02ecdde905f3f61549 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 07:42:59 +0000 Subject: [PATCH 03/45] fix full graph test Signed-off-by: vllmellm --- 
tests/compile/fullgraph/test_full_graph.py | 27 ++++++++++++------- .../layers/attention/mla_attention.py | 1 + 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 733ec22c98d6..921f57cea0a6 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -10,6 +10,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams +from vllm._aiter_ops import rocm_aiter_ops from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -224,17 +225,25 @@ def test_fp8_kv_scale_compile( compilation_mode: int, model: str, backend: AttentionBackendEnum | None, + monkeypatch: pytest.MonkeyPatch, ): - model_kwargs = { - "quantization": "fp8", - "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", - "calculate_kv_scales": True, - "max_model_len": 512, - } - if backend: - model_kwargs["attention_config"] = {"backend": backend.name} + with monkeypatch.context() as m: + model_kwargs = { + "quantization": "fp8", + "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", + "calculate_kv_scales": True, + "max_model_len": 512, + } + if backend: + model_kwargs["attention_config"] = {"backend": backend.name} + if current_platform.is_rocm(): + m.setenv("VLLM_ROCM_USE_AITER", "1") + # Disable Aiter MOE as some shapes are not supported + m.setenv("VLLM_ROCM_USE_AITER_MOE", "0") + + rocm_aiter_ops.refresh_env_variables() - run_model(compilation_mode, model, **model_kwargs) + run_model(compilation_mode, model, **model_kwargs) def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 
c31aa7b41d0d..ed31a2d176e3 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -403,6 +403,7 @@ def __init__( self.is_aiter_triton_fp4_bmm_enabled = ( rocm_aiter_ops.is_fp4bmm_enabled() and self.kv_b_proj.weight.dtype == torch.bfloat16 + and current_platform.has_device_capability(95) # gfx950 and above ) # Attributes for forward_impl method From 28ed03f12de4842576366a34efbf31d44cb2a34a Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 07:52:31 +0000 Subject: [PATCH 04/45] clearer test case for silu mul and group quant test Signed-off-by: vllmellm --- tests/compile/passes/test_silu_mul_quant_fusion.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index 687bb9aa6bfe..c6794a156240 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -184,10 +184,11 @@ def ops_in_model_after(self): PerTensorTorchFP8ScaledMMLinearKernel, ] TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS +EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [] +# SiluMulGroupFp8Quant is only supported on ROCm +if current_platform.is_rocm(): + EXTENDED_TESTCASES.append((TestSiluMulGroupFp8QuantModel, True, None)) -EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [ - (TestSiluMulGroupFp8QuantModel, True, None), -] if current_platform.is_cuda(): EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None)) @@ -245,7 +246,7 @@ def test_fusion_silu_and_mul_quant( with set_current_vllm_config(config), monkeypatch.context() as m: fusion_passes = [ActivationQuantFusionPass(config)] - if IS_AITER_FOUND: + if current_platform.is_rocm() and IS_AITER_FOUND: from vllm.compilation.passes.fusion.rocm_aiter_fusion import ( RocmAiterSiluMulFp8GroupQuantFusionPass, ) From 
5628eb9b272da81a81da4477cb5774fdf7143632 Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Tue, 10 Feb 2026 08:40:13 +0000
Subject: [PATCH 05/45] fix e2e fusion tests

Signed-off-by: vllmellm
---
 .buildkite/test-amd.yaml                      |  71 +++++++++-
 tests/compile/fusions_e2e/models.py           | 125 ++++++++++++------
 .../compile/fusions_e2e/test_tp2_async_tp.py  |  14 +-
 3 files changed, 164 insertions(+), 46 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 986708e37641..64a878baa774 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1691,6 +1691,17 @@ steps:
   - export VLLM_TEST_CLEAN_GPU_MEMORY=1
   - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
 
+# corresponds to .buildkite/test_areas/compile.yaml
+- label: AsyncTP Correctness Tests (2xMI325 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: mi325_2
+  optional: true
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
+
 # corresponds to .buildkite/test_areas/compile.yaml
 - label: Fusion and Compile Unit Tests (2xMI325 GPUs)
   timeout_in_minutes: 20
@@ -1750,7 +1761,8 @@ steps:
   # Run all models and attn backends but only Inductor partition and native custom ops
   - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
   # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
+  # TODO: Qwen uses group quantization, which the pattern matcher on ROCm does not support yet. 
+ # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Config Sweep (MI325) @@ -1771,10 +1783,57 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" -# corresponds to .buildkite/test_areas/kernels.yaml -# Skip the following tests as they are not supported on ROCm -# - label: Fusion E2E TP2 AR-RMS Config Sweep (H100) -# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100) -# - label: Fusion E2E TP2 (B200) +- label: Fusion E2E TP2 Quick (MI325) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: mi325_1 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: mi325_2 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run just llama3 (fp8 & bf16) for all config 
combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" + +- label: Fusion E2E TP2 (MI325) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: mi325_2 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index 525ed1b515bc..77cb1b4d3ad9 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -43,15 +43,16 @@ ar_rms_fusion=n_layers * 2 + 1, sequence_parallel=n_layers * 2 + 1, async_tp=n_layers * 4, - ), + ) ) if current_platform.is_cuda() - else ( # ROCm matches + # ROCm matches + else ( lambda n_layers: Matches( ar_rms_fusion=0, - sequence_parallel=0, - async_tp=0, - ), + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) ), ) @@ -68,15 +69,16 @@ ) ) if current_platform.is_cuda() - else ( # ROCm matches + # ROCm matches + else ( lambda n_layers: Matches( rms_quant_fusion=n_layers * 2, act_quant_fusion=n_layers, attn_quant_fusion=n_layers, ar_rms_fusion=0, - sequence_parallel=0, - async_tp=0, - ), + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) ), ) @@ -93,15 +95,16 @@ ) ) if current_platform.is_cuda() - else ( # ROCm matches + # ROCm matches + else ( lambda n_layers: Matches( rms_quant_fusion=0, act_quant_fusion=n_layers, attn_quant_fusion=n_layers, ar_rms_fusion=0, - sequence_parallel=0, - async_tp=0, - ), + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) ), ) @@ -113,45 +116,93 @@ llama4_scout_fp8 = ModelFusionInfo( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=lambda 
n_layers: {"text_config": {"num_hidden_layers": n_layers}}, - matches=lambda n_layers: Matches( - rms_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + sequence_parallel=n_layers * 2, + ) ), ) llama4_scout_fp4 = ModelFusionInfo( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4", hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}}, - matches=lambda n_layers: Matches( - rms_quant_fusion=0, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=0, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + rms_quant_fusion=0, + attn_quant_fusion=n_layers, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ) ), ) qwen3_a3b = ModelFusionInfo( model_name="Qwen/Qwen3-30B-A3B", - matches=lambda n_layers: Matches( - norm_rope_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + matches=( + lambda n_layers: Matches( + norm_rope_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + norm_rope_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) ), ) 
qwen3_a3b_fp8 = ModelFusionInfo( model_name="Qwen/Qwen3-30B-A3B-FP8", - matches=lambda n_layers: Matches( - rms_quant_fusion=n_layers, - norm_rope_fusion=n_layers, - attn_quant_fusion=0, # attn + group quant not supported - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + norm_rope_fusion=n_layers, + attn_quant_fusion=0, # attn + group quant not supported + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + norm_rope_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) ), ) diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py index 4769ca1e0b63..fb743c1ba7d3 100644 --- a/tests/compile/fusions_e2e/test_tp2_async_tp.py +++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py @@ -5,6 +5,7 @@ import pytest from vllm.config import PassConfig +from vllm.platforms import current_platform from ...utils import multi_gpu_test from .common import ( @@ -16,6 +17,8 @@ ) from .models import ( FLASHINFER_ATTN, + ROCM_AITER_UNIFIED_ATTN, + ROCM_ATTN, TRITON_ATTN, llama3_8b, llama3_8b_fp8, @@ -29,9 +32,14 @@ "model_name, matches_fn, model_kwargs, hf_overrides", [llama3_8b_fp8, llama4_scout_fp8], ) -@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) +@pytest.mark.parametrize( + "attn_backend", + [TRITON_ATTN, FLASHINFER_ATTN] + if current_platform.is_cuda() + else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], +) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) +@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("quant_fp8", "rms_norm"))) @pytest.mark.parametrize("inductor_graph_partition", 
INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fp8_fusions( model_name: str, @@ -96,7 +104,7 @@ def test_tp2_async_tp_fp8_fusions( ) @pytest.mark.parametrize("attn_backend", [TRITON_ATTN]) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) +@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("rms_norm"))) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fusions( model_name: str, From 119b4b01b345e17264c84d37f2acd060e6a5448a Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 10:18:50 +0000 Subject: [PATCH 06/45] fix tests in fusion silu_mul and tidy up kite Signed-off-by: vllmellm --- .buildkite/test-amd.yaml | 4 ++++ .../passes/test_silu_mul_quant_fusion.py | 19 +++++++------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 64a878baa774..33ba6689faa5 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1695,6 +1695,7 @@ steps: - label: AsyncTP Correctness Tests (2xMI325 GPUs) timeout_in_minutes: 50 working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] device: mi325_2 optional: true num_devices: 2 @@ -1789,6 +1790,7 @@ steps: - label: Fusion E2E TP2 Quick (MI325) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] device: mi325_1 num_devices: 2 source_file_dependencies: @@ -1806,6 +1808,7 @@ steps: - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] device: mi325_2 num_devices: 2 source_file_dependencies: @@ -1823,6 +1826,7 @@ steps: - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" - label: Fusion E2E TP2 (MI325) + mirror_hardwares: [amdexperimental, amdproduction, tj] timeout_in_minutes: 40 working_dir: 
"/vllm-workspace/" device: mi325_2 diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index c6794a156240..abd32c38ca04 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Any import pytest import torch @@ -10,7 +9,7 @@ from tests.compile.backend import TestBackend from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor from tests.utils import TestFP8Layer -from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops +from vllm._aiter_ops import IS_AITER_FOUND from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.compilation.passes.fusion.act_quant_fusion import ( FUSED_OPS, @@ -184,13 +183,6 @@ def ops_in_model_after(self): PerTensorTorchFP8ScaledMMLinearKernel, ] TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS -EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [] -# SiluMulGroupFp8Quant is only supported on ROCm -if current_platform.is_rocm(): - EXTENDED_TESTCASES.append((TestSiluMulGroupFp8QuantModel, True, None)) - -if current_platform.is_cuda(): - EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None)) @pytest.mark.parametrize("num_tokens", [32, 64]) @@ -200,7 +192,10 @@ def ops_in_model_after(self): @pytest.mark.parametrize( "model_class, enable_quant_fp8_custom_op, force_kernel", list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) - + EXTENDED_TESTCASES, + + [ + (TestSiluMulNvfp4QuantModel, False, None), + (TestSiluMulGroupFp8QuantModel, True, None), + ], ) @pytest.mark.skipif( envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm" @@ -246,14 +241,14 @@ def test_fusion_silu_and_mul_quant( with set_current_vllm_config(config), 
monkeypatch.context() as m: fusion_passes = [ActivationQuantFusionPass(config)] - if current_platform.is_rocm() and IS_AITER_FOUND: + if IS_AITER_FOUND and model_class is TestSiluMulGroupFp8QuantModel: + from vllm._aiter_ops import rocm_aiter_ops from vllm.compilation.passes.fusion.rocm_aiter_fusion import ( RocmAiterSiluMulFp8GroupQuantFusionPass, ) m.setenv("VLLM_ROCM_USE_AITER", "1") rocm_aiter_ops.refresh_env_variables() - fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)] From 218fcfb221df513d2c000e3e133e27f9e6f1d010 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 10:25:11 +0000 Subject: [PATCH 07/45] remove unnecessary change Signed-off-by: vllmellm --- vllm/model_executor/layers/quantization/input_quant_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index ed3b981cf183..5bc78afa43b0 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -171,7 +171,6 @@ def forward_native( x: torch.Tensor, scale: torch.Tensor | None = None, scale_ub: torch.Tensor | None = None, - **kwargs, ): if self.is_group_quant and not self.static: assert scale is None, "Dynamic group quantization does not use scale" From befaba1bc95b823ac36b0091632b206b8c6faa76 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 13:46:06 +0000 Subject: [PATCH 08/45] remove duplicate Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 33ba6689faa5..149767bdd06b 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1721,18 +1721,6 @@ steps: - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py - 
tests/compile/fullgraph/test_full_graph.py - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/ - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes - - tests/compile/passes/test_fusion_attn.py - - tests/compile/passes/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py commands: - rocm-smi # we run all backend tests on ROCm From ca801a13e3d9dc9c37bb1e429a7134cc4e28e58a Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 15:16:30 +0000 Subject: [PATCH 09/45] need to add quote Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 149767bdd06b..8d9c6eb62db0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -620,7 +620,7 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - "pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'" # # Limit to no custom ops to reduce running time # # Wrap with quotes to escape yaml and avoid starting -k string with a - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" @@ -1689,7 +1689,7 @@ steps: - tests/compile/correctness_e2e/test_sequence_parallel.py commands: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py + - "pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py" # corresponds to .buildkite/test_areas/compile.yaml - label: AsyncTP Correctness Tests (2xMI325 GPUs) @@ -1701,7 
+1701,7 @@ steps: num_devices: 2 commands: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py + - "pytest -v -s tests/compile/correctness_e2e/test_async_tp.py" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion and Compile Unit Tests (2xMI325 GPUs) @@ -1724,8 +1724,8 @@ steps: commands: - rocm-smi # we run all backend tests on ROCm - - pytest -v -s tests/compile/passes/test_fusion_attn.py - - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py + - "pytest -v -s tests/compile/passes/test_fusion_attn.py" + - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" # TODO: this test is not supported on ROCm, there are aiter kernels for this. # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) @@ -1748,7 +1748,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet. 
# - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" @@ -1772,8 +1772,8 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"' + - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"' - label: Fusion E2E TP2 Quick (MI325) timeout_in_minutes: 40 @@ -1790,7 +1790,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) @@ -1811,7 +1811,7 @@ steps: commands: - rocm-smi # Run just llama3 (fp8 & bf16) for all config combinations - - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"' - label: Fusion E2E TP2 (MI325) mirror_hardwares: [amdexperimental, amdproduction, tj] @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - 'pytest -v -s 
vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' From 0b65174b5ad3074344ae519852d28561bd155f46 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 15:47:57 +0000 Subject: [PATCH 10/45] fix syntax Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 8d9c6eb62db0..439075c6a843 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1748,7 +1748,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet. 
# - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" @@ -1772,8 +1772,8 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations - - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"' - - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'" + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" - label: Fusion E2E TP2 Quick (MI325) timeout_in_minutes: 40 @@ -1790,7 +1790,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) @@ -1811,7 +1811,7 @@ steps: commands: - rocm-smi # Run just llama3 (fp8 & bf16) for all config combinations - - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - label: Fusion E2E TP2 (MI325) mirror_hardwares: [amdexperimental, amdproduction, tj] @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - 'pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' + - "pytest -v -s 
vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" From be40a224ccc64d072f2664d15b58eee9fe46c4b2 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 16:49:52 +0000 Subject: [PATCH 11/45] fix Fusion E2E TP2 (MI325) path Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 439075c6a843..407a3d671803 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1814,7 +1814,7 @@ steps: - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - label: Fusion E2E TP2 (MI325) - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction, tj, tj2] timeout_in_minutes: 40 working_dir: "/vllm-workspace/" device: mi325_2 @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" From d8d071254ffbafcabb69fd9221e08a755298e7e5 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 02:48:27 +0000 Subject: [PATCH 12/45] fix test-amd syntax Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 407a3d671803..8d469a39b042 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1696,7 +1696,7 @@ steps: timeout_in_minutes: 50 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - device: mi325_2 + agent_pool: mi325_2 optional: true num_devices: 2 commands: @@ -1750,7 +1750,7 @@ steps: # Run 
all models and attn backends but only Inductor partition and native custom ops - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported - # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet. + # TODO: Qwen uses group quantization which the pattern matcher on ROCm is not supported yet. # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" # corresponds to .buildkite/test_areas/compile.yaml @@ -1779,7 +1779,7 @@ steps: timeout_in_minutes: 40 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - device: mi325_1 + agent_pool: mi325_1 num_devices: 2 source_file_dependencies: - csrc/quantization/ @@ -1790,14 +1790,14 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - device: mi325_2 + agent_pool: mi325_2 num_devices: 2 source_file_dependencies: - csrc/quantization/ @@ -1811,13 +1811,13 @@ steps: commands: - rocm-smi # Run just llama3 (fp8 & bf16) for all config combinations - - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" + - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - label: Fusion E2E TP2 (MI325) mirror_hardwares: [amdexperimental, amdproduction, tj, tj2] timeout_in_minutes: 40 working_dir: 
"/vllm-workspace/" - device: mi325_2 + agent_pool: mi325_2 num_devices: 2 source_file_dependencies: - csrc/quantization/ @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" From f58033a22e7088f79830ea5f99a146c60b89b8be Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 04:15:47 +0000 Subject: [PATCH 13/45] revert pytorch tests Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 0507c617745a..1329af10973c 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -569,12 +569,9 @@ steps: --ignore=lora/test_gptoss_tp.py \ --ignore=lora/test_qwen3moe_tp.py parallelism: 4 - -##### .buildkite/test_areas/pytorch.yaml ##### -# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -588,14 +585,10 @@ steps: # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - # TODO: clean up this comment if not needed. It is used to - # keep track of the tests changes during vLLM IR Ops refactoring. 
- - pytest -s -v compile/passes --ignore compile/passes/distributed -# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction, tj] + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -609,10 +602,9 @@ steps: # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" -# corresponds to .buildkite/test_areas/pytorch.yaml -- label: PyTorch Fullgraph # 27min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, tj] +- label: PyTorch Fullgraph Test # 27min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -620,7 +612,7 @@ steps: - vllm/ - tests/compile commands: - - "pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'" + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # # Limit to no custom ops to reduce running time # # Wrap with quotes to escape yaml and avoid starting -k string with a - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" @@ -1529,20 +1521,17 @@ steps: num_gpus: 2 commands: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - # ================= 24 passed, 11 warnings in 192.85s (0:03:12) ================== - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - # ================== 48 passed, 8 warnings in 386.41s (0:06:26) ================== - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - # ======================== 8 skipped, 9 warnings in 2.08s ======================== + # TODO: this test is not 
supported on ROCm, there are aiter kernels for this. + # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - pytest -v -s tests/distributed/test_context_parallel.py - # ======================== 4 passed, 3 warnings in 30.45s ======================== - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - - pytest -v -s tests/v1/distributed/test_dbo.py - # ======================== 2 skipped, 3 warnings in 1.97s ======================== + # TODO: this test is not supported on ROCm + # - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### - label: Distributed Tests (B200) # optional From eabee32b32f103afd0ec64a0fb08045b8357e288 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 05:01:29 +0000 Subject: [PATCH 14/45] fix agent pool Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 2 +- tests/compile/passes/test_silu_mul_quant_fusion.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 1329af10973c..02f91563ed00 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1768,7 +1768,7 @@ steps: timeout_in_minutes: 40 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_1 + agent_pool: mi325_2 num_devices: 2 source_file_dependencies: - csrc/quantization/ diff --git 
a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index abd32c38ca04..f6d5e112dd12 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -148,9 +148,6 @@ def __init__(self, hidden_size: int, **kwargs): weight_group_shape=GroupShape(128, 128), act_quant_group_shape=GroupShape(1, 128), cutlass_block_fp8_supported=False, - # this parameter cannot always be True, - # it depends on the VLLM_ROCM_USE_AITER - # and VLLM_ROCM_USE_AITER_LINEAR environment variables use_aiter_and_is_supported=True, ) self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() @@ -185,6 +182,7 @@ def ops_in_model_after(self): TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS + @pytest.mark.parametrize("num_tokens", [32, 64]) @pytest.mark.parametrize("hidden_size", [128, 256]) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @@ -194,6 +192,8 @@ def ops_in_model_after(self): list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) + [ (TestSiluMulNvfp4QuantModel, False, None), + # GroupFP8Quant fusion only works with AITER on ROCm. + # and the enable_quant_fp8_custom_op must be True. 
(TestSiluMulGroupFp8QuantModel, True, None), ], ) From 56ac061c19626b03916a6b200f346092d1d4ba32 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 07:56:41 +0000 Subject: [PATCH 15/45] add fix test_full_graph Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 53 +--------------------- tests/compile/fullgraph/test_full_graph.py | 8 ++-- tests/compile/fusions_e2e/models.py | 2 + 3 files changed, 6 insertions(+), 57 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 02f91563ed00..b3b18b0b1b53 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1719,7 +1719,7 @@ steps: # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) # TODO: this test is not supported on ROCm - # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Quick (MI325) @@ -1765,56 +1765,5 @@ steps: - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" - label: Fusion E2E TP2 Quick (MI325) - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_2 - num_devices: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/compilation/ - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" - -# corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E 
TP2 AsyncTP Config Sweep (MI325) - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_2 - num_devices: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run just llama3 (fp8 & bf16) for all config combinations - - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - - label: Fusion E2E TP2 (MI325) - mirror_hardwares: [amdexperimental, amdproduction, tj, tj2] - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - agent_pool: mi325_2 - num_devices: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/compilation/ - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 921f57cea0a6..447391903314 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -205,11 +205,9 @@ def test_custom_compile_config( ] if current_platform.is_cuda() else [ - # ("Qwen/Qwen2-0.5B", None), # Standard attention model - # ( - # "deepseek-ai/DeepSeek-V2-Lite", - # AttentionBackendEnum.TRITON_MLA, - # ), # MLA (Multi-head Latent Attention) model + ("Qwen/Qwen2-0.5B", None), # Standard attention model + # AttentionBackendEnum.TRITON_MLA does not support + # fp8 kv scale compile. 
( "deepseek-ai/DeepSeek-V2-Lite", AttentionBackendEnum.ROCM_AITER_MLA, diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index 77cb1b4d3ad9..ffca5bc6c0a5 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -131,7 +131,9 @@ lambda n_layers: Matches( rms_quant_fusion=n_layers, attn_quant_fusion=n_layers, + ar_rms_fusion=0, sequence_parallel=n_layers * 2, + async_tp=0 ) ), ) From b8c0bcdff3aa3baf4b6b2c99feab6d1db7f066d7 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 07:58:25 +0000 Subject: [PATCH 16/45] remove unrelated comment Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index b3b18b0b1b53..766fc76b09b3 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1717,8 +1717,6 @@ steps: - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" # TODO: this test is not supported on ROCm, there are aiter kernels for this. 
# - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - # TODO: this test is not supported on ROCm - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile # corresponds to .buildkite/test_areas/compile.yaml From 9ef71e4de064fc1fc459f668dc99a223fd34a93d Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 08:04:37 +0000 Subject: [PATCH 17/45] reduce test and compute resource Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 766fc76b09b3..263555e219cf 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1680,24 +1680,12 @@ steps: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - "pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py" -# corresponds to .buildkite/test_areas/compile.yaml -- label: AsyncTP Correctness Tests (2xMI325 GPUs) - timeout_in_minutes: 50 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_2 - optional: true - num_devices: 2 - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - "pytest -v -s tests/compile/correctness_e2e/test_async_tp.py" - # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_2 + agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs source_file_dependencies: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/ @@ -1762,6 +1750,7 @@ steps: - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'" - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not 
+rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" -- label: Fusion E2E TP2 Quick (MI325) -- label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) -- label: Fusion E2E TP2 (MI325) +# - label: AsyncTP Correctness Tests (2xMI325 GPUs) +# - label: Fusion E2E TP2 Quick (MI325) +# - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) +# - label: Fusion E2E TP2 (MI325) From 158ea2fc6d1bdf3f88eacf52a06d8597a9be8cdf Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 10:58:55 +0000 Subject: [PATCH 18/45] skip kvcache tests and reverted the changes in test_full_graph Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 3 +- tests/compile/fullgraph/test_full_graph.py | 43 +++++----------------- 2 files changed, 12 insertions(+), 34 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 263555e219cf..cb5f10295409 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1705,7 +1705,8 @@ steps: - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" # TODO: this test is not supported on ROCm, there are aiter kernels for this. 
# - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + # TODO: find out more details + # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Quick (MI325) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 447391903314..cf16e7fffd40 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -10,7 +10,6 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm._aiter_ops import rocm_aiter_ops from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -202,46 +201,24 @@ def test_custom_compile_config( "deepseek-ai/DeepSeek-V2-Lite", AttentionBackendEnum.FLASHINFER_MLA, ), # MLA (Multi-head Latent Attention) model - ] - if current_platform.is_cuda() - else [ - ("Qwen/Qwen2-0.5B", None), # Standard attention model - # AttentionBackendEnum.TRITON_MLA does not support - # fp8 kv scale compile. 
- ( - "deepseek-ai/DeepSeek-V2-Lite", - AttentionBackendEnum.ROCM_AITER_MLA, - ), # MLA (Multi-head Latent Attention) model - ( - "deepseek-ai/DeepSeek-V2-Lite", - AttentionBackendEnum.ROCM_AITER_TRITON_MLA, - ), # MLA (Multi-head Latent Attention) model - ] + ], ), ) def test_fp8_kv_scale_compile( compilation_mode: int, model: str, backend: AttentionBackendEnum | None, - monkeypatch: pytest.MonkeyPatch, ): - with monkeypatch.context() as m: - model_kwargs = { - "quantization": "fp8", - "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", - "calculate_kv_scales": True, - "max_model_len": 512, - } - if backend: - model_kwargs["attention_config"] = {"backend": backend.name} - if current_platform.is_rocm(): - m.setenv("VLLM_ROCM_USE_AITER", "1") - # Disable Aiter MOE as some shapes are not supported - m.setenv("VLLM_ROCM_USE_AITER_MOE", "0") + model_kwargs = { + "quantization": "fp8", + "kv_cache_dtype": "fp8_e4m3", + "calculate_kv_scales": True, + "max_model_len": 512, + } + if backend: + model_kwargs["attention_config"] = {"backend": backend.name} - rocm_aiter_ops.refresh_env_variables() - - run_model(compilation_mode, model, **model_kwargs) + run_model(compilation_mode, model, **model_kwargs) def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): From 099766164aede247b5e52344466af1f95b60f433 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 13:25:14 +0000 Subject: [PATCH 19/45] remove tj marker Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index cb5f10295409..1aad50efb39a 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1667,7 +1667,7 @@ steps: - label: Sequence Parallel Correctness Tests (2xMI325 GPUs) timeout_in_minutes: 50 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, 
amdproduction] agent_pool: mi325_2 num_devices: 2 source_file_dependencies: @@ -1684,7 +1684,7 @@ steps: - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs source_file_dependencies: - csrc/quantization/fp4/ @@ -1712,7 +1712,7 @@ steps: - label: Fusion E2E Quick (MI325) timeout_in_minutes: 15 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 num_devices: 1 source_file_dependencies: @@ -1733,7 +1733,7 @@ steps: - label: Fusion E2E Config Sweep (MI325) timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 num_devices: 1 source_file_dependencies: From 6891c605398ac698320fe18cd4a350c2a5ebb447 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 13:45:15 +0000 Subject: [PATCH 20/45] fix syntax Signed-off-by: tjtanaa --- tests/compile/fullgraph/test_full_graph.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index cf16e7fffd40..ed4c92d90ff7 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -194,15 +194,13 @@ def test_custom_compile_config( ) @pytest.mark.parametrize( "model, backend", - ( - [ - ("Qwen/Qwen2-0.5B", None), # Standard attention model - ( - "deepseek-ai/DeepSeek-V2-Lite", - AttentionBackendEnum.FLASHINFER_MLA, - ), # MLA (Multi-head Latent Attention) model - ], - ), + [ + ("Qwen/Qwen2-0.5B", None), # Standard attention model + ( + 
"deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.FLASHINFER_MLA, + ), # MLA (Multi-head Latent Attention) model + ], ) def test_fp8_kv_scale_compile( compilation_mode: int, From 1e8fe872380933966839600cd06ecad894304948 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 13:48:27 +0000 Subject: [PATCH 21/45] add skip marker Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp2_ar_rms.py | 3 +++ tests/compile/fusions_e2e/test_tp2_async_tp.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/tests/compile/fusions_e2e/test_tp2_ar_rms.py b/tests/compile/fusions_e2e/test_tp2_ar_rms.py index 18b19565c1fc..ab4aefcaf79a 100644 --- a/tests/compile/fusions_e2e/test_tp2_ar_rms.py +++ b/tests/compile/fusions_e2e/test_tp2_ar_rms.py @@ -5,6 +5,7 @@ import pytest from vllm.config import PassConfig +from vllm.platforms import current_platform from ...utils import multi_gpu_test from .common import ( @@ -26,6 +27,8 @@ qwen3_a3b_fp8, ) +pytestmark = pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA") + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize( diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py index fb743c1ba7d3..88c3dc8192a5 100644 --- a/tests/compile/fusions_e2e/test_tp2_async_tp.py +++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py @@ -26,6 +26,8 @@ qwen3_a3b, ) +pytestmark = pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA") + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize( From b81b0f92a0d060a3bbae05a5854906e08cfacdfe Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 13:52:07 +0000 Subject: [PATCH 22/45] revert test Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp1_quant.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 25a607051a89..294a91e98a8c 100644 --- 
a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -22,6 +22,7 @@ llama3_8b_fp4, llama3_8b_fp8, llama4_scout_fp4, + llama4_scout_fp8, qwen3_a3b_fp8, ) @@ -30,9 +31,9 @@ "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm", [ (*llama3_8b_fp8, False), - # (*llama4_scout_fp8, False), + (*llama4_scout_fp8, False), (*qwen3_a3b_fp8, False), - # (*qwen3_a3b_fp8, True), + (*qwen3_a3b_fp8, True), ], ) @pytest.mark.parametrize( From 0326f765d1ae189883c46a0f3836d356ef02e018 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 14:22:05 +0000 Subject: [PATCH 23/45] fix the test case, amd cannot run nvidia model Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 2 +- tests/compile/fusions_e2e/test_tp1_quant.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 1aad50efb39a..7ec9ccbf74cf 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1724,7 +1724,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "oh" # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # TODO: Qwen uses group quantization which the pattern matcher on ROCm is not supported yet. 
# - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 294a91e98a8c..5b167d6dc7cd 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -31,10 +31,16 @@ "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm", [ (*llama3_8b_fp8, False), - (*llama4_scout_fp8, False), (*qwen3_a3b_fp8, False), (*qwen3_a3b_fp8, True), - ], + ] + + ( + [ + (*llama4_scout_fp8, False), + ] + if current_platform.is_cuda() + else [] + ), ) @pytest.mark.parametrize( "attn_backend", From 9001be5b46001070f8f78fd89e774de773980409 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 14:26:07 +0000 Subject: [PATCH 24/45] remove sequence parallel test Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 7ec9ccbf74cf..d1417501cfbd 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1663,23 +1663,6 @@ steps: # which affects fusion passes on ROCm. So we have to # enable them as as soon as possible. 
-# corresponds to .buildkite/test_areas/compile.yaml -- label: Sequence Parallel Correctness Tests (2xMI325 GPUs) - timeout_in_minutes: 50 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 - num_devices: 2 - source_file_dependencies: - - vllm/model_executor/layers/ - - vllm/compilation/ - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - - tests/compile/correctness_e2e/test_sequence_parallel.py - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - "pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py" - # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 @@ -1751,7 +1734,12 @@ steps: - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'" - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" +## There are no ops on ROCm for these tests. +## The test still passes but the logs are not useful. 
+## fused ops just call torch.ops.symm_mem which +## exists in ROCm even though they don't work # - label: AsyncTP Correctness Tests (2xMI325 GPUs) # - label: Fusion E2E TP2 Quick (MI325) # - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) # - label: Fusion E2E TP2 (MI325) +# - label: Sequence Parallel Correctness Tests (2xMI325 GPUs) From ca222af52c0e052d33c0a61f73e44ebbd6f145f1 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 14:27:50 +0000 Subject: [PATCH 25/45] skip sequence parallel on non-cuda Signed-off-by: tjtanaa --- tests/compile/passes/distributed/test_sequence_parallelism.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/compile/passes/distributed/test_sequence_parallelism.py b/tests/compile/passes/distributed/test_sequence_parallelism.py index 46363a9a4a44..a793d68522b4 100644 --- a/tests/compile/passes/distributed/test_sequence_parallelism.py +++ b/tests/compile/passes/distributed/test_sequence_parallelism.py @@ -36,6 +36,8 @@ from vllm.utils.system_utils import update_environment_variables from vllm.utils.torch_utils import set_random_seed +pytestmark = pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA") + FP8_DTYPE = current_platform.fp8_dtype() prompts = [ "Hello, my name is", From 06b0aca4f2f4499a6bc470c5b6eb393381922a17 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 15:39:05 +0000 Subject: [PATCH 26/45] fix the test_config_generation.py Signed-off-by: tjtanaa --- tests/config/test_config_generation.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/config/test_config_generation.py b/tests/config/test_config_generation.py index 225ac0f2226d..a235e1a20a88 100644 --- a/tests/config/test_config_generation.py +++ b/tests/config/test_config_generation.py @@ -80,12 +80,13 @@ def create_config(): ray.shutdown() -def test_unrecognized_env(): - import os +def test_unrecognized_env(monkeypatch): + # Remove any existing VLLM env vars that might interfere + 
monkeypatch.delenv("VLLM_TEST_GROUP_NAME", raising=False) # Test that if fail_on_environ_validation is True, then an error # is raised when an unrecognized vLLM environment variable is set - os.environ["VLLM_UNRECOGNIZED_ENV_VAR"] = "some_value" + monkeypatch.setenv("VLLM_UNRECOGNIZED_ENV_VAR", "some_value") engine_args = EngineArgs( fail_on_environ_validation=True, ) @@ -97,7 +98,7 @@ def test_unrecognized_env(): engine_args.create_engine_config() # Test that when the unrecognized env var is removed, no error is raised - os.environ.pop("VLLM_UNRECOGNIZED_ENV_VAR", None) + monkeypatch.delenv("VLLM_UNRECOGNIZED_ENV_VAR") engine_args = EngineArgs( fail_on_environ_validation=True, ) From 676184e2d2a5365742ba0029d267f41dee642ada Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 12 Feb 2026 03:13:03 +0000 Subject: [PATCH 27/45] fix test_configuration Signed-off-by: tjtanaa --- tests/config/test_config_generation.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/config/test_config_generation.py b/tests/config/test_config_generation.py index a235e1a20a88..c7edf2b97174 100644 --- a/tests/config/test_config_generation.py +++ b/tests/config/test_config_generation.py @@ -81,8 +81,14 @@ def create_config(): def test_unrecognized_env(monkeypatch): - # Remove any existing VLLM env vars that might interfere - monkeypatch.delenv("VLLM_TEST_GROUP_NAME", raising=False) + import os + + from vllm.envs import environment_variables + + # Remove any existing unrecognized VLLM env vars that might interfere + for env in list(os.environ): + if env.startswith("VLLM_") and env not in environment_variables: + monkeypatch.delenv(env, raising=False) # Test that if fail_on_environ_validation is True, then an error # is raised when an unrecognized vLLM environment variable is set From b566461790559b6f1bcba96b3377a2483044980c Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 12 Feb 2026 03:17:06 +0000 Subject: [PATCH 28/45] fix the qwen3 e2e fusion pass on ROCm + 
AITER Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 7 +++---- tests/compile/fusions_e2e/common.py | 4 ++++ tests/compile/fusions_e2e/conftest.py | 5 +++++ tests/compile/fusions_e2e/models.py | 13 ++++++++++--- tests/compile/fusions_e2e/test_tp1_quant.py | 8 +++++++- vllm/compilation/passes/fusion/rocm_aiter_fusion.py | 4 +++- 6 files changed, 32 insertions(+), 9 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index d1417501cfbd..53cf6ef530a9 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1707,10 +1707,9 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "oh" - # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported - # TODO: Qwen uses group quantization which the pattern matcher on ROCm is not supported yet. - # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Config Sweep (MI325) diff --git a/tests/compile/fusions_e2e/common.py b/tests/compile/fusions_e2e/common.py index 284a9d66b957..2c6dc2b3ebbc 100644 --- a/tests/compile/fusions_e2e/common.py +++ b/tests/compile/fusions_e2e/common.py @@ -13,6 +13,7 @@ class Matches(NamedTuple): # simple pointwise + aiter_rms_quant_fusion: int = 0 rms_quant_fusion: int = 0 act_quant_fusion: int = 0 norm_rope_fusion: int = 0 @@ -82,6 +83,9 @@ def has_cuda_graph_wrapper_metadata() -> bool: ] FUSION_LOG_PATTERNS: dict[str, re.Pattern] = { + "aiter_rms_quant_fusion": re.compile( + 
r"RocmAiterRMSNormQuantFusionPass Replaced (\d+) patterns" + ), "rms_quant_fusion": re.compile(r"rms_quant_fusion.py:\d+] Replaced (\d+) patterns"), "act_quant_fusion": re.compile(r"act_quant_fusion.py:\d+] Replaced (\d+) patterns"), "norm_rope_fusion": re.compile( diff --git a/tests/compile/fusions_e2e/conftest.py b/tests/compile/fusions_e2e/conftest.py index 1d9f6cda9fd6..3dce32f9b404 100644 --- a/tests/compile/fusions_e2e/conftest.py +++ b/tests/compile/fusions_e2e/conftest.py @@ -63,9 +63,14 @@ def run( compilation_config: dict, matches_check: list[str], use_deepgemm: bool = False, + use_aiter: bool = False, tp_size: int = 1, ): monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1" if use_deepgemm else "0") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1" if use_aiter else "0") + from vllm._aiter_ops import rocm_aiter_ops + + rocm_aiter_ops.refresh_env_variables() # Disable, compile cache to make sure custom passes run. # Otherwise, we can't verify fusion happened through the logs. diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index ffca5bc6c0a5..c459791f40f6 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -133,7 +133,7 @@ attn_quant_fusion=n_layers, ar_rms_fusion=0, sequence_parallel=n_layers * 2, - async_tp=0 + async_tp=0, ) ), ) @@ -200,8 +200,15 @@ # ROCm matches else ( lambda n_layers: Matches( - rms_quant_fusion=n_layers, - norm_rope_fusion=n_layers, + aiter_rms_quant_fusion=n_layers, + rms_quant_fusion=0, + # TODO: Allow use to set back n_layers, + # On ROCm norm_rope_fusion is only supported without + # enabling AITER. + # when we are running the tests in + # tests/compile/fusions_e2e/test_tp1_quant.py + # we are enabling AITER, so no fusion happens. 
+ norm_rope_fusion=0, ar_rms_fusion=0, sequence_parallel=n_layers * 2 + 1, async_tp=n_layers * 2, diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 5b167d6dc7cd..b8d79102e82c 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -32,11 +32,11 @@ [ (*llama3_8b_fp8, False), (*qwen3_a3b_fp8, False), - (*qwen3_a3b_fp8, True), ] + ( [ (*llama4_scout_fp8, False), + (*qwen3_a3b_fp8, True), # only supported on CUDA ] if current_platform.is_cuda() else [] @@ -92,6 +92,8 @@ def test_tp1_fp8_fusions( ), ) + use_aiter = current_platform.is_rocm() and ("qwen" in model_name.lower()) + matches_check = [ "rms_quant_fusion", "act_quant_fusion", @@ -99,6 +101,9 @@ def test_tp1_fp8_fusions( "attn_quant_fusion", ] + if use_aiter: + matches_check.append("aiter_rms_quant_fusion") + run_e2e_fusion_test( model_name, matches, @@ -107,6 +112,7 @@ def test_tp1_fp8_fusions( compilation_config, matches_check, use_deepgemm=use_deepgemm, + use_aiter=use_aiter, ) diff --git a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py index 99278365c5db..7d67f1bb8c01 100644 --- a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py +++ b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py @@ -312,7 +312,9 @@ def __init__(self, config: VllmConfig) -> None: @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph) -> None: self.matched_count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns", self.matched_count) + logger.debug( + "%s Replaced %s patterns", self.__class__.__name__, self.matched_count + ) def uuid(self) -> str: fusion_patterns = [ From 24a142d31f8f9229d5b0b0e2f3e5f886330eb20f Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 12 Feb 2026 03:56:24 +0000 Subject: [PATCH 29/45] fix pytest command Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 
deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 584a89a4ea78..37a65c9b2707 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -569,9 +569,12 @@ steps: --ignore=lora/test_gptoss_tp.py \ --ignore=lora/test_qwen3moe_tp.py parallelism: 4 + +##### .buildkite/test_areas/pytorch.yaml ##### +# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -585,6 +588,10 @@ steps: # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # TODO: clean up this comment if not needed. It is used to + # keep track of the tests changes during vLLM IR Ops refactoring. + # Use `find` to launch multiple instances of pytest. 
+ - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -1667,7 +1674,7 @@ steps: - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs source_file_dependencies: - csrc/quantization/fp4/ @@ -1695,7 +1702,7 @@ steps: - label: Fusion E2E Quick (MI325) timeout_in_minutes: 15 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 num_devices: 1 source_file_dependencies: @@ -1715,7 +1722,7 @@ steps: - label: Fusion E2E Config Sweep (MI325) timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 num_devices: 1 source_file_dependencies: @@ -1730,8 +1737,7 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'" - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" ## There are no ops on ROCm for these tests. ## The test still passes but the logs are not useful. 
From 7e2cca4f1e19bf60b60e6eb196e563340e1b5ef Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 12 Feb 2026 04:02:53 +0000 Subject: [PATCH 30/45] fix pre-commit Signed-off-by: tjtanaa --- tests/compile/passes/test_silu_mul_quant_fusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index f6d5e112dd12..f18de1ac13d6 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -182,7 +182,6 @@ def ops_in_model_after(self): TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS - @pytest.mark.parametrize("num_tokens", [32, 64]) @pytest.mark.parametrize("hidden_size", [128, 256]) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) From 089969c172825ad0c142d1cb72a4edb2dab3e8a7 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 12 Feb 2026 04:15:34 +0000 Subject: [PATCH 31/45] fix the model config Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/models.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index c459791f40f6..e600daaf739d 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -49,9 +49,9 @@ # ROCm matches else ( lambda n_layers: Matches( - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -75,9 +75,9 @@ rms_quant_fusion=n_layers * 2, act_quant_fusion=n_layers,
attn_quant_fusion=n_layers, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -131,9 +131,9 @@ lambda n_layers: Matches( rms_quant_fusion=n_layers, attn_quant_fusion=n_layers, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2, - async_tp=0, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -156,8 +156,8 @@ lambda n_layers: Matches( rms_quant_fusion=0, attn_quant_fusion=n_layers, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -177,9 +177,9 @@ else ( lambda n_layers: Matches( norm_rope_fusion=n_layers, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -209,9 +209,9 @@ # tests/compile/fusions_e2e/test_tp1_quant.py # we are enabling AITER, so no fusion happens. 
norm_rope_fusion=0, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) From 5d0539803e028a7d2f09b7c910f85de0a27e5372 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 13 Feb 2026 07:44:03 +0000 Subject: [PATCH 32/45] remove experimental flag Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 8 ++++---- tests/compile/fusions_e2e/test_tp2_async_tp.py | 13 +++---------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 37a65c9b2707..83bd308631ed 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -574,7 +574,7 @@ steps: # corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -1674,7 +1674,7 @@ steps: - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs source_file_dependencies: - csrc/quantization/fp4/ @@ -1702,7 +1702,7 @@ steps: - label: Fusion E2E Quick (MI325) timeout_in_minutes: 15 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 num_devices: 1 source_file_dependencies: @@ -1722,7 +1722,7 @@ steps: - label: Fusion E2E Config Sweep (MI325) timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: 
mi325_1 num_devices: 1 source_file_dependencies: diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py index 88c3dc8192a5..35277ebe8350 100644 --- a/tests/compile/fusions_e2e/test_tp2_async_tp.py +++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py @@ -17,8 +17,6 @@ ) from .models import ( FLASHINFER_ATTN, - ROCM_AITER_UNIFIED_ATTN, - ROCM_ATTN, TRITON_ATTN, llama3_8b, llama3_8b_fp8, @@ -34,14 +32,9 @@ "model_name, matches_fn, model_kwargs, hf_overrides", [llama3_8b_fp8, llama4_scout_fp8], ) -@pytest.mark.parametrize( - "attn_backend", - [TRITON_ATTN, FLASHINFER_ATTN] - if current_platform.is_cuda() - else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], -) +@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("quant_fp8", "rms_norm"))) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fp8_fusions( model_name: str, @@ -106,7 +99,7 @@ def test_tp2_async_tp_fp8_fusions( ) @pytest.mark.parametrize("attn_backend", [TRITON_ATTN]) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("rms_norm"))) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fusions( model_name: str, From 4922c7a0cc68af579368c998d118ac1cd8403dd2 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 13 Feb 2026 16:53:42 +0000 Subject: [PATCH 33/45] test suggestion Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 18 +++++++++++++----- .../layers/attention/mla_attention.py | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 
83bd308631ed..93f00f62c334 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -588,10 +588,18 @@ steps: # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - # TODO: clean up this comment if not needed. It is used to - # keep track of the tests changes during vLLM IR Ops refactoring. - # Use `find` to launch multiple instances of pytest. - - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + +# corresponds to .buildkite/test_areas/pytorch.yaml +- label: PyTorch Compilation Passes Unit Tests + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_1 + source_file_dependencies: + - vllm/ + - tests/compile/passes + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -s -v compile/passes --ignore compile/passes/distributed - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -1674,7 +1682,7 @@ steps: - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs source_file_dependencies: - csrc/quantization/fp4/ diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index ed066784d652..d6e7c6447b14 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -403,7 +403,7 @@ def __init__( self.is_aiter_triton_fp4_bmm_enabled = ( rocm_aiter_ops.is_fp4bmm_enabled() and self.kv_b_proj.weight.dtype == torch.bfloat16 - and current_platform.has_device_capability(95) # gfx950 and above + # and 
current_platform.has_device_capability(95) # gfx950 and above ) # Attributes for forward_impl method From 1aace958ab163efe87b7effed827c23a0ccd7a5e Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 14 Feb 2026 03:10:19 +0000 Subject: [PATCH 34/45] revert pytorch test Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 93f00f62c334..87d5f46cb641 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -592,14 +592,16 @@ steps: # corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Passes Unit Tests timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 source_file_dependencies: - vllm/ - tests/compile/passes commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - pytest -s -v compile/passes --ignore compile/passes/distributed + # TODO: clean up this comment if not needed. It is used to + # keep track of the tests changes during vLLM IR Ops refactoring. + # Use `find` to launch multiple instances of pytest. 
+ - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 From 3f0e188902fee0292434b3bba209312abeee2c57 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 14 Feb 2026 15:41:09 +0000 Subject: [PATCH 35/45] remove mla related bugfix Signed-off-by: tjtanaa --- vllm/model_executor/layers/attention/mla_attention.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 9cb0882059b6..98ff02e9d4ae 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -404,7 +404,6 @@ def __init__( self.is_aiter_triton_fp4_bmm_enabled = ( rocm_aiter_ops.is_fp4bmm_enabled() and self.kv_b_proj.weight.dtype == torch.bfloat16 - # and current_platform.has_device_capability(95) # gfx950 and above ) # Attributes for forward_impl method From 64d3b6324cb9f8a5ded94bf1bdeefe77d2e0ded4 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 14 Feb 2026 15:47:50 +0000 Subject: [PATCH 36/45] convert condition to pytest.param Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp1_quant.py | 52 ++++++++++++++----- .../passes/test_silu_mul_quant_fusion.py | 13 ++++- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index b8d79102e82c..54e0e40ffc07 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -32,21 +32,47 @@ [ (*llama3_8b_fp8, False), (*qwen3_a3b_fp8, False), - ] - + ( - [ - (*llama4_scout_fp8, False), - (*qwen3_a3b_fp8, True), # only supported on CUDA - ] - if current_platform.is_cuda() - else [] - ), + pytest.param( + *llama4_scout_fp8, + False, + marks=pytest.mark.skipif( + not current_platform.is_cuda(), + reason="Llama4 Scout FP8 only supported on CUDA", + ), 
+ ), + pytest.param( + *qwen3_a3b_fp8, + True, + marks=pytest.mark.skipif( + not current_platform.is_cuda(), reason="DeepGemm only supported on CUDA" + ), + ), + ], ) @pytest.mark.parametrize( "attn_backend", - [TRITON_ATTN, FLASHINFER_ATTN] - if current_platform.is_cuda() - else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], + [ + TRITON_ATTN, + pytest.param( + FLASHINFER_ATTN, + marks=pytest.mark.skipif( + not current_platform.is_cuda(), + reason="FlashInfer only supported on CUDA", + ), + ), + pytest.param( + ROCM_ATTN, + marks=pytest.mark.skipif( + current_platform.is_cuda(), reason="ROCm attention only for AMD" + ), + ), + pytest.param( + ROCM_AITER_UNIFIED_ATTN, + marks=pytest.mark.skipif( + current_platform.is_cuda(), reason="ROCm AIter only for AMD" + ), + ), + ], ) @pytest.mark.parametrize("n_layers", [6]) @pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) @@ -102,7 +128,7 @@ def test_tp1_fp8_fusions( ] if use_aiter: - matches_check.append("aiter_rms_quant_fusion") + matches_check[0] = "aiter_rms_quant_fusion" run_e2e_fusion_test( model_name, diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index f18de1ac13d6..1d807e07586b 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -191,9 +191,20 @@ def ops_in_model_after(self): list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) + [ (TestSiluMulNvfp4QuantModel, False, None), + pytest.param( + TestSiluMulGroupFp8QuantModel, + False, + None, + skipif=not current_platform.is_cuda(), + ), # GroupFP8Quant fusion only works with AITER on ROCm. # and the enable_quant_fp8_custom_op must be True. 
- (TestSiluMulGroupFp8QuantModel, True, None), + pytest.param( + TestSiluMulGroupFp8QuantModel, + True, + None, + skipif=not current_platform.is_rocm(), + ), ], ) @pytest.mark.skipif( From c186a19b06d5ced2c70947df44926e24259d72ad Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 14 Feb 2026 15:51:31 +0000 Subject: [PATCH 37/45] apply suggestion Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/models.py | 8 +------- tests/compile/fusions_e2e/test_tp1_quant.py | 9 +++++++++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index e600daaf739d..c76023bbbec9 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -202,13 +202,7 @@ lambda n_layers: Matches( aiter_rms_quant_fusion=n_layers, rms_quant_fusion=0, - # TODO: Allow use to set back n_layers, - # On ROCm norm_rope_fusion is only supported without - # enabling AITER. - # when we are running the tests in - # tests/compile/fusions_e2e/test_tp1_quant.py - # we are enabling AITER, so no fusion happens. - norm_rope_fusion=0, + norm_rope_fusion=n_layers, ar_rms_fusion=0, # Not supported sequence_parallel=0, # Not supported async_tp=0, # Not supported diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 54e0e40ffc07..e72f5b9a2d73 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -130,6 +130,15 @@ def test_tp1_fp8_fusions( if use_aiter: matches_check[0] = "aiter_rms_quant_fusion" + # TODO: enable the `norm_rope_fusion` test, + # On ROCm norm_rope_fusion is only supported without + # enabling AITER. + # when we are running the tests in + # tests/compile/fusions_e2e/test_tp1_quant.py + # we are enabling AITER, so no fusion happens. 
+ if "qwen" in model_name.lower(): + matches_check.remove("norm_rope_fusion") + run_e2e_fusion_test( model_name, matches, From 050544d173c58daf8f21a63dcbd9fe680c8e6f86 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 14 Feb 2026 16:57:25 +0000 Subject: [PATCH 38/45] fix error from pytest.param Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/models.py | 13 +++++++++-- tests/compile/fusions_e2e/test_tp1_quant.py | 22 +++---------------- .../passes/test_silu_mul_quant_fusion.py | 8 +++++-- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index c76023bbbec9..b7f6ac3f89c9 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +from vllm._aiter_ops import is_aiter_found_and_supported from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer from vllm.v1.attention.backends.registry import AttentionBackendEnum @@ -25,14 +26,22 @@ AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN" ) -# ROCm backends ROCM_ATTN = pytest.param( - AttentionBackendCase(backend=AttentionBackendEnum.ROCM_ATTN), id="ROCM_ATTN" + AttentionBackendCase(backend=AttentionBackendEnum.ROCM_ATTN), + id="ROCM_ATTN", + marks=pytest.mark.skipif( + not current_platform.is_rocm(), + reason="ROCm attention only for AMD", + ), ) ROCM_AITER_UNIFIED_ATTN = pytest.param( AttentionBackendCase(backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN), id="ROCM_AITER_UNIFIED_ATTN", + marks=pytest.mark.skipif( + not is_aiter_found_and_supported(), + reason="ROCM_AITER_UNIFIED_ATTN only for AMD when AITER is installed", + ), ) # Models diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index e72f5b9a2d73..8037d55cdb56 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ 
b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -53,25 +53,9 @@ "attn_backend", [ TRITON_ATTN, - pytest.param( - FLASHINFER_ATTN, - marks=pytest.mark.skipif( - not current_platform.is_cuda(), - reason="FlashInfer only supported on CUDA", - ), - ), - pytest.param( - ROCM_ATTN, - marks=pytest.mark.skipif( - current_platform.is_cuda(), reason="ROCm attention only for AMD" - ), - ), - pytest.param( - ROCM_AITER_UNIFIED_ATTN, - marks=pytest.mark.skipif( - current_platform.is_cuda(), reason="ROCm AIter only for AMD" - ), - ), + FLASHINFER_ATTN, + ROCM_ATTN, + ROCM_AITER_UNIFIED_ATTN, ], ) @pytest.mark.parametrize("n_layers", [6]) diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index 1d807e07586b..12bb16e1409f 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -195,7 +195,9 @@ def ops_in_model_after(self): TestSiluMulGroupFp8QuantModel, False, None, - skipif=not current_platform.is_cuda(), + marks=pytest.mark.skipif( + not current_platform.is_cuda(), reason="CUDA only" + ), ), # GroupFP8Quant fusion only works with AITER on ROCm. # and the enable_quant_fp8_custom_op must be True. 
@@ -203,7 +205,9 @@ def ops_in_model_after(self): TestSiluMulGroupFp8QuantModel, True, None, - skipif=not current_platform.is_rocm(), + marks=pytest.mark.skipif( + not current_platform.is_rocm(), reason="ROCm only" + ), ), ], ) From 97102c3eed9748fe5c32501eebcf48099eaf6475 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 24 Feb 2026 10:40:55 +0000 Subject: [PATCH 39/45] remove rocm branching in model defination Signed-off-by: vllmellm --- tests/compile/fusions_e2e/models.py | 172 +++++--------------- tests/compile/fusions_e2e/test_tp1_quant.py | 7 + 2 files changed, 47 insertions(+), 132 deletions(-) diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index b7f6ac3f89c9..e18bc1ee5652 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -47,73 +47,33 @@ # Models llama3_8b = ModelFusionInfo( model_name="meta-llama/Llama-3.1-8B-Instruct", - matches=( - lambda n_layers: Matches( - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, ), ) llama3_8b_fp8 = ModelFusionInfo( model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - matches=( - lambda n_layers: Matches( - rms_quant_fusion=n_layers * 2, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - rms_quant_fusion=n_layers * 2, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not 
supported - ) + matches=lambda n_layers: Matches( + rms_quant_fusion=n_layers * 2, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, ), ) llama3_8b_fp4 = ModelFusionInfo( model_name="nvidia/Llama-3.1-8B-Instruct-FP4", - matches=( - lambda n_layers: Matches( - rms_quant_fusion=0, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - rms_quant_fusion=0, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, ), ) @@ -125,96 +85,44 @@ llama4_scout_fp8 = ModelFusionInfo( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}}, - matches=( - lambda n_layers: Matches( - rms_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - rms_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + rms_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, ), ) llama4_scout_fp4 = ModelFusionInfo( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4", hf_overrides=lambda n_layers: 
{"text_config": {"num_hidden_layers": n_layers}}, - matches=( - lambda n_layers: Matches( - rms_quant_fusion=0, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - rms_quant_fusion=0, - attn_quant_fusion=n_layers, - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, ), ) qwen3_a3b = ModelFusionInfo( model_name="Qwen/Qwen3-30B-A3B", - matches=( - lambda n_layers: Matches( - norm_rope_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - norm_rope_fusion=n_layers, - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + norm_rope_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, ), ) qwen3_a3b_fp8 = ModelFusionInfo( model_name="Qwen/Qwen3-30B-A3B-FP8", - matches=( - lambda n_layers: Matches( - rms_quant_fusion=n_layers, - norm_rope_fusion=n_layers, - attn_quant_fusion=0, # attn + group quant not supported - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - aiter_rms_quant_fusion=n_layers, - rms_quant_fusion=0, - norm_rope_fusion=n_layers, - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + rms_quant_fusion=n_layers, + norm_rope_fusion=n_layers, + attn_quant_fusion=0, # attn + group quant not 
supported + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, ), ) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 8037d55cdb56..ed252b6c033b 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -104,6 +104,13 @@ def test_tp1_fp8_fusions( use_aiter = current_platform.is_rocm() and ("qwen" in model_name.lower()) + if not current_platform.is_cuda(): + matches = matches._replace(ar_rms_fusion=0, sequence_parallel=0, async_tp=0) + if "qwen" in model_name.lower(): + matches = matches._replace( + rms_quant_fusion=0, aiter_rms_quant_fusion=n_layers + ) + matches_check = [ "rms_quant_fusion", "act_quant_fusion", From 0a42a79dde4c967e9de108342deeea3cdbc7a3a8 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 00:59:54 +0000 Subject: [PATCH 40/45] remove todo Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 37e5a84a07e6..73117cd82d28 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1532,7 +1532,7 @@ steps: # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
- pytest -v -s tests/distributed/test_context_parallel.py - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - # TODO: this test is not supported on ROCm + # this test is not supported on ROCm # - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### From 3204c5cede6673932290ac0b6859e30faef458c0 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 01:32:53 +0000 Subject: [PATCH 41/45] remove unnecessary test_tp1_quant.py Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp1_quant.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 9cce547f89f6..159de38979fa 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -107,13 +107,6 @@ def test_tp1_fp8_fusions( use_aiter = current_platform.is_rocm() and ("qwen" in model_name.lower()) - if not current_platform.is_cuda(): - matches = matches._replace(ar_rms_fusion=0, sequence_parallel=0, async_tp=0) - if "qwen" in model_name.lower(): - matches = matches._replace( - rms_quant_fusion=0, aiter_rms_quant_fusion=n_layers - ) - matches_check = [ "rms_quant_fusion", "act_quant_fusion", @@ -123,14 +116,12 @@ def test_tp1_fp8_fusions( if use_aiter: matches_check[0] = "aiter_rms_quant_fusion" - - # TODO: enable the `norm_rope_fusion` test, - # On ROCm norm_rope_fusion is only supported without - # enabling AITER. - # when we are running the tests in - # tests/compile/fusions_e2e/test_tp1_quant.py - # we are enabling AITER, so no fusion happens. - if "qwen" in model_name.lower(): + # TODO: enable the `norm_rope_fusion` test, + # On ROCm norm_rope_fusion is only supported without + # enabling AITER. 
+ # when we are running the tests in + # tests/compile/fusions_e2e/test_tp1_quant.py + # we are enabling AITER, so no fusion happens. matches_check.remove("norm_rope_fusion") run_e2e_fusion_test( From de42cfbd07af805f7d30847b13d6b121ec87610e Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 11:20:47 +0000 Subject: [PATCH 42/45] apply reviewer feedback Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp1_quant.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 159de38979fa..22823de9870a 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -116,6 +116,8 @@ def test_tp1_fp8_fusions( if use_aiter: matches_check[0] = "aiter_rms_quant_fusion" + + matches = matches._replace(aiter_rms_quant_fusion=matches.rms_quant_fusion) # TODO: enable the `norm_rope_fusion` test, # On ROCm norm_rope_fusion is only supported without # enabling AITER. From a168f7bd3684baee01062a6a048779403fc93ae1 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Mon, 2 Mar 2026 06:56:36 +0000 Subject: [PATCH 43/45] remove comment Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp1_quant.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 22823de9870a..917116515f89 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -121,9 +121,6 @@ def test_tp1_fp8_fusions( # TODO: enable the `norm_rope_fusion` test, # On ROCm norm_rope_fusion is only supported without # enabling AITER. - # when we are running the tests in - # tests/compile/fusions_e2e/test_tp1_quant.py - # we are enabling AITER, so no fusion happens. 
matches_check.remove("norm_rope_fusion") run_e2e_fusion_test( From 53d253daebc243f7ea480a73e91d0cb6c65c1a12 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 3 Mar 2026 05:27:39 +0000 Subject: [PATCH 44/45] fix SiluMulGroupQaunt Signed-off-by: vllmellm --- tests/compile/passes/test_silu_mul_quant_fusion.py | 3 +-- vllm/compilation/passes/fusion/rocm_aiter_fusion.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index d31cca1ef129..a77b4e6de7bd 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -182,9 +182,8 @@ def ops_in_model_after(self): "model_class, enable_quant_fp8_custom_op, force_kernel", list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) + [ - (TestSiluMulNvfp4QuantModel, False, None), pytest.param( - TestSiluMulGroupFp8QuantModel, + TestSiluMulNvfp4QuantModel, False, None, marks=pytest.mark.skipif( diff --git a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py index 7d67f1bb8c01..59c94db5e812 100644 --- a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py +++ b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py @@ -14,7 +14,7 @@ GroupShape, QuantKey, ScaleDesc, - kFp8Static128BlockSym, + kFp8Dynamic128Sym, ) from vllm.platforms import current_platform @@ -337,7 +337,7 @@ class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern): def __init__(self) -> None: self.silu_and_mul_matcher = MatcherSiluAndMul() self.quant_matcher = MatcherQuantFP8( - quant_key=kFp8Static128BlockSym, match_rocm_aiter=True + quant_key=kFp8Dynamic128Sym, match_rocm_aiter=True ) def get_inputs(self) -> list[torch.Tensor]: @@ -350,7 +350,7 @@ def pattern( input: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: at1 = self.silu_and_mul_matcher(input) - at2 = self.quant_matcher(at1, 128) + 
at2 = self.quant_matcher(at1) return at2[0], at2[1] def replacement( From 8374509a5a442c118f79c088022029ec3d546545 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 3 Mar 2026 07:51:56 +0000 Subject: [PATCH 45/45] comment out redundant tests Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 56 +++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 73117cd82d28..d4dbe7232342 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1650,33 +1650,35 @@ steps: # which affects fusion passes on ROCm. So we have to # enable them as as soon as possible. -# corresponds to .buildkite/test_areas/compile.yaml -- label: Fusion and Compile Unit Tests (2xMI325 GPUs) - timeout_in_minutes: 20 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/ - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes - - tests/compile/test_fusion_attn.py - - tests/compile/test_silu_mul_quant_fusion.py - - tests/compile/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py - commands: - - rocm-smi - # we run all backend tests on ROCm - - "pytest -v -s tests/compile/passes/test_fusion_attn.py" - - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" - # TODO: this test is not supported on ROCm, there are aiter kernels for this. 
- # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - # TODO: find out more details - # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile +## TODO: Enable the test in this group +# # corresponds to .buildkite/test_areas/compile.yaml +# - label: Fusion and Compile Unit Tests (2xMI325 GPUs) +# timeout_in_minutes: 20 +# working_dir: "/vllm-workspace/" +# mirror_hardwares: [amdexperimental, amdproduction, tj] +# agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs +# source_file_dependencies: +# - csrc/quantization/fp4/ +# - vllm/model_executor/layers/quantization/ +# - vllm/model_executor/layers/layernorm.py +# - vllm/model_executor/layers/activation.py +# - vllm/model_executor/layers/attention/attention.py +# - vllm/v1/attention/backends/flashinfer.py +# - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes +# - tests/compile/test_fusion_attn.py +# - tests/compile/test_silu_mul_quant_fusion.py +# - tests/compile/distributed/test_fusion_all_reduce.py +# - tests/compile/fullgraph/test_full_graph.py +# commands: +# - rocm-smi +# # we run all backend tests on ROCm +# # These two tests are covered in "PyTorch Compilation Passes Unit Tests" +# # - "pytest -v -s tests/compile/passes/test_fusion_attn.py" +# # - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" +# # TODO: this test is not supported on ROCm, there are aiter kernels for this. +# # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py +# # TODO: find out more details +# # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Quick (MI325)