From 230246d9349db65878aee63f7c4c8b4a920fa821 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Mon, 9 Feb 2026 08:28:20 +0000 Subject: [PATCH 01/16] try to enable new fusion pass test for ROCm Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 174 +++++++++++++----- tests/compile/fullgraph/test_full_graph.py | 34 +++- tests/compile/fusions_e2e/models.py | 83 +++++++-- tests/compile/fusions_e2e/test_tp1_quant.py | 15 +- tests/compile/passes/test_fusion_attn.py | 2 +- .../passes/test_silu_mul_quant_fusion.py | 15 +- .../layers/quantization/input_quant_fp8.py | 1 + 7 files changed, 245 insertions(+), 79 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 503b3a76f941..986708e37641 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -570,9 +570,11 @@ steps: --ignore=lora/test_qwen3moe_tp.py parallelism: 4 +##### .buildkite/test_areas/pytorch.yaml ##### +# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -586,10 +588,14 @@ steps: # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # TODO: clean up this comment if not needed. It is used to + # keep track of the tests changes during vLLM IR Ops refactoring. 
+ - pytest -s -v compile/passes --ignore compile/passes/distributed +# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -603,9 +609,10 @@ steps: # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] +# corresponds to .buildkite/test_areas/pytorch.yaml +- label: PyTorch Fullgraph # 27min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -1176,41 +1183,6 @@ steps: - pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py -- label: Blackwell Fusion and Compile Tests # 30 min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/passes/test_fusion_attn.py - - tests/compile/passes/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py - commands: - - nvidia-smi - - pytest -v -s tests/compile/passes/test_fusion_attn.py - - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py - 
# this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - - # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # # Wrap with quotes to escape yaml - # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" @@ -1334,7 +1306,6 @@ steps: - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s distributed/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py @@ -1558,17 +1529,20 @@ steps: num_gpus: 2 commands: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py + # ================= 24 passed, 11 warnings in 192.85s (0:03:12) ================== - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py + # ================== 48 passed, 8 warnings in 386.41s (0:06:26) ================== - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + # ======================== 8 skipped, 9 warnings in 2.08s ======================== #- pytest -v -s 
tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - - - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py - pytest -v -s tests/distributed/test_context_parallel.py + # ======================== 4 passed, 3 warnings in 30.45s ======================== - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - pytest -v -s tests/v1/distributed/test_dbo.py + # ======================== 2 skipped, 3 warnings in 1.97s ======================== ##### B200 test ##### - label: Distributed Tests (B200) # optional @@ -1692,3 +1666,115 @@ steps: working_dir: "/vllm-workspace" commands: - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + +##### .buildkite/test_areas/compile.yaml ##### +# Slowly setting up the tests so that it is also easier for the +# CI team to review and upstream to the pipelinev2. +# The following tests are important for vLLM IR Ops refactoring, +# which affects fusion passes on ROCm. So we have to +# enable them as soon as possible. 
+ +# corresponds to .buildkite/test_areas/compile.yaml +- label: Sequence Parallel Correctness Tests (2xMI325 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_2 + num_devices: 2 + source_file_dependencies: + - vllm/model_executor/layers/ + - vllm/compilation/ + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - tests/compile/correctness_e2e/test_sequence_parallel.py + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion and Compile Unit Tests (2xMI325 GPUs) + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/ + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/fullgraph/test_full_graph.py + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/ + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes + - tests/compile/passes/test_fusion_attn.py + - tests/compile/passes/test_silu_mul_quant_fusion.py + - tests/compile/passes/distributed/test_fusion_all_reduce.py + - tests/compile/fullgraph/test_full_graph.py + commands: + - rocm-smi + # we run all 
backend tests on ROCm + - pytest -v -s tests/compile/passes/test_fusion_attn.py + - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py + # TODO: this test is not supported on ROCm, there are aiter kernels for this. + # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + # TODO: this test is not supported on ROCm + # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E Quick (MI325) + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_1 + num_devices: 1 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E Config Sweep (MI325) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_1 + num_devices: 1 + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run just 
llama3 (fp8) for all config combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" + +# corresponds to .buildkite/test_areas/kernels.yaml +# Skip the following tests as they are not supported on ROCm +# - label: Fusion E2E TP2 AR-RMS Config Sweep (H100) +# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100) +# - label: Fusion E2E TP2 (B200) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index ed4c92d90ff7..733ec22c98d6 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -194,13 +194,31 @@ def test_custom_compile_config( ) @pytest.mark.parametrize( "model, backend", - [ - ("Qwen/Qwen2-0.5B", None), # Standard attention model - ( - "deepseek-ai/DeepSeek-V2-Lite", - AttentionBackendEnum.FLASHINFER_MLA, - ), # MLA (Multi-head Latent Attention) model - ], + ( + [ + ("Qwen/Qwen2-0.5B", None), # Standard attention model + ( + "deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.FLASHINFER_MLA, + ), # MLA (Multi-head Latent Attention) model + ] + if current_platform.is_cuda() + else [ + # ("Qwen/Qwen2-0.5B", None), # Standard attention model + # ( + # "deepseek-ai/DeepSeek-V2-Lite", + # AttentionBackendEnum.TRITON_MLA, + # ), # MLA (Multi-head Latent Attention) model + ( + "deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.ROCM_AITER_MLA, + ), # MLA (Multi-head Latent Attention) model + ( + "deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.ROCM_AITER_TRITON_MLA, + ), # MLA (Multi-head Latent Attention) model + ] + ), ) def test_fp8_kv_scale_compile( compilation_mode: int, @@ -209,7 +227,7 @@ def test_fp8_kv_scale_compile( ): model_kwargs = { "quantization": "fp8", - "kv_cache_dtype": "fp8_e4m3", + "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", "calculate_kv_scales": True, 
"max_model_len": 512, } diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index f54f617c64d4..525ed1b515bc 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer from vllm.v1.attention.backends.registry import AttentionBackendEnum @@ -24,37 +25,83 @@ AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN" ) +# ROCm backends +ROCM_ATTN = pytest.param( + AttentionBackendCase(backend=AttentionBackendEnum.ROCM_ATTN), id="ROCM_ATTN" +) + +ROCM_AITER_UNIFIED_ATTN = pytest.param( + AttentionBackendCase(backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN), + id="ROCM_AITER_UNIFIED_ATTN", +) + # Models llama3_8b = ModelFusionInfo( model_name="meta-llama/Llama-3.1-8B-Instruct", - matches=lambda n_layers: Matches( - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + matches=( + lambda n_layers: Matches( + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ), + ) + if current_platform.is_cuda() + else ( # ROCm matches + lambda n_layers: Matches( + ar_rms_fusion=0, + sequence_parallel=0, + async_tp=0, + ), ), ) llama3_8b_fp8 = ModelFusionInfo( model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - matches=lambda n_layers: Matches( - rms_quant_fusion=n_layers * 2, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=n_layers * 2, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) + ) + if current_platform.is_cuda() + else ( # ROCm 
matches + lambda n_layers: Matches( + rms_quant_fusion=n_layers * 2, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=0, + async_tp=0, + ), ), ) llama3_8b_fp4 = ModelFusionInfo( model_name="nvidia/Llama-3.1-8B-Instruct-FP4", - matches=lambda n_layers: Matches( - rms_quant_fusion=0, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=0, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) + ) + if current_platform.is_cuda() + else ( # ROCm matches + lambda n_layers: Matches( + rms_quant_fusion=0, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=0, + async_tp=0, + ), ), ) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 03f102794f85..25a607051a89 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -5,6 +5,7 @@ import pytest from vllm.config import PassConfig +from vllm.platforms import current_platform from .common import ( INDUCTOR_GRAPH_PARTITION, @@ -15,11 +16,12 @@ ) from .models import ( FLASHINFER_ATTN, + ROCM_AITER_UNIFIED_ATTN, + ROCM_ATTN, TRITON_ATTN, llama3_8b_fp4, llama3_8b_fp8, llama4_scout_fp4, - llama4_scout_fp8, qwen3_a3b_fp8, ) @@ -28,12 +30,17 @@ "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm", [ (*llama3_8b_fp8, False), - (*llama4_scout_fp8, False), + # (*llama4_scout_fp8, False), (*qwen3_a3b_fp8, False), - (*qwen3_a3b_fp8, True), + # (*qwen3_a3b_fp8, True), ], ) -@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) +@pytest.mark.parametrize( + "attn_backend", + [TRITON_ATTN, FLASHINFER_ATTN] + if current_platform.is_cuda() + else 
[TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], +) @pytest.mark.parametrize("n_layers", [6]) @pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) diff --git a/tests/compile/passes/test_fusion_attn.py b/tests/compile/passes/test_fusion_attn.py index 75d5c42f0731..a35db7bb21ff 100644 --- a/tests/compile/passes/test_fusion_attn.py +++ b/tests/compile/passes/test_fusion_attn.py @@ -267,7 +267,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor): PATTERN_TEST_MODELS_FP8 = [ ("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel) ] - BACKENDS = [ + BACKENDS_FP8 = [ AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN, AttentionBackendEnum.ROCM_ATTN, AttentionBackendEnum.TRITON_ATTN, diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index c5ef015015ce..64aad53525a5 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools +from typing import Any import pytest import torch @@ -148,6 +149,9 @@ def __init__(self, hidden_size: int, **kwargs): weight_group_shape=GroupShape(128, 128), act_quant_group_shape=GroupShape(1, 128), cutlass_block_fp8_supported=False, + # this parameter cannot always be True, + # it depends on the VLLM_ROCM_USE_AITER + # and VLLM_ROCM_USE_AITER_LINEAR environment variables use_aiter_and_is_supported=True, ) self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() @@ -181,6 +185,12 @@ def ops_in_model_after(self): ] TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS +EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [ + (TestSiluMulGroupFp8QuantModel, False, None), +] +if current_platform.is_cuda(): 
+ EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None)) + @pytest.mark.parametrize("num_tokens", [32, 64]) @pytest.mark.parametrize("hidden_size", [128, 256]) @@ -189,10 +199,7 @@ def ops_in_model_after(self): @pytest.mark.parametrize( "model_class, enable_quant_fp8_custom_op, force_kernel", list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) - + [ - (TestSiluMulNvfp4QuantModel, False, None), - (TestSiluMulGroupFp8QuantModel, False, None), - ], + + EXTENDED_TESTCASES, ) @pytest.mark.skipif( envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm" diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index 5bc78afa43b0..ed3b981cf183 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -171,6 +171,7 @@ def forward_native( x: torch.Tensor, scale: torch.Tensor | None = None, scale_ub: torch.Tensor | None = None, + **kwargs, ): if self.is_group_quant and not self.static: assert scale is None, "Dynamic group quantization does not use scale" From 1c9552affe7f37454cde6b496a64b26e0d859ce0 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Mon, 9 Feb 2026 13:21:08 +0000 Subject: [PATCH 02/16] fix silu-mul-groupquant fuion test Signed-off-by: vllmellm --- .../passes/test_silu_mul_quant_fusion.py | 10 +++++++--- .../passes/fusion/rocm_aiter_fusion.py | 18 +++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index 64aad53525a5..687bb9aa6bfe 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -10,7 +10,7 @@ from tests.compile.backend import TestBackend from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor from tests.utils import 
TestFP8Layer -from vllm._aiter_ops import IS_AITER_FOUND +from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.compilation.passes.fusion.act_quant_fusion import ( FUSED_OPS, @@ -186,7 +186,7 @@ def ops_in_model_after(self): TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [ - (TestSiluMulGroupFp8QuantModel, False, None), + (TestSiluMulGroupFp8QuantModel, True, None), ] if current_platform.is_cuda(): EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None)) @@ -216,6 +216,7 @@ def test_fusion_silu_and_mul_quant( enable_silu_mul_custom_op: bool, enable_quant_fp8_custom_op: bool, force_kernel: FP8ScaledMMLinearKernel | None, + monkeypatch: pytest.MonkeyPatch, ): if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported(): pytest.skip("NVFP4 is not supported on this GPU.") @@ -242,13 +243,16 @@ def test_fusion_silu_and_mul_quant( ), ) - with set_current_vllm_config(config): + with set_current_vllm_config(config), monkeypatch.context() as m: fusion_passes = [ActivationQuantFusionPass(config)] if IS_AITER_FOUND: from vllm.compilation.passes.fusion.rocm_aiter_fusion import ( RocmAiterSiluMulFp8GroupQuantFusionPass, ) + m.setenv("VLLM_ROCM_USE_AITER", "1") + rocm_aiter_ops.refresh_env_variables() + fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)] diff --git a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py index d8131ce952d2..99278365c5db 100644 --- a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py +++ b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py @@ -5,7 +5,6 @@ import torch._inductor.pattern_matcher as pm from torch import fx from torch._inductor.pattern_matcher import PatternMatcherPass -from torch._ops import 
OpOverload import vllm.model_executor.layers.quantization.utils.fp8_utils # noqa: F401 from vllm._aiter_ops import rocm_aiter_ops @@ -15,6 +14,7 @@ GroupShape, QuantKey, ScaleDesc, + kFp8Static128BlockSym, ) from vllm.platforms import current_platform @@ -332,9 +332,11 @@ class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern): FUSED_SILU_MUL_QUANT_OP = rocm_aiter_ops.get_act_mul_fused_fp8_group_quant_op() - def __init__(self, quant_op: OpOverload) -> None: + def __init__(self) -> None: self.silu_and_mul_matcher = MatcherSiluAndMul() - self.quant_op = quant_op + self.quant_matcher = MatcherQuantFP8( + quant_key=kFp8Static128BlockSym, match_rocm_aiter=True + ) def get_inputs(self) -> list[torch.Tensor]: return [ @@ -346,7 +348,7 @@ def pattern( input: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: at1 = self.silu_and_mul_matcher(input) - at2 = self.quant_op(at1, 128) + at2 = self.quant_matcher(at1, 128) return at2[0], at2[1] def replacement( @@ -370,11 +372,6 @@ class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass): https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 """ - AITER_GROUP_FP8_QUANT_OP = rocm_aiter_ops.get_group_quant_op() - TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default - - QUANT_OPS = [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP] - @enable_fake_mode def __init__(self, config: VllmConfig) -> None: super().__init__(config) @@ -383,8 +380,7 @@ def __init__(self, config: VllmConfig) -> None: pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass" ) - for quant_op in self.QUANT_OPS: - AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns) + AiterSiluMulFp8GroupQuantPattern().register(self.patterns) self.dump_patterns(config, self.patterns) From bffe1814354d8ed1055c6a02ecdde905f3f61549 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 07:42:59 +0000 Subject: [PATCH 03/16] fix full graph test Signed-off-by: vllmellm --- 
tests/compile/fullgraph/test_full_graph.py | 27 ++++++++++++------- .../layers/attention/mla_attention.py | 1 + 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 733ec22c98d6..921f57cea0a6 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -10,6 +10,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams +from vllm._aiter_ops import rocm_aiter_ops from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -224,17 +225,25 @@ def test_fp8_kv_scale_compile( compilation_mode: int, model: str, backend: AttentionBackendEnum | None, + monkeypatch: pytest.MonkeyPatch, ): - model_kwargs = { - "quantization": "fp8", - "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", - "calculate_kv_scales": True, - "max_model_len": 512, - } - if backend: - model_kwargs["attention_config"] = {"backend": backend.name} + with monkeypatch.context() as m: + model_kwargs = { + "quantization": "fp8", + "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", + "calculate_kv_scales": True, + "max_model_len": 512, + } + if backend: + model_kwargs["attention_config"] = {"backend": backend.name} + if current_platform.is_rocm(): + m.setenv("VLLM_ROCM_USE_AITER", "1") + # Disable Aiter MOE as some shapes are not supported + m.setenv("VLLM_ROCM_USE_AITER_MOE", "0") + + rocm_aiter_ops.refresh_env_variables() - run_model(compilation_mode, model, **model_kwargs) + run_model(compilation_mode, model, **model_kwargs) def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 
c31aa7b41d0d..ed31a2d176e3 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -403,6 +403,7 @@ def __init__( self.is_aiter_triton_fp4_bmm_enabled = ( rocm_aiter_ops.is_fp4bmm_enabled() and self.kv_b_proj.weight.dtype == torch.bfloat16 + and current_platform.has_device_capability(95) # gfx950 and above ) # Attributes for forward_impl method From 28ed03f12de4842576366a34efbf31d44cb2a34a Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 07:52:31 +0000 Subject: [PATCH 04/16] clearer test case for silu mul and group quant test Signed-off-by: vllmellm --- tests/compile/passes/test_silu_mul_quant_fusion.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index 687bb9aa6bfe..c6794a156240 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -184,10 +184,11 @@ def ops_in_model_after(self): PerTensorTorchFP8ScaledMMLinearKernel, ] TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS +EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [] +# SiluMulGroupFp8Quant is only supported on ROCm +if current_platform.is_rocm(): + EXTENDED_TESTCASES.append((TestSiluMulGroupFp8QuantModel, True, None)) -EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [ - (TestSiluMulGroupFp8QuantModel, True, None), -] if current_platform.is_cuda(): EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None)) @@ -245,7 +246,7 @@ def test_fusion_silu_and_mul_quant( with set_current_vllm_config(config), monkeypatch.context() as m: fusion_passes = [ActivationQuantFusionPass(config)] - if IS_AITER_FOUND: + if current_platform.is_rocm() and IS_AITER_FOUND: from vllm.compilation.passes.fusion.rocm_aiter_fusion import ( RocmAiterSiluMulFp8GroupQuantFusionPass, ) From 
5628eb9b272da81a81da4477cb5774fdf7143632 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 08:40:13 +0000 Subject: [PATCH 05/16] fix e2e fusion tests Signed-off-by: vllmellm --- .buildkite/test-amd.yaml | 71 +++++++++- tests/compile/fusions_e2e/models.py | 125 ++++++++++++------ .../compile/fusions_e2e/test_tp2_async_tp.py | 14 +- 3 files changed, 164 insertions(+), 46 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 986708e37641..64a878baa774 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1691,6 +1691,17 @@ steps: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py +# corresponds to .buildkite/test_areas/compile.yaml +- label: AsyncTP Correctness Tests (2xMI325 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/" + device: mi325_2 + optional: true + num_devices: 2 + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py + # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 @@ -1750,7 +1761,8 @@ steps: # Run all models and attn backends but only Inductor partition and native custom ops - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" + # TODO: Qwen uses group quantization, which the pattern matcher on ROCm does not support yet. 
+ # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Config Sweep (MI325) @@ -1771,10 +1783,57 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" -# corresponds to .buildkite/test_areas/kernels.yaml -# Skip the following tests as they are not supported on ROCm -# - label: Fusion E2E TP2 AR-RMS Config Sweep (H100) -# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100) -# - label: Fusion E2E TP2 (B200) +- label: Fusion E2E TP2 Quick (MI325) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: mi325_1 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: mi325_2 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run just llama3 (fp8 & bf16) for all config 
combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" + +- label: Fusion E2E TP2 (MI325) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: mi325_2 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index 525ed1b515bc..77cb1b4d3ad9 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -43,15 +43,16 @@ ar_rms_fusion=n_layers * 2 + 1, sequence_parallel=n_layers * 2 + 1, async_tp=n_layers * 4, - ), + ) ) if current_platform.is_cuda() - else ( # ROCm matches + # ROCm matches + else ( lambda n_layers: Matches( ar_rms_fusion=0, - sequence_parallel=0, - async_tp=0, - ), + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) ), ) @@ -68,15 +69,16 @@ ) ) if current_platform.is_cuda() - else ( # ROCm matches + # ROCm matches + else ( lambda n_layers: Matches( rms_quant_fusion=n_layers * 2, act_quant_fusion=n_layers, attn_quant_fusion=n_layers, ar_rms_fusion=0, - sequence_parallel=0, - async_tp=0, - ), + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) ), ) @@ -93,15 +95,16 @@ ) ) if current_platform.is_cuda() - else ( # ROCm matches + # ROCm matches + else ( lambda n_layers: Matches( rms_quant_fusion=0, act_quant_fusion=n_layers, attn_quant_fusion=n_layers, ar_rms_fusion=0, - sequence_parallel=0, - async_tp=0, - ), + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) ), ) @@ -113,45 +116,93 @@ llama4_scout_fp8 = ModelFusionInfo( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=lambda 
n_layers: {"text_config": {"num_hidden_layers": n_layers}}, - matches=lambda n_layers: Matches( - rms_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + sequence_parallel=n_layers * 2, + ) ), ) llama4_scout_fp4 = ModelFusionInfo( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4", hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}}, - matches=lambda n_layers: Matches( - rms_quant_fusion=0, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=0, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + rms_quant_fusion=0, + attn_quant_fusion=n_layers, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ) ), ) qwen3_a3b = ModelFusionInfo( model_name="Qwen/Qwen3-30B-A3B", - matches=lambda n_layers: Matches( - norm_rope_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + matches=( + lambda n_layers: Matches( + norm_rope_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + norm_rope_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) ), ) 
qwen3_a3b_fp8 = ModelFusionInfo( model_name="Qwen/Qwen3-30B-A3B-FP8", - matches=lambda n_layers: Matches( - rms_quant_fusion=n_layers, - norm_rope_fusion=n_layers, - attn_quant_fusion=0, # attn + group quant not supported - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + norm_rope_fusion=n_layers, + attn_quant_fusion=0, # attn + group quant not supported + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + norm_rope_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) ), ) diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py index 4769ca1e0b63..fb743c1ba7d3 100644 --- a/tests/compile/fusions_e2e/test_tp2_async_tp.py +++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py @@ -5,6 +5,7 @@ import pytest from vllm.config import PassConfig +from vllm.platforms import current_platform from ...utils import multi_gpu_test from .common import ( @@ -16,6 +17,8 @@ ) from .models import ( FLASHINFER_ATTN, + ROCM_AITER_UNIFIED_ATTN, + ROCM_ATTN, TRITON_ATTN, llama3_8b, llama3_8b_fp8, @@ -29,9 +32,14 @@ "model_name, matches_fn, model_kwargs, hf_overrides", [llama3_8b_fp8, llama4_scout_fp8], ) -@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) +@pytest.mark.parametrize( + "attn_backend", + [TRITON_ATTN, FLASHINFER_ATTN] + if current_platform.is_cuda() + else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], +) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) +@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("quant_fp8", "rms_norm"))) @pytest.mark.parametrize("inductor_graph_partition", 
INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fp8_fusions( model_name: str, @@ -96,7 +104,7 @@ def test_tp2_async_tp_fp8_fusions( ) @pytest.mark.parametrize("attn_backend", [TRITON_ATTN]) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) +@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("rms_norm"))) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fusions( model_name: str, From 119b4b01b345e17264c84d37f2acd060e6a5448a Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 10:18:50 +0000 Subject: [PATCH 06/16] fix tests in fusion silu_mul and tidy up kite Signed-off-by: vllmellm --- .buildkite/test-amd.yaml | 4 ++++ .../passes/test_silu_mul_quant_fusion.py | 19 +++++++------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 64a878baa774..33ba6689faa5 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1695,6 +1695,7 @@ steps: - label: AsyncTP Correctness Tests (2xMI325 GPUs) timeout_in_minutes: 50 working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] device: mi325_2 optional: true num_devices: 2 @@ -1789,6 +1790,7 @@ steps: - label: Fusion E2E TP2 Quick (MI325) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] device: mi325_1 num_devices: 2 source_file_dependencies: @@ -1806,6 +1808,7 @@ steps: - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] device: mi325_2 num_devices: 2 source_file_dependencies: @@ -1823,6 +1826,7 @@ steps: - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" - label: Fusion E2E TP2 (MI325) + mirror_hardwares: [amdexperimental, amdproduction, tj] timeout_in_minutes: 40 working_dir: 
"/vllm-workspace/" device: mi325_2 diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index c6794a156240..abd32c38ca04 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Any import pytest import torch @@ -10,7 +9,7 @@ from tests.compile.backend import TestBackend from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor from tests.utils import TestFP8Layer -from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops +from vllm._aiter_ops import IS_AITER_FOUND from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.compilation.passes.fusion.act_quant_fusion import ( FUSED_OPS, @@ -184,13 +183,6 @@ def ops_in_model_after(self): PerTensorTorchFP8ScaledMMLinearKernel, ] TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS -EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [] -# SiluMulGroupFp8Quant is only supported on ROCm -if current_platform.is_rocm(): - EXTENDED_TESTCASES.append((TestSiluMulGroupFp8QuantModel, True, None)) - -if current_platform.is_cuda(): - EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None)) @pytest.mark.parametrize("num_tokens", [32, 64]) @@ -200,7 +192,10 @@ def ops_in_model_after(self): @pytest.mark.parametrize( "model_class, enable_quant_fp8_custom_op, force_kernel", list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) - + EXTENDED_TESTCASES, + + [ + (TestSiluMulNvfp4QuantModel, False, None), + (TestSiluMulGroupFp8QuantModel, True, None), + ], ) @pytest.mark.skipif( envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm" @@ -246,14 +241,14 @@ def test_fusion_silu_and_mul_quant( with set_current_vllm_config(config), 
monkeypatch.context() as m: fusion_passes = [ActivationQuantFusionPass(config)] - if current_platform.is_rocm() and IS_AITER_FOUND: + if IS_AITER_FOUND and model_class is TestSiluMulGroupFp8QuantModel: + from vllm._aiter_ops import rocm_aiter_ops from vllm.compilation.passes.fusion.rocm_aiter_fusion import ( RocmAiterSiluMulFp8GroupQuantFusionPass, ) m.setenv("VLLM_ROCM_USE_AITER", "1") rocm_aiter_ops.refresh_env_variables() - fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)] From 218fcfb221df513d2c000e3e133e27f9e6f1d010 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 10:25:11 +0000 Subject: [PATCH 07/16] remove unnecessary change Signed-off-by: vllmellm --- vllm/model_executor/layers/quantization/input_quant_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index ed3b981cf183..5bc78afa43b0 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -171,7 +171,6 @@ def forward_native( x: torch.Tensor, scale: torch.Tensor | None = None, scale_ub: torch.Tensor | None = None, - **kwargs, ): if self.is_group_quant and not self.static: assert scale is None, "Dynamic group quantization does not use scale" From befaba1bc95b823ac36b0091632b206b8c6faa76 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 13:46:06 +0000 Subject: [PATCH 08/16] remove duplicate Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 33ba6689faa5..149767bdd06b 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1721,18 +1721,6 @@ steps: - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py - 
tests/compile/fullgraph/test_full_graph.py - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/ - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes - - tests/compile/passes/test_fusion_attn.py - - tests/compile/passes/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py commands: - rocm-smi # we run all backend tests on ROCm From ca801a13e3d9dc9c37bb1e429a7134cc4e28e58a Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 15:16:30 +0000 Subject: [PATCH 09/16] need to add quote Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 149767bdd06b..8d9c6eb62db0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -620,7 +620,7 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - "pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'" # # Limit to no custom ops to reduce running time # # Wrap with quotes to escape yaml and avoid starting -k string with a - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" @@ -1689,7 +1689,7 @@ steps: - tests/compile/correctness_e2e/test_sequence_parallel.py commands: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py + - "pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py" # corresponds to .buildkite/test_areas/compile.yaml - label: AsyncTP Correctness Tests (2xMI325 GPUs) @@ -1701,7 
+1701,7 @@ steps: num_devices: 2 commands: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py + - "pytest -v -s tests/compile/correctness_e2e/test_async_tp.py" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion and Compile Unit Tests (2xMI325 GPUs) @@ -1724,8 +1724,8 @@ steps: commands: - rocm-smi # we run all backend tests on ROCm - - pytest -v -s tests/compile/passes/test_fusion_attn.py - - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py + - "pytest -v -s tests/compile/passes/test_fusion_attn.py" + - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" # TODO: this test is not supported on ROCm, there are aiter kernels for this. # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) @@ -1748,7 +1748,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet. 
# - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" @@ -1772,8 +1772,8 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"' + - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"' - label: Fusion E2E TP2 Quick (MI325) timeout_in_minutes: 40 @@ -1790,7 +1790,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) @@ -1811,7 +1811,7 @@ steps: commands: - rocm-smi # Run just llama3 (fp8 & bf16) for all config combinations - - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"' - label: Fusion E2E TP2 (MI325) mirror_hardwares: [amdexperimental, amdproduction, tj] @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - 'pytest -v -s 
vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' From 0b65174b5ad3074344ae519852d28561bd155f46 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 15:47:57 +0000 Subject: [PATCH 10/16] fix syntax Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 8d9c6eb62db0..439075c6a843 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1748,7 +1748,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet. 
# - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" @@ -1772,8 +1772,8 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations - - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"' - - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'" + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" - label: Fusion E2E TP2 Quick (MI325) timeout_in_minutes: 40 @@ -1790,7 +1790,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) @@ -1811,7 +1811,7 @@ steps: commands: - rocm-smi # Run just llama3 (fp8 & bf16) for all config combinations - - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - label: Fusion E2E TP2 (MI325) mirror_hardwares: [amdexperimental, amdproduction, tj] @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - 'pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' + - "pytest -v -s 
vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" From be40a224ccc64d072f2664d15b58eee9fe46c4b2 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 16:49:52 +0000 Subject: [PATCH 11/16] fix Fusion E2E TP2 (MI325) path Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 439075c6a843..407a3d671803 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1814,7 +1814,7 @@ steps: - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - label: Fusion E2E TP2 (MI325) - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction, tj, tj2] timeout_in_minutes: 40 working_dir: "/vllm-workspace/" device: mi325_2 @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" From d8d071254ffbafcabb69fd9221e08a755298e7e5 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 02:48:27 +0000 Subject: [PATCH 12/16] fix test-amd syntax Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 407a3d671803..8d469a39b042 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1696,7 +1696,7 @@ steps: timeout_in_minutes: 50 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - device: mi325_2 + agent_pool: mi325_2 optional: true num_devices: 2 commands: @@ -1750,7 +1750,7 @@ steps: # Run 
all models and attn backends but only Inductor partition and native custom ops - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported - # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet. + # TODO: Qwen uses group quantization which the pattern matcher on ROCm is not supported yet. # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" # corresponds to .buildkite/test_areas/compile.yaml @@ -1779,7 +1779,7 @@ steps: timeout_in_minutes: 40 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - device: mi325_1 + agent_pool: mi325_1 num_devices: 2 source_file_dependencies: - csrc/quantization/ @@ -1790,14 +1790,14 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - device: mi325_2 + agent_pool: mi325_2 num_devices: 2 source_file_dependencies: - csrc/quantization/ @@ -1811,13 +1811,13 @@ steps: commands: - rocm-smi # Run just llama3 (fp8 & bf16) for all config combinations - - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" + - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - label: Fusion E2E TP2 (MI325) mirror_hardwares: [amdexperimental, amdproduction, tj, tj2] timeout_in_minutes: 40 working_dir: 
"/vllm-workspace/" - device: mi325_2 + agent_pool: mi325_2 num_devices: 2 source_file_dependencies: - csrc/quantization/ @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" From 5c879d18a64b06dfdfbe21021c0ae348d052bf0f Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 11 Feb 2026 08:02:54 +0000 Subject: [PATCH 13/16] remove unsupported tests cases Signed-off-by: vllmellm --- tests/compile/fullgraph/test_full_graph.py | 8 +++----- tests/compile/passes/test_fusion.py | 15 ++++++++++----- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 921f57cea0a6..37aec789d62e 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -205,11 +205,9 @@ def test_custom_compile_config( ] if current_platform.is_cuda() else [ - # ("Qwen/Qwen2-0.5B", None), # Standard attention model - # ( - # "deepseek-ai/DeepSeek-V2-Lite", - # AttentionBackendEnum.TRITON_MLA, - # ), # MLA (Multi-head Latent Attention) model + # TRITON_MLA does not support FP8 KV cache + # So we can skip the standard attention model + # test. 
( "deepseek-ai/DeepSeek-V2-Lite", AttentionBackendEnum.ROCM_AITER_MLA, diff --git a/tests/compile/passes/test_fusion.py b/tests/compile/passes/test_fusion.py index a2128150f701..aa733a744db3 100644 --- a/tests/compile/passes/test_fusion.py +++ b/tests/compile/passes/test_fusion.py @@ -87,9 +87,16 @@ (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN), # ChannelWiseTorchFP8ScaledMMLinearKernel only supports per-token (ChannelWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN), + # # Blockwise group shapes (no kernel abstraction) - (None, GroupShape(1, 128)), - (None, GroupShape(1, 64)), + # (None, GroupShape(1, 128)), + # (None, GroupShape(1, 64)), + # + # ^ This is not supported yet: See + # PR https://github.com/vllm-project/vllm/pull/30845 + # TODO: enable Blockwise group shapes + # + ] KERNEL_GROUPSHAPE_COMBINATIONS = ( @@ -99,9 +106,7 @@ ) # For Aiter tests we toggle use_aiter_quant_op -AITER_KERNEL_GROUPSHAPE_COMBINATIONS = [ - # Per-token with ROCmFP8ScaledMMLinearKernel - (ROCmFP8ScaledMMLinearKernel, GroupShape.PER_TENSOR, False), +AITER_KERNEL_GROUPSHAPE_COMBINATIONS = [ # Per-token with RowWiseTorchFP8ScaledMMLinearKernel (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN, True), (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN, False), From 727ff29127228e3fba417caec031e6c85c32e1e7 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 11 Feb 2026 08:06:12 +0000 Subject: [PATCH 14/16] add fp8 kv cache test to amd ci Signed-off-by: vllmellm --- .buildkite/test-amd.yaml | 5 ++--- tests/compile/passes/test_fusion.py | 11 +++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 0507c617745a..5e0cd7eabd8c 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1726,11 +1726,10 @@ steps: # we run all backend tests on ROCm - "pytest -v -s tests/compile/passes/test_fusion_attn.py" - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" 
+ - "pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile" # TODO: this test is not supported on ROCm, there are aiter kernels for this. # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - # TODO: this test is not supported on ROCm - # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Quick (MI325) diff --git a/tests/compile/passes/test_fusion.py b/tests/compile/passes/test_fusion.py index aa733a744db3..05f766b1e9e8 100644 --- a/tests/compile/passes/test_fusion.py +++ b/tests/compile/passes/test_fusion.py @@ -87,16 +87,15 @@ (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN), # ChannelWiseTorchFP8ScaledMMLinearKernel only supports per-token (ChannelWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN), - # + # # Blockwise group shapes (no kernel abstraction) # (None, GroupShape(1, 128)), # (None, GroupShape(1, 64)), - # - # ^ This is not supported yet: See + # + # ^ This is not supported yet: See # PR https://github.com/vllm-project/vllm/pull/30845 # TODO: enable Blockwise group shapes - # - + # ] KERNEL_GROUPSHAPE_COMBINATIONS = ( @@ -106,7 +105,7 @@ ) # For Aiter tests we toggle use_aiter_quant_op -AITER_KERNEL_GROUPSHAPE_COMBINATIONS = [ +AITER_KERNEL_GROUPSHAPE_COMBINATIONS = [ # Per-token with RowWiseTorchFP8ScaledMMLinearKernel (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN, True), (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN, False), From 0aa05b1e5f3f2cbdfc0747b75acd4de072261813 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 11 Feb 2026 08:17:25 +0000 Subject: [PATCH 15/16] run PyTorch Compilation Unit Tests in multiple instances Signed-off-by: vllmellm --- .buildkite/test-amd.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 5e0cd7eabd8c..2a41df731e05 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -590,7 +590,9 @@ steps: - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" # TODO: clean up this comment if not needed. It is used to # keep track of the tests changes during vLLM IR Ops refactoring. - - pytest -s -v compile/passes --ignore compile/passes/distributed + # Use `find` to launch multiple instances of pytest. + - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Fullgraph Smoke Test # 15min From 5f5f4423ece39ea3ed940fdeef17063abccb8945 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Wed, 11 Feb 2026 08:33:48 +0000 Subject: [PATCH 16/16] remove async tp tests for AMD ci Signed-off-by: vllmellm --- .buildkite/test-amd.yaml | 54 ------------------- tests/compile/fusions_e2e/models.py | 30 +++++------ .../compile/fusions_e2e/test_tp2_async_tp.py | 14 ++--- 3 files changed, 18 insertions(+), 80 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 2a41df731e05..5d986207d354 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1776,57 +1776,3 @@ steps: - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'" - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" -- label: Fusion E2E TP2 Quick (MI325) - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_1 - num_devices: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/compilation/ - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run all models and attn backends but only 
Inductor partition and native custom ops - - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" - -# corresponds to .buildkite/test_areas/compile.yaml -- label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_2 - num_devices: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run just llama3 (fp8 & bf16) for all config combinations - - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - -- label: Fusion E2E TP2 (MI325) - mirror_hardwares: [amdexperimental, amdproduction, tj, tj2] - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - agent_pool: mi325_2 - num_devices: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/compilation/ - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index 77cb1b4d3ad9..c31cfe97426c 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -75,9 +75,9 @@ rms_quant_fusion=n_layers * 2, act_quant_fusion=n_layers, attn_quant_fusion=n_layers, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not 
supported + async_tp=0, # Not supported ) ), ) @@ -101,9 +101,9 @@ rms_quant_fusion=0, act_quant_fusion=n_layers, attn_quant_fusion=n_layers, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -131,7 +131,7 @@ lambda n_layers: Matches( rms_quant_fusion=n_layers, attn_quant_fusion=n_layers, - sequence_parallel=n_layers * 2, + sequence_parallel=0, # Not supported ) ), ) @@ -154,8 +154,8 @@ lambda n_layers: Matches( rms_quant_fusion=0, attn_quant_fusion=n_layers, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -175,9 +175,9 @@ else ( lambda n_layers: Matches( norm_rope_fusion=n_layers, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -200,9 +200,9 @@ lambda n_layers: Matches( rms_quant_fusion=n_layers, norm_rope_fusion=n_layers, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py index fb743c1ba7d3..4769ca1e0b63 100644 --- a/tests/compile/fusions_e2e/test_tp2_async_tp.py +++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py @@ -5,7 +5,6 @@ import pytest from vllm.config import PassConfig -from vllm.platforms import current_platform from ...utils import multi_gpu_test from .common import ( @@ -17,8 +16,6 @@ ) from .models import ( FLASHINFER_ATTN, - ROCM_AITER_UNIFIED_ATTN, - ROCM_ATTN, TRITON_ATTN, llama3_8b, llama3_8b_fp8, @@ -32,14 +29,9 @@ "model_name, matches_fn, model_kwargs, hf_overrides", [llama3_8b_fp8, llama4_scout_fp8], ) 
-@pytest.mark.parametrize( - "attn_backend", - [TRITON_ATTN, FLASHINFER_ATTN] - if current_platform.is_cuda() - else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], -) +@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("quant_fp8", "rms_norm"))) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fp8_fusions( model_name: str, @@ -104,7 +96,7 @@ def test_tp2_async_tp_fp8_fusions( ) @pytest.mark.parametrize("attn_backend", [TRITON_ATTN]) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("rms_norm"))) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fusions( model_name: str,