From 230246d9349db65878aee63f7c4c8b4a920fa821 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Mon, 9 Feb 2026 08:28:20 +0000 Subject: [PATCH 01/45] try to enable new fusion pass test for ROCm Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 174 +++++++++++++----- tests/compile/fullgraph/test_full_graph.py | 34 +++- tests/compile/fusions_e2e/models.py | 83 +++++++-- tests/compile/fusions_e2e/test_tp1_quant.py | 15 +- tests/compile/passes/test_fusion_attn.py | 2 +- .../passes/test_silu_mul_quant_fusion.py | 15 +- .../layers/quantization/input_quant_fp8.py | 1 + 7 files changed, 245 insertions(+), 79 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 503b3a76f941..986708e37641 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -570,9 +570,11 @@ steps: --ignore=lora/test_qwen3moe_tp.py parallelism: 4 +##### .buildkite/test_areas/pytorch.yaml ##### +# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -586,10 +588,14 @@ steps: # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # TODO: clean up this comment if not needed. It is used to + # keep track of the tests changes during vLLM IR Ops refactoring. 
+ - pytest -s -v compile/passes --ignore compile/passes/distributed +# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -603,9 +609,10 @@ steps: # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" -- label: PyTorch Fullgraph Test # 27min - timeout_in_minutes: 40 - mirror_hardwares: [amdexperimental, amdproduction] +# corresponds to .buildkite/test_areas/pytorch.yaml +- label: PyTorch Fullgraph # 27min + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -1176,41 +1183,6 @@ steps: - pytest -v -s tests/kernels/moe/test_flashinfer.py - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py -- label: Blackwell Fusion and Compile Tests # 30 min - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - gpu: b200 - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/passes/test_fusion_attn.py - - tests/compile/passes/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py - commands: - - nvidia-smi - - pytest -v -s tests/compile/passes/test_fusion_attn.py - - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py - 
# this runner has 2 GPUs available even though num_gpus=2 is not set - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - - # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time - # # Wrap with quotes to escape yaml - # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" - # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 - # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile - - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 working_dir: "/vllm-workspace/" @@ -1334,7 +1306,6 @@ steps: - pytest -v -s ./compile/test_wrapper.py - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - pytest -v -s distributed/test_sequence_parallel.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown - pytest -v -s v1/worker/test_worker_memory_snapshot.py @@ -1558,17 +1529,20 @@ steps: num_gpus: 2 commands: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py + # ================= 24 passed, 11 warnings in 192.85s (0:03:12) ================== - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py + # ================== 48 passed, 8 warnings in 386.41s (0:06:26) ================== - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + # ======================== 8 skipped, 9 warnings in 2.08s ======================== #- pytest -v -s 
tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
  # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
  # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
-
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
  - pytest -v -s tests/distributed/test_context_parallel.py
+  # ======================== 4 passed, 3 warnings in 30.45s ========================
  - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
  - pytest -v -s tests/v1/distributed/test_dbo.py
+  # ======================== 2 skipped, 3 warnings in 1.97s ========================

##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1692,3 +1666,115 @@ steps:
   working_dir: "/vllm-workspace"
   commands:
     - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
+
+##### .buildkite/test_areas/compile.yaml #####
+# Slowly setting up the tests so that it is also easier for the
+# CI team to review and upstream to the pipelinev2.
+# The following tests are important for vLLM IR Ops refactoring,
+# which affects fusion passes on ROCm. So we have to
+# enable them as soon as possible. 
+ +# corresponds to .buildkite/test_areas/compile.yaml +- label: Sequence Parallel Correctness Tests (2xMI325 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_2 + num_devices: 2 + source_file_dependencies: + - vllm/model_executor/layers/ + - vllm/compilation/ + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - tests/compile/correctness_e2e/test_sequence_parallel.py + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion and Compile Unit Tests (2xMI325 GPUs) + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/ + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/fullgraph/test_full_graph.py + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/ + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes + - tests/compile/passes/test_fusion_attn.py + - tests/compile/passes/test_silu_mul_quant_fusion.py + - tests/compile/passes/distributed/test_fusion_all_reduce.py + - tests/compile/fullgraph/test_full_graph.py + commands: + - rocm-smi + # we run all 
backend tests on ROCm + - pytest -v -s tests/compile/passes/test_fusion_attn.py + - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py + # TODO: this test is not supported on ROCm, there are aiter kernels for this. + # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + # TODO: this test is not supported on ROCm + # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E Quick (MI325) + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_1 + num_devices: 1 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E Config Sweep (MI325) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_1 + num_devices: 1 + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run just 
llama3 (fp8) for all config combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" + +# corresponds to .buildkite/test_areas/kernels.yaml +# Skip the following tests as they are not supported on ROCm +# - label: Fusion E2E TP2 AR-RMS Config Sweep (H100) +# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100) +# - label: Fusion E2E TP2 (B200) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index ed4c92d90ff7..733ec22c98d6 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -194,13 +194,31 @@ def test_custom_compile_config( ) @pytest.mark.parametrize( "model, backend", - [ - ("Qwen/Qwen2-0.5B", None), # Standard attention model - ( - "deepseek-ai/DeepSeek-V2-Lite", - AttentionBackendEnum.FLASHINFER_MLA, - ), # MLA (Multi-head Latent Attention) model - ], + ( + [ + ("Qwen/Qwen2-0.5B", None), # Standard attention model + ( + "deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.FLASHINFER_MLA, + ), # MLA (Multi-head Latent Attention) model + ] + if current_platform.is_cuda() + else [ + # ("Qwen/Qwen2-0.5B", None), # Standard attention model + # ( + # "deepseek-ai/DeepSeek-V2-Lite", + # AttentionBackendEnum.TRITON_MLA, + # ), # MLA (Multi-head Latent Attention) model + ( + "deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.ROCM_AITER_MLA, + ), # MLA (Multi-head Latent Attention) model + ( + "deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.ROCM_AITER_TRITON_MLA, + ), # MLA (Multi-head Latent Attention) model + ] + ), ) def test_fp8_kv_scale_compile( compilation_mode: int, @@ -209,7 +227,7 @@ def test_fp8_kv_scale_compile( ): model_kwargs = { "quantization": "fp8", - "kv_cache_dtype": "fp8_e4m3", + "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", "calculate_kv_scales": True, 
"max_model_len": 512, } diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index f54f617c64d4..525ed1b515bc 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer from vllm.v1.attention.backends.registry import AttentionBackendEnum @@ -24,37 +25,83 @@ AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN" ) +# ROCm backends +ROCM_ATTN = pytest.param( + AttentionBackendCase(backend=AttentionBackendEnum.ROCM_ATTN), id="ROCM_ATTN" +) + +ROCM_AITER_UNIFIED_ATTN = pytest.param( + AttentionBackendCase(backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN), + id="ROCM_AITER_UNIFIED_ATTN", +) + # Models llama3_8b = ModelFusionInfo( model_name="meta-llama/Llama-3.1-8B-Instruct", - matches=lambda n_layers: Matches( - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + matches=( + lambda n_layers: Matches( + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ), + ) + if current_platform.is_cuda() + else ( # ROCm matches + lambda n_layers: Matches( + ar_rms_fusion=0, + sequence_parallel=0, + async_tp=0, + ), ), ) llama3_8b_fp8 = ModelFusionInfo( model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - matches=lambda n_layers: Matches( - rms_quant_fusion=n_layers * 2, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=n_layers * 2, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) + ) + if current_platform.is_cuda() + else ( # ROCm 
matches + lambda n_layers: Matches( + rms_quant_fusion=n_layers * 2, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=0, + async_tp=0, + ), ), ) llama3_8b_fp4 = ModelFusionInfo( model_name="nvidia/Llama-3.1-8B-Instruct-FP4", - matches=lambda n_layers: Matches( - rms_quant_fusion=0, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=0, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) + ) + if current_platform.is_cuda() + else ( # ROCm matches + lambda n_layers: Matches( + rms_quant_fusion=0, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=0, + async_tp=0, + ), ), ) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 03f102794f85..25a607051a89 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -5,6 +5,7 @@ import pytest from vllm.config import PassConfig +from vllm.platforms import current_platform from .common import ( INDUCTOR_GRAPH_PARTITION, @@ -15,11 +16,12 @@ ) from .models import ( FLASHINFER_ATTN, + ROCM_AITER_UNIFIED_ATTN, + ROCM_ATTN, TRITON_ATTN, llama3_8b_fp4, llama3_8b_fp8, llama4_scout_fp4, - llama4_scout_fp8, qwen3_a3b_fp8, ) @@ -28,12 +30,17 @@ "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm", [ (*llama3_8b_fp8, False), - (*llama4_scout_fp8, False), + # (*llama4_scout_fp8, False), (*qwen3_a3b_fp8, False), - (*qwen3_a3b_fp8, True), + # (*qwen3_a3b_fp8, True), ], ) -@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) +@pytest.mark.parametrize( + "attn_backend", + [TRITON_ATTN, FLASHINFER_ATTN] + if current_platform.is_cuda() + else 
[TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], +) @pytest.mark.parametrize("n_layers", [6]) @pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) diff --git a/tests/compile/passes/test_fusion_attn.py b/tests/compile/passes/test_fusion_attn.py index 75d5c42f0731..a35db7bb21ff 100644 --- a/tests/compile/passes/test_fusion_attn.py +++ b/tests/compile/passes/test_fusion_attn.py @@ -267,7 +267,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor): PATTERN_TEST_MODELS_FP8 = [ ("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel) ] - BACKENDS = [ + BACKENDS_FP8 = [ AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN, AttentionBackendEnum.ROCM_ATTN, AttentionBackendEnum.TRITON_ATTN, diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index c5ef015015ce..64aad53525a5 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools +from typing import Any import pytest import torch @@ -148,6 +149,9 @@ def __init__(self, hidden_size: int, **kwargs): weight_group_shape=GroupShape(128, 128), act_quant_group_shape=GroupShape(1, 128), cutlass_block_fp8_supported=False, + # this parameter cannot always be True, + # it depends on the VLLM_ROCM_USE_AITER + # and VLLM_ROCM_USE_AITER_LINEAR environment variables use_aiter_and_is_supported=True, ) self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() @@ -181,6 +185,12 @@ def ops_in_model_after(self): ] TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS +EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [ + (TestSiluMulGroupFp8QuantModel, False, None), +] +if current_platform.is_cuda(): 
+    EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None))
+

 @pytest.mark.parametrize("num_tokens", [32, 64])
 @pytest.mark.parametrize("hidden_size", [128, 256])
@@ -189,10 +199,7 @@ def ops_in_model_after(self):
 @pytest.mark.parametrize(
     "model_class, enable_quant_fp8_custom_op, force_kernel",
     list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS))
-    + [
-        (TestSiluMulNvfp4QuantModel, False, None),
-        (TestSiluMulGroupFp8QuantModel, False, None),
-    ],
+    + EXTENDED_TESTCASES,
 )
 @pytest.mark.skipif(
     envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm"
diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py
index 5bc78afa43b0..ed3b981cf183 100644
--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -171,6 +171,7 @@ def forward_native(
         x: torch.Tensor,
         scale: torch.Tensor | None = None,
         scale_ub: torch.Tensor | None = None,
+        **kwargs,
     ):
         if self.is_group_quant and not self.static:
             assert scale is None, "Dynamic group quantization does not use scale"

From 1c9552affe7f37454cde6b496a64b26e0d859ce0 Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Mon, 9 Feb 2026 13:21:08 +0000
Subject: [PATCH 02/45] fix silu-mul-groupquant fusion test

Signed-off-by: vllmellm
---
 .../passes/test_silu_mul_quant_fusion.py           | 10 +++++++---
 .../passes/fusion/rocm_aiter_fusion.py             | 18 +++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py
index 64aad53525a5..687bb9aa6bfe 100644
--- a/tests/compile/passes/test_silu_mul_quant_fusion.py
+++ b/tests/compile/passes/test_silu_mul_quant_fusion.py
@@ -10,7 +10,7 @@
 from tests.compile.backend import TestBackend
 from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
 from tests.utils import 
TestFP8Layer -from vllm._aiter_ops import IS_AITER_FOUND +from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.compilation.passes.fusion.act_quant_fusion import ( FUSED_OPS, @@ -186,7 +186,7 @@ def ops_in_model_after(self): TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [ - (TestSiluMulGroupFp8QuantModel, False, None), + (TestSiluMulGroupFp8QuantModel, True, None), ] if current_platform.is_cuda(): EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None)) @@ -216,6 +216,7 @@ def test_fusion_silu_and_mul_quant( enable_silu_mul_custom_op: bool, enable_quant_fp8_custom_op: bool, force_kernel: FP8ScaledMMLinearKernel | None, + monkeypatch: pytest.MonkeyPatch, ): if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported(): pytest.skip("NVFP4 is not supported on this GPU.") @@ -242,13 +243,16 @@ def test_fusion_silu_and_mul_quant( ), ) - with set_current_vllm_config(config): + with set_current_vllm_config(config), monkeypatch.context() as m: fusion_passes = [ActivationQuantFusionPass(config)] if IS_AITER_FOUND: from vllm.compilation.passes.fusion.rocm_aiter_fusion import ( RocmAiterSiluMulFp8GroupQuantFusionPass, ) + m.setenv("VLLM_ROCM_USE_AITER", "1") + rocm_aiter_ops.refresh_env_variables() + fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)] diff --git a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py index d8131ce952d2..99278365c5db 100644 --- a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py +++ b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py @@ -5,7 +5,6 @@ import torch._inductor.pattern_matcher as pm from torch import fx from torch._inductor.pattern_matcher import PatternMatcherPass -from torch._ops import 
OpOverload import vllm.model_executor.layers.quantization.utils.fp8_utils # noqa: F401 from vllm._aiter_ops import rocm_aiter_ops @@ -15,6 +14,7 @@ GroupShape, QuantKey, ScaleDesc, + kFp8Static128BlockSym, ) from vllm.platforms import current_platform @@ -332,9 +332,11 @@ class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern): FUSED_SILU_MUL_QUANT_OP = rocm_aiter_ops.get_act_mul_fused_fp8_group_quant_op() - def __init__(self, quant_op: OpOverload) -> None: + def __init__(self) -> None: self.silu_and_mul_matcher = MatcherSiluAndMul() - self.quant_op = quant_op + self.quant_matcher = MatcherQuantFP8( + quant_key=kFp8Static128BlockSym, match_rocm_aiter=True + ) def get_inputs(self) -> list[torch.Tensor]: return [ @@ -346,7 +348,7 @@ def pattern( input: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: at1 = self.silu_and_mul_matcher(input) - at2 = self.quant_op(at1, 128) + at2 = self.quant_matcher(at1, 128) return at2[0], at2[1] def replacement( @@ -370,11 +372,6 @@ class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass): https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 """ - AITER_GROUP_FP8_QUANT_OP = rocm_aiter_ops.get_group_quant_op() - TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default - - QUANT_OPS = [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP] - @enable_fake_mode def __init__(self, config: VllmConfig) -> None: super().__init__(config) @@ -383,8 +380,7 @@ def __init__(self, config: VllmConfig) -> None: pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass" ) - for quant_op in self.QUANT_OPS: - AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns) + AiterSiluMulFp8GroupQuantPattern().register(self.patterns) self.dump_patterns(config, self.patterns) From bffe1814354d8ed1055c6a02ecdde905f3f61549 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 07:42:59 +0000 Subject: [PATCH 03/45] fix full graph test Signed-off-by: vllmellm --- 
tests/compile/fullgraph/test_full_graph.py | 27 ++++++++++++------- .../layers/attention/mla_attention.py | 1 + 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 733ec22c98d6..921f57cea0a6 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -10,6 +10,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams +from vllm._aiter_ops import rocm_aiter_ops from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -224,17 +225,25 @@ def test_fp8_kv_scale_compile( compilation_mode: int, model: str, backend: AttentionBackendEnum | None, + monkeypatch: pytest.MonkeyPatch, ): - model_kwargs = { - "quantization": "fp8", - "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", - "calculate_kv_scales": True, - "max_model_len": 512, - } - if backend: - model_kwargs["attention_config"] = {"backend": backend.name} + with monkeypatch.context() as m: + model_kwargs = { + "quantization": "fp8", + "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", + "calculate_kv_scales": True, + "max_model_len": 512, + } + if backend: + model_kwargs["attention_config"] = {"backend": backend.name} + if current_platform.is_rocm(): + m.setenv("VLLM_ROCM_USE_AITER", "1") + # Disable Aiter MOE as some shapes are not supported + m.setenv("VLLM_ROCM_USE_AITER_MOE", "0") + + rocm_aiter_ops.refresh_env_variables() - run_model(compilation_mode, model, **model_kwargs) + run_model(compilation_mode, model, **model_kwargs) def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 
c31aa7b41d0d..ed31a2d176e3 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -403,6 +403,7 @@ def __init__( self.is_aiter_triton_fp4_bmm_enabled = ( rocm_aiter_ops.is_fp4bmm_enabled() and self.kv_b_proj.weight.dtype == torch.bfloat16 + and current_platform.has_device_capability(95) # gfx950 and above ) # Attributes for forward_impl method From 28ed03f12de4842576366a34efbf31d44cb2a34a Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 07:52:31 +0000 Subject: [PATCH 04/45] clearer test case for silu mul and group quant test Signed-off-by: vllmellm --- tests/compile/passes/test_silu_mul_quant_fusion.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index 687bb9aa6bfe..c6794a156240 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -184,10 +184,11 @@ def ops_in_model_after(self): PerTensorTorchFP8ScaledMMLinearKernel, ] TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS +EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [] +# SiluMulGroupFp8Quant is only supported on ROCm +if current_platform.is_rocm(): + EXTENDED_TESTCASES.append((TestSiluMulGroupFp8QuantModel, True, None)) -EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [ - (TestSiluMulGroupFp8QuantModel, True, None), -] if current_platform.is_cuda(): EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None)) @@ -245,7 +246,7 @@ def test_fusion_silu_and_mul_quant( with set_current_vllm_config(config), monkeypatch.context() as m: fusion_passes = [ActivationQuantFusionPass(config)] - if IS_AITER_FOUND: + if current_platform.is_rocm() and IS_AITER_FOUND: from vllm.compilation.passes.fusion.rocm_aiter_fusion import ( RocmAiterSiluMulFp8GroupQuantFusionPass, ) From 
5628eb9b272da81a81da4477cb5774fdf7143632 Mon Sep 17 00:00:00 2001
From: vllmellm
Date: Tue, 10 Feb 2026 08:40:13 +0000
Subject: [PATCH 05/45] fix e2e fusion tests

Signed-off-by: vllmellm
---
 .buildkite/test-amd.yaml                      |  71 +++++++++-
 tests/compile/fusions_e2e/models.py           | 125 ++++++++++++------
 .../compile/fusions_e2e/test_tp2_async_tp.py  |  14 +-
 3 files changed, 164 insertions(+), 46 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 986708e37641..64a878baa774 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1691,6 +1691,17 @@ steps:
   - export VLLM_TEST_CLEAN_GPU_MEMORY=1
   - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
 
+# corresponds to .buildkite/test_areas/compile.yaml
+- label: AsyncTP Correctness Tests (2xMI325 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: mi325_2
+  optional: true
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
+
 # corresponds to .buildkite/test_areas/compile.yaml
 - label: Fusion and Compile Unit Tests (2xMI325 GPUs)
   timeout_in_minutes: 20
@@ -1750,7 +1761,8 @@ steps:
   # Run all models and attn backends but only Inductor partition and native custom ops
   - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
   # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
+  # TODO: Qwen uses group quantization, which the pattern matcher on ROCm does not support yet. 
+ # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Config Sweep (MI325) @@ -1771,10 +1783,57 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" -# corresponds to .buildkite/test_areas/kernels.yaml -# Skip the following tests as they are not supported on ROCm -# - label: Fusion E2E TP2 AR-RMS Config Sweep (H100) -# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100) -# - label: Fusion E2E TP2 (B200) +- label: Fusion E2E TP2 Quick (MI325) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: mi325_1 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + +# corresponds to .buildkite/test_areas/compile.yaml +- label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: mi325_2 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/attention/attention.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run just llama3 (fp8 & bf16) for all config 
combinations + - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" + +- label: Fusion E2E TP2 (MI325) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + device: mi325_2 + num_devices: 2 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/ + - vllm/v1/attention/ + - vllm/compilation/ + - tests/compile/fusions_e2e/ + commands: + - rocm-smi + # Run all models and attn backends but only Inductor partition and native custom ops + - pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index 525ed1b515bc..77cb1b4d3ad9 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -43,15 +43,16 @@ ar_rms_fusion=n_layers * 2 + 1, sequence_parallel=n_layers * 2 + 1, async_tp=n_layers * 4, - ), + ) ) if current_platform.is_cuda() - else ( # ROCm matches + # ROCm matches + else ( lambda n_layers: Matches( ar_rms_fusion=0, - sequence_parallel=0, - async_tp=0, - ), + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) ), ) @@ -68,15 +69,16 @@ ) ) if current_platform.is_cuda() - else ( # ROCm matches + # ROCm matches + else ( lambda n_layers: Matches( rms_quant_fusion=n_layers * 2, act_quant_fusion=n_layers, attn_quant_fusion=n_layers, ar_rms_fusion=0, - sequence_parallel=0, - async_tp=0, - ), + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) ), ) @@ -93,15 +95,16 @@ ) ) if current_platform.is_cuda() - else ( # ROCm matches + # ROCm matches + else ( lambda n_layers: Matches( rms_quant_fusion=0, act_quant_fusion=n_layers, attn_quant_fusion=n_layers, ar_rms_fusion=0, - sequence_parallel=0, - async_tp=0, - ), + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, + ) ), ) @@ -113,45 +116,93 @@ llama4_scout_fp8 = ModelFusionInfo( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=lambda 
n_layers: {"text_config": {"num_hidden_layers": n_layers}}, - matches=lambda n_layers: Matches( - rms_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + sequence_parallel=n_layers * 2, + ) ), ) llama4_scout_fp4 = ModelFusionInfo( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4", hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}}, - matches=lambda n_layers: Matches( - rms_quant_fusion=0, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=0, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + rms_quant_fusion=0, + attn_quant_fusion=n_layers, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, + ) ), ) qwen3_a3b = ModelFusionInfo( model_name="Qwen/Qwen3-30B-A3B", - matches=lambda n_layers: Matches( - norm_rope_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + matches=( + lambda n_layers: Matches( + norm_rope_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + norm_rope_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) ), ) 
qwen3_a3b_fp8 = ModelFusionInfo( model_name="Qwen/Qwen3-30B-A3B-FP8", - matches=lambda n_layers: Matches( - rms_quant_fusion=n_layers, - norm_rope_fusion=n_layers, - attn_quant_fusion=0, # attn + group quant not supported - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + matches=( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + norm_rope_fusion=n_layers, + attn_quant_fusion=0, # attn + group quant not supported + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) + ) + if current_platform.is_cuda() + # ROCm matches + else ( + lambda n_layers: Matches( + rms_quant_fusion=n_layers, + norm_rope_fusion=n_layers, + ar_rms_fusion=0, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, + ) ), ) diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py index 4769ca1e0b63..fb743c1ba7d3 100644 --- a/tests/compile/fusions_e2e/test_tp2_async_tp.py +++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py @@ -5,6 +5,7 @@ import pytest from vllm.config import PassConfig +from vllm.platforms import current_platform from ...utils import multi_gpu_test from .common import ( @@ -16,6 +17,8 @@ ) from .models import ( FLASHINFER_ATTN, + ROCM_AITER_UNIFIED_ATTN, + ROCM_ATTN, TRITON_ATTN, llama3_8b, llama3_8b_fp8, @@ -29,9 +32,14 @@ "model_name, matches_fn, model_kwargs, hf_overrides", [llama3_8b_fp8, llama4_scout_fp8], ) -@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) +@pytest.mark.parametrize( + "attn_backend", + [TRITON_ATTN, FLASHINFER_ATTN] + if current_platform.is_cuda() + else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], +) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) +@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("quant_fp8", "rms_norm"))) @pytest.mark.parametrize("inductor_graph_partition", 
INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fp8_fusions( model_name: str, @@ -96,7 +104,7 @@ def test_tp2_async_tp_fp8_fusions( ) @pytest.mark.parametrize("attn_backend", [TRITON_ATTN]) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) +@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("rms_norm"))) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fusions( model_name: str, From 119b4b01b345e17264c84d37f2acd060e6a5448a Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 10:18:50 +0000 Subject: [PATCH 06/45] fix tests in fusion silu_mul and tidy up kite Signed-off-by: vllmellm --- .buildkite/test-amd.yaml | 4 ++++ .../passes/test_silu_mul_quant_fusion.py | 19 +++++++------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 64a878baa774..33ba6689faa5 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1695,6 +1695,7 @@ steps: - label: AsyncTP Correctness Tests (2xMI325 GPUs) timeout_in_minutes: 50 working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] device: mi325_2 optional: true num_devices: 2 @@ -1789,6 +1790,7 @@ steps: - label: Fusion E2E TP2 Quick (MI325) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] device: mi325_1 num_devices: 2 source_file_dependencies: @@ -1806,6 +1808,7 @@ steps: - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" + mirror_hardwares: [amdexperimental, amdproduction, tj] device: mi325_2 num_devices: 2 source_file_dependencies: @@ -1823,6 +1826,7 @@ steps: - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" - label: Fusion E2E TP2 (MI325) + mirror_hardwares: [amdexperimental, amdproduction, tj] timeout_in_minutes: 40 working_dir: 
"/vllm-workspace/" device: mi325_2 diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index c6794a156240..abd32c38ca04 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools -from typing import Any import pytest import torch @@ -10,7 +9,7 @@ from tests.compile.backend import TestBackend from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor from tests.utils import TestFP8Layer -from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops +from vllm._aiter_ops import IS_AITER_FOUND from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.compilation.passes.fusion.act_quant_fusion import ( FUSED_OPS, @@ -184,13 +183,6 @@ def ops_in_model_after(self): PerTensorTorchFP8ScaledMMLinearKernel, ] TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS -EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [] -# SiluMulGroupFp8Quant is only supported on ROCm -if current_platform.is_rocm(): - EXTENDED_TESTCASES.append((TestSiluMulGroupFp8QuantModel, True, None)) - -if current_platform.is_cuda(): - EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None)) @pytest.mark.parametrize("num_tokens", [32, 64]) @@ -200,7 +192,10 @@ def ops_in_model_after(self): @pytest.mark.parametrize( "model_class, enable_quant_fp8_custom_op, force_kernel", list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) - + EXTENDED_TESTCASES, + + [ + (TestSiluMulNvfp4QuantModel, False, None), + (TestSiluMulGroupFp8QuantModel, True, None), + ], ) @pytest.mark.skipif( envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm" @@ -246,14 +241,14 @@ def test_fusion_silu_and_mul_quant( with set_current_vllm_config(config), 
monkeypatch.context() as m: fusion_passes = [ActivationQuantFusionPass(config)] - if current_platform.is_rocm() and IS_AITER_FOUND: + if IS_AITER_FOUND and model_class is TestSiluMulGroupFp8QuantModel: + from vllm._aiter_ops import rocm_aiter_ops from vllm.compilation.passes.fusion.rocm_aiter_fusion import ( RocmAiterSiluMulFp8GroupQuantFusionPass, ) m.setenv("VLLM_ROCM_USE_AITER", "1") rocm_aiter_ops.refresh_env_variables() - fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)] From 218fcfb221df513d2c000e3e133e27f9e6f1d010 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 10 Feb 2026 10:25:11 +0000 Subject: [PATCH 07/45] remove unnecessary change Signed-off-by: vllmellm --- vllm/model_executor/layers/quantization/input_quant_fp8.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index ed3b981cf183..5bc78afa43b0 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -171,7 +171,6 @@ def forward_native( x: torch.Tensor, scale: torch.Tensor | None = None, scale_ub: torch.Tensor | None = None, - **kwargs, ): if self.is_group_quant and not self.static: assert scale is None, "Dynamic group quantization does not use scale" From befaba1bc95b823ac36b0091632b206b8c6faa76 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 13:46:06 +0000 Subject: [PATCH 08/45] remove duplicate Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 33ba6689faa5..149767bdd06b 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1721,18 +1721,6 @@ steps: - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py - 
tests/compile/fullgraph/test_full_graph.py - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/ - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes - - tests/compile/passes/test_fusion_attn.py - - tests/compile/passes/test_silu_mul_quant_fusion.py - - tests/compile/passes/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py commands: - rocm-smi # we run all backend tests on ROCm From ca801a13e3d9dc9c37bb1e429a7134cc4e28e58a Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 15:16:30 +0000 Subject: [PATCH 09/45] need to add quote Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 149767bdd06b..8d9c6eb62db0 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -620,7 +620,7 @@ steps: - vllm/ - tests/compile commands: - - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + - "pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'" # # Limit to no custom ops to reduce running time # # Wrap with quotes to escape yaml and avoid starting -k string with a - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" @@ -1689,7 +1689,7 @@ steps: - tests/compile/correctness_e2e/test_sequence_parallel.py commands: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py + - "pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py" # corresponds to .buildkite/test_areas/compile.yaml - label: AsyncTP Correctness Tests (2xMI325 GPUs) @@ -1701,7 
+1701,7 @@ steps: num_devices: 2 commands: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py + - "pytest -v -s tests/compile/correctness_e2e/test_async_tp.py" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion and Compile Unit Tests (2xMI325 GPUs) @@ -1724,8 +1724,8 @@ steps: commands: - rocm-smi # we run all backend tests on ROCm - - pytest -v -s tests/compile/passes/test_fusion_attn.py - - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py + - "pytest -v -s tests/compile/passes/test_fusion_attn.py" + - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" # TODO: this test is not supported on ROCm, there are aiter kernels for this. # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) @@ -1748,7 +1748,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet. 
# - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" @@ -1772,8 +1772,8 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" - - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"' + - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"' - label: Fusion E2E TP2 Quick (MI325) timeout_in_minutes: 40 @@ -1790,7 +1790,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) @@ -1811,7 +1811,7 @@ steps: commands: - rocm-smi # Run just llama3 (fp8 & bf16) for all config combinations - - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3" + - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"' - label: Fusion E2E TP2 (MI325) mirror_hardwares: [amdexperimental, amdproduction, tj] @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8" + - 'pytest -v -s 
vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' From 0b65174b5ad3074344ae519852d28561bd155f46 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 15:47:57 +0000 Subject: [PATCH 10/45] fix syntax Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 8d9c6eb62db0..439075c6a843 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1748,7 +1748,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet. 
# - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" @@ -1772,8 +1772,8 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations - - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"' - - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'" + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" - label: Fusion E2E TP2 Quick (MI325) timeout_in_minutes: 40 @@ -1790,7 +1790,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) @@ -1811,7 +1811,7 @@ steps: commands: - rocm-smi # Run just llama3 (fp8 & bf16) for all config combinations - - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"' + - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - label: Fusion E2E TP2 (MI325) mirror_hardwares: [amdexperimental, amdproduction, tj] @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - 'pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"' + - "pytest -v -s 
vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" From be40a224ccc64d072f2664d15b58eee9fe46c4b2 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 10 Feb 2026 16:49:52 +0000 Subject: [PATCH 11/45] fix Fusion E2E TP2 (MI325) path Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 439075c6a843..407a3d671803 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1814,7 +1814,7 @@ steps: - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - label: Fusion E2E TP2 (MI325) - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction, tj, tj2] timeout_in_minutes: 40 working_dir: "/vllm-workspace/" device: mi325_2 @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" From d8d071254ffbafcabb69fd9221e08a755298e7e5 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 02:48:27 +0000 Subject: [PATCH 12/45] fix test-amd syntax Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 407a3d671803..8d469a39b042 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1696,7 +1696,7 @@ steps: timeout_in_minutes: 50 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - device: mi325_2 + agent_pool: mi325_2 optional: true num_devices: 2 commands: @@ -1750,7 +1750,7 @@ steps: # Run 
all models and attn backends but only Inductor partition and native custom ops - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported - # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet. + # TODO: Qwen uses group quantization which the pattern matcher on ROCm is not supported yet. # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" # corresponds to .buildkite/test_areas/compile.yaml @@ -1779,7 +1779,7 @@ steps: timeout_in_minutes: 40 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - device: mi325_1 + agent_pool: mi325_1 num_devices: 2 source_file_dependencies: - csrc/quantization/ @@ -1790,14 +1790,14 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) timeout_in_minutes: 40 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - device: mi325_2 + agent_pool: mi325_2 num_devices: 2 source_file_dependencies: - csrc/quantization/ @@ -1811,13 +1811,13 @@ steps: commands: - rocm-smi # Run just llama3 (fp8 & bf16) for all config combinations - - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" + - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - label: Fusion E2E TP2 (MI325) mirror_hardwares: [amdexperimental, amdproduction, tj, tj2] timeout_in_minutes: 40 working_dir: 
"/vllm-workspace/" - device: mi325_2 + agent_pool: mi325_2 num_devices: 2 source_file_dependencies: - csrc/quantization/ @@ -1828,4 +1828,4 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" From f58033a22e7088f79830ea5f99a146c60b89b8be Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 04:15:47 +0000 Subject: [PATCH 13/45] revert pytorch tests Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 0507c617745a..1329af10973c 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -569,12 +569,9 @@ steps: --ignore=lora/test_gptoss_tp.py \ --ignore=lora/test_qwen3moe_tp.py parallelism: 4 - -##### .buildkite/test_areas/pytorch.yaml ##### -# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -588,14 +585,10 @@ steps: # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - # TODO: clean up this comment if not needed. It is used to - # keep track of the tests changes during vLLM IR Ops refactoring. 
- - pytest -s -v compile/passes --ignore compile/passes/distributed -# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Fullgraph Smoke Test # 15min - timeout_in_minutes: 35 - mirror_hardwares: [amdexperimental, amdproduction, tj] + timeout_in_minutes: 30 + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -609,10 +602,9 @@ steps: # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" -# corresponds to .buildkite/test_areas/pytorch.yaml -- label: PyTorch Fullgraph # 27min - timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, tj] +- label: PyTorch Fullgraph Test # 27min + timeout_in_minutes: 40 + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -620,7 +612,7 @@ steps: - vllm/ - tests/compile commands: - - "pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'" + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' # # Limit to no custom ops to reduce running time # # Wrap with quotes to escape yaml and avoid starting -k string with a - # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" @@ -1529,20 +1521,17 @@ steps: num_gpus: 2 commands: - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py - # ================= 24 passed, 11 warnings in 192.85s (0:03:12) ================== - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py - # ================== 48 passed, 8 warnings in 386.41s (0:06:26) ================== - - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - # ======================== 8 skipped, 9 warnings in 2.08s ======================== + # TODO: this test is not 
supported on ROCm, there are aiter kernels for this. + # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'" # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. - pytest -v -s tests/distributed/test_context_parallel.py - # ======================== 4 passed, 3 warnings in 30.45s ======================== - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - - pytest -v -s tests/v1/distributed/test_dbo.py - # ======================== 2 skipped, 3 warnings in 1.97s ======================== + # TODO: this test is not supported on ROCm + # - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### - label: Distributed Tests (B200) # optional From eabee32b32f103afd0ec64a0fb08045b8357e288 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 05:01:29 +0000 Subject: [PATCH 14/45] fix agent pool Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 2 +- tests/compile/passes/test_silu_mul_quant_fusion.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 1329af10973c..02f91563ed00 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1768,7 +1768,7 @@ steps: timeout_in_minutes: 40 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_1 + agent_pool: mi325_2 num_devices: 2 source_file_dependencies: - csrc/quantization/ diff --git 
a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index abd32c38ca04..f6d5e112dd12 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -148,9 +148,6 @@ def __init__(self, hidden_size: int, **kwargs): weight_group_shape=GroupShape(128, 128), act_quant_group_shape=GroupShape(1, 128), cutlass_block_fp8_supported=False, - # this parameter cannot always be True, - # it depends on the VLLM_ROCM_USE_AITER - # and VLLM_ROCM_USE_AITER_LINEAR environment variables use_aiter_and_is_supported=True, ) self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() @@ -185,6 +182,7 @@ def ops_in_model_after(self): TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS + @pytest.mark.parametrize("num_tokens", [32, 64]) @pytest.mark.parametrize("hidden_size", [128, 256]) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @@ -194,6 +192,8 @@ def ops_in_model_after(self): list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) + [ (TestSiluMulNvfp4QuantModel, False, None), + # GroupFP8Quant fusion only works with AITER on ROCm. + # and the enable_quant_fp8_custom_op must be True. 
(TestSiluMulGroupFp8QuantModel, True, None), ], ) From 56ac061c19626b03916a6b200f346092d1d4ba32 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 07:56:41 +0000 Subject: [PATCH 15/45] add fix test_full_graph Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 53 +--------------------- tests/compile/fullgraph/test_full_graph.py | 8 ++-- tests/compile/fusions_e2e/models.py | 2 + 3 files changed, 6 insertions(+), 57 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 02f91563ed00..b3b18b0b1b53 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1719,7 +1719,7 @@ steps: # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) # TODO: this test is not supported on ROCm - # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Quick (MI325) @@ -1765,56 +1765,5 @@ steps: - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" - label: Fusion E2E TP2 Quick (MI325) - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_2 - num_devices: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/compilation/ - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" - -# corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E 
TP2 AsyncTP Config Sweep (MI325) - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_2 - num_devices: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/compilation/ - # can affect pattern matching - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/model_executor/layers/quantization/input_quant_fp8.py - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run just llama3 (fp8 & bf16) for all config combinations - - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'" - - label: Fusion E2E TP2 (MI325) - mirror_hardwares: [amdexperimental, amdproduction, tj, tj2] - timeout_in_minutes: 40 - working_dir: "/vllm-workspace/" - agent_pool: mi325_2 - num_devices: 2 - source_file_dependencies: - - csrc/quantization/ - - vllm/model_executor/ - - vllm/v1/attention/ - - vllm/compilation/ - - tests/compile/fusions_e2e/ - commands: - - rocm-smi - # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 921f57cea0a6..447391903314 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -205,11 +205,9 @@ def test_custom_compile_config( ] if current_platform.is_cuda() else [ - # ("Qwen/Qwen2-0.5B", None), # Standard attention model - # ( - # "deepseek-ai/DeepSeek-V2-Lite", - # AttentionBackendEnum.TRITON_MLA, - # ), # MLA (Multi-head Latent Attention) model + ("Qwen/Qwen2-0.5B", None), # Standard attention model + # AttentionBackendEnum.TRITON_MLA does not support + # fp8 kv scale compile. 
( "deepseek-ai/DeepSeek-V2-Lite", AttentionBackendEnum.ROCM_AITER_MLA, diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index 77cb1b4d3ad9..ffca5bc6c0a5 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -131,7 +131,9 @@ lambda n_layers: Matches( rms_quant_fusion=n_layers, attn_quant_fusion=n_layers, + ar_rms_fusion=0, sequence_parallel=n_layers * 2, + async_tp=0 ) ), ) From b8c0bcdff3aa3baf4b6b2c99feab6d1db7f066d7 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 07:58:25 +0000 Subject: [PATCH 16/45] remove unrelated comment Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index b3b18b0b1b53..766fc76b09b3 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1717,8 +1717,6 @@ steps: - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" # TODO: this test is not supported on ROCm, there are aiter kernels for this. 
# - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) - # TODO: this test is not supported on ROCm - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile # corresponds to .buildkite/test_areas/compile.yaml From 9ef71e4de064fc1fc459f668dc99a223fd34a93d Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 08:04:37 +0000 Subject: [PATCH 17/45] reduce test and compute resource Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 766fc76b09b3..263555e219cf 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1680,24 +1680,12 @@ steps: - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - "pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py" -# corresponds to .buildkite/test_areas/compile.yaml -- label: AsyncTP Correctness Tests (2xMI325 GPUs) - timeout_in_minutes: 50 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_2 - optional: true - num_devices: 2 - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - "pytest -v -s tests/compile/correctness_e2e/test_async_tp.py" - # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_2 + agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs source_file_dependencies: - csrc/quantization/fp4/ - vllm/model_executor/layers/quantization/ @@ -1762,6 +1750,7 @@ steps: - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'" - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not 
+rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" -- label: Fusion E2E TP2 Quick (MI325) -- label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) -- label: Fusion E2E TP2 (MI325) +# - label: AsyncTP Correctness Tests (2xMI325 GPUs) +# - label: Fusion E2E TP2 Quick (MI325) +# - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) +# - label: Fusion E2E TP2 (MI325) From 158ea2fc6d1bdf3f88eacf52a06d8597a9be8cdf Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 10:58:55 +0000 Subject: [PATCH 18/45] skip kvcache tests and reverted the changes in test_full_graph Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 3 +- tests/compile/fullgraph/test_full_graph.py | 43 +++++----------------- 2 files changed, 12 insertions(+), 34 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 263555e219cf..cb5f10295409 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1705,7 +1705,8 @@ steps: - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" # TODO: this test is not supported on ROCm, there are aiter kernels for this. 
# - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + # TODO: find out more details + # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Quick (MI325) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index 447391903314..cf16e7fffd40 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -10,7 +10,6 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm._aiter_ops import rocm_aiter_ops from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig from vllm.platforms import current_platform from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -202,46 +201,24 @@ def test_custom_compile_config( "deepseek-ai/DeepSeek-V2-Lite", AttentionBackendEnum.FLASHINFER_MLA, ), # MLA (Multi-head Latent Attention) model - ] - if current_platform.is_cuda() - else [ - ("Qwen/Qwen2-0.5B", None), # Standard attention model - # AttentionBackendEnum.TRITON_MLA does not support - # fp8 kv scale compile. 
- ( - "deepseek-ai/DeepSeek-V2-Lite", - AttentionBackendEnum.ROCM_AITER_MLA, - ), # MLA (Multi-head Latent Attention) model - ( - "deepseek-ai/DeepSeek-V2-Lite", - AttentionBackendEnum.ROCM_AITER_TRITON_MLA, - ), # MLA (Multi-head Latent Attention) model - ] + ], ), ) def test_fp8_kv_scale_compile( compilation_mode: int, model: str, backend: AttentionBackendEnum | None, - monkeypatch: pytest.MonkeyPatch, ): - with monkeypatch.context() as m: - model_kwargs = { - "quantization": "fp8", - "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8", - "calculate_kv_scales": True, - "max_model_len": 512, - } - if backend: - model_kwargs["attention_config"] = {"backend": backend.name} - if current_platform.is_rocm(): - m.setenv("VLLM_ROCM_USE_AITER", "1") - # Disable Aiter MOE as some shapes are not supported - m.setenv("VLLM_ROCM_USE_AITER_MOE", "0") + model_kwargs = { + "quantization": "fp8", + "kv_cache_dtype": "fp8_e4m3", + "calculate_kv_scales": True, + "max_model_len": 512, + } + if backend: + model_kwargs["attention_config"] = {"backend": backend.name} - rocm_aiter_ops.refresh_env_variables() - - run_model(compilation_mode, model, **model_kwargs) + run_model(compilation_mode, model, **model_kwargs) def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs): From 099766164aede247b5e52344466af1f95b60f433 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 13:25:14 +0000 Subject: [PATCH 19/45] remove tj marker Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index cb5f10295409..1aad50efb39a 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1667,7 +1667,7 @@ steps: - label: Sequence Parallel Correctness Tests (2xMI325 GPUs) timeout_in_minutes: 50 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, 
amdproduction] agent_pool: mi325_2 num_devices: 2 source_file_dependencies: @@ -1684,7 +1684,7 @@ steps: - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs source_file_dependencies: - csrc/quantization/fp4/ @@ -1712,7 +1712,7 @@ steps: - label: Fusion E2E Quick (MI325) timeout_in_minutes: 15 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 num_devices: 1 source_file_dependencies: @@ -1733,7 +1733,7 @@ steps: - label: Fusion E2E Config Sweep (MI325) timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 num_devices: 1 source_file_dependencies: From 6891c605398ac698320fe18cd4a350c2a5ebb447 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 13:45:15 +0000 Subject: [PATCH 20/45] fix syntax Signed-off-by: tjtanaa --- tests/compile/fullgraph/test_full_graph.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py index cf16e7fffd40..ed4c92d90ff7 100644 --- a/tests/compile/fullgraph/test_full_graph.py +++ b/tests/compile/fullgraph/test_full_graph.py @@ -194,15 +194,13 @@ def test_custom_compile_config( ) @pytest.mark.parametrize( "model, backend", - ( - [ - ("Qwen/Qwen2-0.5B", None), # Standard attention model - ( - "deepseek-ai/DeepSeek-V2-Lite", - AttentionBackendEnum.FLASHINFER_MLA, - ), # MLA (Multi-head Latent Attention) model - ], - ), + [ + ("Qwen/Qwen2-0.5B", None), # Standard attention model + ( + 
"deepseek-ai/DeepSeek-V2-Lite", + AttentionBackendEnum.FLASHINFER_MLA, + ), # MLA (Multi-head Latent Attention) model + ], ) def test_fp8_kv_scale_compile( compilation_mode: int, From 1e8fe872380933966839600cd06ecad894304948 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 13:48:27 +0000 Subject: [PATCH 21/45] add skip marker Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp2_ar_rms.py | 3 +++ tests/compile/fusions_e2e/test_tp2_async_tp.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/tests/compile/fusions_e2e/test_tp2_ar_rms.py b/tests/compile/fusions_e2e/test_tp2_ar_rms.py index 18b19565c1fc..ab4aefcaf79a 100644 --- a/tests/compile/fusions_e2e/test_tp2_ar_rms.py +++ b/tests/compile/fusions_e2e/test_tp2_ar_rms.py @@ -5,6 +5,7 @@ import pytest from vllm.config import PassConfig +from vllm.platforms import current_platform from ...utils import multi_gpu_test from .common import ( @@ -26,6 +27,8 @@ qwen3_a3b_fp8, ) +pytestmark = pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA") + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize( diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py index fb743c1ba7d3..88c3dc8192a5 100644 --- a/tests/compile/fusions_e2e/test_tp2_async_tp.py +++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py @@ -26,6 +26,8 @@ qwen3_a3b, ) +pytestmark = pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA") + @multi_gpu_test(num_gpus=2) @pytest.mark.parametrize( From b81b0f92a0d060a3bbae05a5854906e08cfacdfe Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 13:52:07 +0000 Subject: [PATCH 22/45] revert test Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp1_quant.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 25a607051a89..294a91e98a8c 100644 --- 
a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -22,6 +22,7 @@ llama3_8b_fp4, llama3_8b_fp8, llama4_scout_fp4, + llama4_scout_fp8, qwen3_a3b_fp8, ) @@ -30,9 +31,9 @@ "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm", [ (*llama3_8b_fp8, False), - # (*llama4_scout_fp8, False), + (*llama4_scout_fp8, False), (*qwen3_a3b_fp8, False), - # (*qwen3_a3b_fp8, True), + (*qwen3_a3b_fp8, True), ], ) @pytest.mark.parametrize( From 0326f765d1ae189883c46a0f3836d356ef02e018 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 14:22:05 +0000 Subject: [PATCH 23/45] fix the test case, amd cannot run nvidia model Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 2 +- tests/compile/fusions_e2e/test_tp1_quant.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 1aad50efb39a..7ec9ccbf74cf 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1724,7 +1724,7 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + - "oh" # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported # TODO: Qwen uses group quantization which the pattern matcher on ROCm is not supported yet. 
# - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 294a91e98a8c..5b167d6dc7cd 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -31,10 +31,16 @@ "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm", [ (*llama3_8b_fp8, False), - (*llama4_scout_fp8, False), (*qwen3_a3b_fp8, False), (*qwen3_a3b_fp8, True), - ], + ] + + ( + [ + (*llama4_scout_fp8, False), + ] + if current_platform.is_cuda() + else [] + ), ) @pytest.mark.parametrize( "attn_backend", From 9001be5b46001070f8f78fd89e774de773980409 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 14:26:07 +0000 Subject: [PATCH 24/45] remove sequence parallel test Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 7ec9ccbf74cf..d1417501cfbd 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1663,23 +1663,6 @@ steps: # which affects fusion passes on ROCm. So we have to # enable them as as soon as possible. 
-# corresponds to .buildkite/test_areas/compile.yaml -- label: Sequence Parallel Correctness Tests (2xMI325 GPUs) - timeout_in_minutes: 50 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_2 - num_devices: 2 - source_file_dependencies: - - vllm/model_executor/layers/ - - vllm/compilation/ - - vllm/v1/worker/ - - vllm/v1/cudagraph_dispatcher.py - - tests/compile/correctness_e2e/test_sequence_parallel.py - commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - "pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py" - # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 @@ -1751,7 +1734,12 @@ steps: - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'" - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" +## There are no ops on ROCm for these tests. +## The test still passes but the logs are not useful. 
+## fused ops just call torch.ops.symm_mem which +## exists in ROCm even though they don't work # - label: AsyncTP Correctness Tests (2xMI325 GPUs) # - label: Fusion E2E TP2 Quick (MI325) # - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325) # - label: Fusion E2E TP2 (MI325) +# - label: Sequence Parallel Correctness Tests (2xMI325 GPUs) From ca222af52c0e052d33c0a61f73e44ebbd6f145f1 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 14:27:50 +0000 Subject: [PATCH 25/45] skip sequence parallel on non-cuda Signed-off-by: tjtanaa --- tests/compile/passes/distributed/test_sequence_parallelism.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/compile/passes/distributed/test_sequence_parallelism.py b/tests/compile/passes/distributed/test_sequence_parallelism.py index 46363a9a4a44..a793d68522b4 100644 --- a/tests/compile/passes/distributed/test_sequence_parallelism.py +++ b/tests/compile/passes/distributed/test_sequence_parallelism.py @@ -36,6 +36,8 @@ from vllm.utils.system_utils import update_environment_variables from vllm.utils.torch_utils import set_random_seed +pytestmark = pytest.mark.skipif(not current_platform.is_cuda(), reason="Only test CUDA") + FP8_DTYPE = current_platform.fp8_dtype() prompts = [ "Hello, my name is", From 06b0aca4f2f4499a6bc470c5b6eb393381922a17 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Wed, 11 Feb 2026 15:39:05 +0000 Subject: [PATCH 26/45] fix the test_config_generation.py Signed-off-by: tjtanaa --- tests/config/test_config_generation.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/config/test_config_generation.py b/tests/config/test_config_generation.py index 225ac0f2226d..a235e1a20a88 100644 --- a/tests/config/test_config_generation.py +++ b/tests/config/test_config_generation.py @@ -80,12 +80,13 @@ def create_config(): ray.shutdown() -def test_unrecognized_env(): - import os +def test_unrecognized_env(monkeypatch): + # Remove any existing VLLM env vars that might interfere + 
monkeypatch.delenv("VLLM_TEST_GROUP_NAME", raising=False) # Test that if fail_on_environ_validation is True, then an error # is raised when an unrecognized vLLM environment variable is set - os.environ["VLLM_UNRECOGNIZED_ENV_VAR"] = "some_value" + monkeypatch.setenv("VLLM_UNRECOGNIZED_ENV_VAR", "some_value") engine_args = EngineArgs( fail_on_environ_validation=True, ) @@ -97,7 +98,7 @@ def test_unrecognized_env(): engine_args.create_engine_config() # Test that when the unrecognized env var is removed, no error is raised - os.environ.pop("VLLM_UNRECOGNIZED_ENV_VAR", None) + monkeypatch.delenv("VLLM_UNRECOGNIZED_ENV_VAR") engine_args = EngineArgs( fail_on_environ_validation=True, ) From 676184e2d2a5365742ba0029d267f41dee642ada Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 12 Feb 2026 03:13:03 +0000 Subject: [PATCH 27/45] fix test_configuration Signed-off-by: tjtanaa --- tests/config/test_config_generation.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/config/test_config_generation.py b/tests/config/test_config_generation.py index a235e1a20a88..c7edf2b97174 100644 --- a/tests/config/test_config_generation.py +++ b/tests/config/test_config_generation.py @@ -81,8 +81,14 @@ def create_config(): def test_unrecognized_env(monkeypatch): - # Remove any existing VLLM env vars that might interfere - monkeypatch.delenv("VLLM_TEST_GROUP_NAME", raising=False) + import os + + from vllm.envs import environment_variables + + # Remove any existing unrecognized VLLM env vars that might interfere + for env in list(os.environ): + if env.startswith("VLLM_") and env not in environment_variables: + monkeypatch.delenv(env, raising=False) # Test that if fail_on_environ_validation is True, then an error # is raised when an unrecognized vLLM environment variable is set From b566461790559b6f1bcba96b3377a2483044980c Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 12 Feb 2026 03:17:06 +0000 Subject: [PATCH 28/45] fix the qwen3 e2e fusion pass on ROCm + 
AITER Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 7 +++---- tests/compile/fusions_e2e/common.py | 4 ++++ tests/compile/fusions_e2e/conftest.py | 5 +++++ tests/compile/fusions_e2e/models.py | 13 ++++++++++--- tests/compile/fusions_e2e/test_tp1_quant.py | 8 +++++++- vllm/compilation/passes/fusion/rocm_aiter_fusion.py | 4 +++- 6 files changed, 32 insertions(+), 9 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index d1417501cfbd..53cf6ef530a9 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1707,10 +1707,9 @@ steps: commands: - rocm-smi # Run all models and attn backends but only Inductor partition and native custom ops - - "oh" - # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported - # TODO: Qwen uses group quantization which the pattern matcher on ROCm is not supported yet. - # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'" + # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER + - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'" # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Config Sweep (MI325) diff --git a/tests/compile/fusions_e2e/common.py b/tests/compile/fusions_e2e/common.py index 284a9d66b957..2c6dc2b3ebbc 100644 --- a/tests/compile/fusions_e2e/common.py +++ b/tests/compile/fusions_e2e/common.py @@ -13,6 +13,7 @@ class Matches(NamedTuple): # simple pointwise + aiter_rms_quant_fusion: int = 0 rms_quant_fusion: int = 0 act_quant_fusion: int = 0 norm_rope_fusion: int = 0 @@ -82,6 +83,9 @@ def has_cuda_graph_wrapper_metadata() -> bool: ] FUSION_LOG_PATTERNS: dict[str, re.Pattern] = { + "aiter_rms_quant_fusion": re.compile( + 
r"RocmAiterRMSNormQuantFusionPass Replaced (\d+) patterns" + ), "rms_quant_fusion": re.compile(r"rms_quant_fusion.py:\d+] Replaced (\d+) patterns"), "act_quant_fusion": re.compile(r"act_quant_fusion.py:\d+] Replaced (\d+) patterns"), "norm_rope_fusion": re.compile( diff --git a/tests/compile/fusions_e2e/conftest.py b/tests/compile/fusions_e2e/conftest.py index 1d9f6cda9fd6..3dce32f9b404 100644 --- a/tests/compile/fusions_e2e/conftest.py +++ b/tests/compile/fusions_e2e/conftest.py @@ -63,9 +63,14 @@ def run( compilation_config: dict, matches_check: list[str], use_deepgemm: bool = False, + use_aiter: bool = False, tp_size: int = 1, ): monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1" if use_deepgemm else "0") + monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1" if use_aiter else "0") + from vllm._aiter_ops import rocm_aiter_ops + + rocm_aiter_ops.refresh_env_variables() # Disable, compile cache to make sure custom passes run. # Otherwise, we can't verify fusion happened through the logs. diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index ffca5bc6c0a5..c459791f40f6 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -133,7 +133,7 @@ attn_quant_fusion=n_layers, ar_rms_fusion=0, sequence_parallel=n_layers * 2, - async_tp=0 + async_tp=0, ) ), ) @@ -200,8 +200,15 @@ # ROCm matches else ( lambda n_layers: Matches( - rms_quant_fusion=n_layers, - norm_rope_fusion=n_layers, + aiter_rms_quant_fusion=n_layers, + rms_quant_fusion=0, + # TODO: Allow use to set back n_layers, + # On ROCm norm_rope_fusion is only supported without + # enabling AITER. + # when we are running the tests in + # tests/compile/fusions_e2e/test_tp1_quant.py + # we are enabling AITER, so no fusion happens. 
+ norm_rope_fusion=0, ar_rms_fusion=0, sequence_parallel=n_layers * 2 + 1, async_tp=n_layers * 2, diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 5b167d6dc7cd..b8d79102e82c 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -32,11 +32,11 @@ [ (*llama3_8b_fp8, False), (*qwen3_a3b_fp8, False), - (*qwen3_a3b_fp8, True), ] + ( [ (*llama4_scout_fp8, False), + (*qwen3_a3b_fp8, True), # only supported on CUDA ] if current_platform.is_cuda() else [] @@ -92,6 +92,8 @@ def test_tp1_fp8_fusions( ), ) + use_aiter = current_platform.is_rocm() and ("qwen" in model_name.lower()) + matches_check = [ "rms_quant_fusion", "act_quant_fusion", @@ -99,6 +101,9 @@ def test_tp1_fp8_fusions( "attn_quant_fusion", ] + if use_aiter: + matches_check.append("aiter_rms_quant_fusion") + run_e2e_fusion_test( model_name, matches, @@ -107,6 +112,7 @@ def test_tp1_fp8_fusions( compilation_config, matches_check, use_deepgemm=use_deepgemm, + use_aiter=use_aiter, ) diff --git a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py index 99278365c5db..7d67f1bb8c01 100644 --- a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py +++ b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py @@ -312,7 +312,9 @@ def __init__(self, config: VllmConfig) -> None: @VllmInductorPass.time_and_log def __call__(self, graph: fx.Graph) -> None: self.matched_count = self.patterns.apply(graph) - logger.debug("Replaced %s patterns", self.matched_count) + logger.debug( + "%s Replaced %s patterns", self.__class__.__name__, self.matched_count + ) def uuid(self) -> str: fusion_patterns = [ From 24a142d31f8f9229d5b0b0e2f3e5f886330eb20f Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 12 Feb 2026 03:56:24 +0000 Subject: [PATCH 29/45] fix pytest command Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 
deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 584a89a4ea78..37a65c9b2707 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -569,9 +569,12 @@ steps: --ignore=lora/test_gptoss_tp.py \ --ignore=lora/test_qwen3moe_tp.py parallelism: 4 + +##### .buildkite/test_areas/pytorch.yaml ##### +# corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -585,6 +588,10 @@ steps: # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # TODO: clean up this comment if not needed. It is used to + # keep track of the tests changes during vLLM IR Ops refactoring. + # Use `find` to launch multiple instances of pytest. 
+ - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -1667,7 +1674,7 @@ steps: - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs source_file_dependencies: - csrc/quantization/fp4/ @@ -1695,7 +1702,7 @@ steps: - label: Fusion E2E Quick (MI325) timeout_in_minutes: 15 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 num_devices: 1 source_file_dependencies: @@ -1715,7 +1722,7 @@ steps: - label: Fusion E2E Config Sweep (MI325) timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 num_devices: 1 source_file_dependencies: @@ -1730,8 +1737,7 @@ steps: commands: - rocm-smi # Run just llama3 (fp8) for all config combinations - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'" - - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'" + - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3" ## There are no ops on ROCm for these tests. ## The test still passes but the logs are not useful. 
From 7e2cca4f1e19bf60b60e6eb196e563340e1b5ef Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 12 Feb 2026 04:02:53 +0000 Subject: [PATCH 30/45] fix pre-commit Signed-off-by: tjtanaa --- tests/compile/passes/test_silu_mul_quant_fusion.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index f6d5e112dd12..f18de1ac13d6 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -182,7 +182,6 @@ def ops_in_model_after(self): TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS - @pytest.mark.parametrize("num_tokens", [32, 64]) @pytest.mark.parametrize("hidden_size", [128, 256]) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) From 089969c172825ad0c142d1cb72a4edb2dab3e8a7 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 12 Feb 2026 04:15:34 +0000 Subject: [PATCH 31/45] fix the model config Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/models.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index c459791f40f6..e600daaf739d 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -49,9 +49,9 @@ # ROCm matches else ( lambda n_layers: Matches( - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -75,9 +75,9 @@ rms_quant_fusion=n_layers * 2, act_quant_fusion=n_layers,
attn_quant_fusion=n_layers, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -131,9 +131,9 @@ lambda n_layers: Matches( rms_quant_fusion=n_layers, attn_quant_fusion=n_layers, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2, - async_tp=0, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -156,8 +156,8 @@ lambda n_layers: Matches( rms_quant_fusion=0, attn_quant_fusion=n_layers, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -177,9 +177,9 @@ else ( lambda n_layers: Matches( norm_rope_fusion=n_layers, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) @@ -209,9 +209,9 @@ # tests/compile/fusions_e2e/test_tp1_quant.py # we are enabling AITER, so no fusion happens. 
norm_rope_fusion=0, - ar_rms_fusion=0, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, + ar_rms_fusion=0, # Not supported + sequence_parallel=0, # Not supported + async_tp=0, # Not supported ) ), ) From 5d0539803e028a7d2f09b7c910f85de0a27e5372 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 13 Feb 2026 07:44:03 +0000 Subject: [PATCH 32/45] remove experimental flag Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 8 ++++---- tests/compile/fusions_e2e/test_tp2_async_tp.py | 13 +++---------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 37a65c9b2707..83bd308631ed 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -574,7 +574,7 @@ steps: # corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Unit Tests # 15min timeout_in_minutes: 30 - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking torch_nightly: true @@ -1674,7 +1674,7 @@ steps: - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs source_file_dependencies: - csrc/quantization/fp4/ @@ -1702,7 +1702,7 @@ steps: - label: Fusion E2E Quick (MI325) timeout_in_minutes: 15 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 num_devices: 1 source_file_dependencies: @@ -1722,7 +1722,7 @@ steps: - label: Fusion E2E Config Sweep (MI325) timeout_in_minutes: 30 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: 
mi325_1 num_devices: 1 source_file_dependencies: diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py index 88c3dc8192a5..35277ebe8350 100644 --- a/tests/compile/fusions_e2e/test_tp2_async_tp.py +++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py @@ -17,8 +17,6 @@ ) from .models import ( FLASHINFER_ATTN, - ROCM_AITER_UNIFIED_ATTN, - ROCM_ATTN, TRITON_ATTN, llama3_8b, llama3_8b_fp8, @@ -34,14 +32,9 @@ "model_name, matches_fn, model_kwargs, hf_overrides", [llama3_8b_fp8, llama4_scout_fp8], ) -@pytest.mark.parametrize( - "attn_backend", - [TRITON_ATTN, FLASHINFER_ATTN] - if current_platform.is_cuda() - else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], -) +@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN]) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("quant_fp8", "rms_norm"))) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fp8_fusions( model_name: str, @@ -106,7 +99,7 @@ def test_tp2_async_tp_fp8_fusions( ) @pytest.mark.parametrize("attn_backend", [TRITON_ATTN]) @pytest.mark.parametrize("n_layers", [4]) -@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("rms_norm"))) +@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm")) @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION) def test_tp2_async_tp_fusions( model_name: str, From 4922c7a0cc68af579368c998d118ac1cd8403dd2 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Fri, 13 Feb 2026 16:53:42 +0000 Subject: [PATCH 33/45] test suggestion Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 18 +++++++++++++----- .../layers/attention/mla_attention.py | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 
83bd308631ed..93f00f62c334 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -588,10 +588,18 @@ steps: # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - # TODO: clean up this comment if not needed. It is used to - # keep track of the tests changes during vLLM IR Ops refactoring. - # Use `find` to launch multiple instances of pytest. - - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + +# corresponds to .buildkite/test_areas/pytorch.yaml +- label: PyTorch Compilation Passes Unit Tests + timeout_in_minutes: 20 + mirror_hardwares: [amdexperimental, amdproduction, tj] + agent_pool: mi325_1 + source_file_dependencies: + - vllm/ + - tests/compile/passes + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY=1 + - pytest -s -v compile/passes --ignore compile/passes/distributed - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -1674,7 +1682,7 @@ steps: - label: Fusion and Compile Unit Tests (2xMI325 GPUs) timeout_in_minutes: 20 working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction] + mirror_hardwares: [amdexperimental, amdproduction, tj] agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs source_file_dependencies: - csrc/quantization/fp4/ diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index ed066784d652..d6e7c6447b14 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -403,7 +403,7 @@ def __init__( self.is_aiter_triton_fp4_bmm_enabled = ( rocm_aiter_ops.is_fp4bmm_enabled() and self.kv_b_proj.weight.dtype == torch.bfloat16 - and current_platform.has_device_capability(95) # gfx950 and above + # and 
current_platform.has_device_capability(95) # gfx950 and above ) # Attributes for forward_impl method From 1aace958ab163efe87b7effed827c23a0ccd7a5e Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 14 Feb 2026 03:10:19 +0000 Subject: [PATCH 34/45] revert pytorch test Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 93f00f62c334..87d5f46cb641 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -592,14 +592,16 @@ steps: # corresponds to .buildkite/test_areas/pytorch.yaml - label: PyTorch Compilation Passes Unit Tests timeout_in_minutes: 20 - mirror_hardwares: [amdexperimental, amdproduction, tj] + mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 source_file_dependencies: - vllm/ - tests/compile/passes commands: - - export VLLM_TEST_CLEAN_GPU_MEMORY=1 - - pytest -s -v compile/passes --ignore compile/passes/distributed + # TODO: clean up this comment if not needed. It is used to + # keep track of the tests changes during vLLM IR Ops refactoring. + # Use `find` to launch multiple instances of pytest. 
+ - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 From 3f0e188902fee0292434b3bba209312abeee2c57 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 14 Feb 2026 15:41:09 +0000 Subject: [PATCH 35/45] remove mla related bugfix Signed-off-by: tjtanaa --- vllm/model_executor/layers/attention/mla_attention.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py index 9cb0882059b6..98ff02e9d4ae 100644 --- a/vllm/model_executor/layers/attention/mla_attention.py +++ b/vllm/model_executor/layers/attention/mla_attention.py @@ -404,7 +404,6 @@ def __init__( self.is_aiter_triton_fp4_bmm_enabled = ( rocm_aiter_ops.is_fp4bmm_enabled() and self.kv_b_proj.weight.dtype == torch.bfloat16 - # and current_platform.has_device_capability(95) # gfx950 and above ) # Attributes for forward_impl method From 64d3b6324cb9f8a5ded94bf1bdeefe77d2e0ded4 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 14 Feb 2026 15:47:50 +0000 Subject: [PATCH 36/45] convert condition to pytest.param Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp1_quant.py | 52 ++++++++++++++----- .../passes/test_silu_mul_quant_fusion.py | 13 ++++- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index b8d79102e82c..54e0e40ffc07 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -32,21 +32,47 @@ [ (*llama3_8b_fp8, False), (*qwen3_a3b_fp8, False), - ] - + ( - [ - (*llama4_scout_fp8, False), - (*qwen3_a3b_fp8, True), # only supported on CUDA - ] - if current_platform.is_cuda() - else [] - ), + pytest.param( + *llama4_scout_fp8, + False, + marks=pytest.mark.skipif( + not current_platform.is_cuda(), + reason="Llama4 Scout FP8 only supported on CUDA", + ), 
+ ), + pytest.param( + *qwen3_a3b_fp8, + True, + marks=pytest.mark.skipif( + not current_platform.is_cuda(), reason="DeepGemm only supported on CUDA" + ), + ), + ], ) @pytest.mark.parametrize( "attn_backend", - [TRITON_ATTN, FLASHINFER_ATTN] - if current_platform.is_cuda() - else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN], + [ + TRITON_ATTN, + pytest.param( + FLASHINFER_ATTN, + marks=pytest.mark.skipif( + not current_platform.is_cuda(), + reason="FlashInfer only supported on CUDA", + ), + ), + pytest.param( + ROCM_ATTN, + marks=pytest.mark.skipif( + current_platform.is_cuda(), reason="ROCm attention only for AMD" + ), + ), + pytest.param( + ROCM_AITER_UNIFIED_ATTN, + marks=pytest.mark.skipif( + current_platform.is_cuda(), reason="ROCm AIter only for AMD" + ), + ), + ], ) @pytest.mark.parametrize("n_layers", [6]) @pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm")) @@ -102,7 +128,7 @@ def test_tp1_fp8_fusions( ] if use_aiter: - matches_check.append("aiter_rms_quant_fusion") + matches_check[0] = "aiter_rms_quant_fusion" run_e2e_fusion_test( model_name, diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index f18de1ac13d6..1d807e07586b 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -191,9 +191,20 @@ def ops_in_model_after(self): list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) + [ (TestSiluMulNvfp4QuantModel, False, None), + pytest.param( + TestSiluMulGroupFp8QuantModel, + False, + None, + skipif=not current_platform.is_cuda(), + ), # GroupFP8Quant fusion only works with AITER on ROCm. # and the enable_quant_fp8_custom_op must be True. 
- (TestSiluMulGroupFp8QuantModel, True, None), + pytest.param( + TestSiluMulGroupFp8QuantModel, + True, + None, + skipif=not current_platform.is_rocm(), + ), ], ) @pytest.mark.skipif( From c186a19b06d5ced2c70947df44926e24259d72ad Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 14 Feb 2026 15:51:31 +0000 Subject: [PATCH 37/45] apply suggestion Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/models.py | 8 +------- tests/compile/fusions_e2e/test_tp1_quant.py | 9 +++++++++ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index e600daaf739d..c76023bbbec9 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -202,13 +202,7 @@ lambda n_layers: Matches( aiter_rms_quant_fusion=n_layers, rms_quant_fusion=0, - # TODO: Allow use to set back n_layers, - # On ROCm norm_rope_fusion is only supported without - # enabling AITER. - # when we are running the tests in - # tests/compile/fusions_e2e/test_tp1_quant.py - # we are enabling AITER, so no fusion happens. - norm_rope_fusion=0, + norm_rope_fusion=n_layers, ar_rms_fusion=0, # Not supported sequence_parallel=0, # Not supported async_tp=0, # Not supported diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 54e0e40ffc07..e72f5b9a2d73 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -130,6 +130,15 @@ def test_tp1_fp8_fusions( if use_aiter: matches_check[0] = "aiter_rms_quant_fusion" + # TODO: enable the `norm_rope_fusion` test, + # On ROCm norm_rope_fusion is only supported without + # enabling AITER. + # when we are running the tests in + # tests/compile/fusions_e2e/test_tp1_quant.py + # we are enabling AITER, so no fusion happens. 
+ if "qwen" in model_name.lower(): + matches_check.remove("norm_rope_fusion") + run_e2e_fusion_test( model_name, matches, From 050544d173c58daf8f21a63dcbd9fe680c8e6f86 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Sat, 14 Feb 2026 16:57:25 +0000 Subject: [PATCH 38/45] fix error from pytest.param Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/models.py | 13 +++++++++-- tests/compile/fusions_e2e/test_tp1_quant.py | 22 +++---------------- .../passes/test_silu_mul_quant_fusion.py | 8 +++++-- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index c76023bbbec9..b7f6ac3f89c9 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +from vllm._aiter_ops import is_aiter_found_and_supported from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer from vllm.v1.attention.backends.registry import AttentionBackendEnum @@ -25,14 +26,22 @@ AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN" ) -# ROCm backends ROCM_ATTN = pytest.param( - AttentionBackendCase(backend=AttentionBackendEnum.ROCM_ATTN), id="ROCM_ATTN" + AttentionBackendCase(backend=AttentionBackendEnum.ROCM_ATTN), + id="ROCM_ATTN", + marks=pytest.mark.skipif( + not current_platform.is_rocm(), + reason="ROCm attention only for AMD", + ), ) ROCM_AITER_UNIFIED_ATTN = pytest.param( AttentionBackendCase(backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN), id="ROCM_AITER_UNIFIED_ATTN", + marks=pytest.mark.skipif( + not is_aiter_found_and_supported(), + reason="ROCM_AITER_UNIFIED_ATTN only for AMD when AITER is installed", + ), ) # Models diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index e72f5b9a2d73..8037d55cdb56 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ 
b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -53,25 +53,9 @@ "attn_backend", [ TRITON_ATTN, - pytest.param( - FLASHINFER_ATTN, - marks=pytest.mark.skipif( - not current_platform.is_cuda(), - reason="FlashInfer only supported on CUDA", - ), - ), - pytest.param( - ROCM_ATTN, - marks=pytest.mark.skipif( - current_platform.is_cuda(), reason="ROCm attention only for AMD" - ), - ), - pytest.param( - ROCM_AITER_UNIFIED_ATTN, - marks=pytest.mark.skipif( - current_platform.is_cuda(), reason="ROCm AIter only for AMD" - ), - ), + FLASHINFER_ATTN, + ROCM_ATTN, + ROCM_AITER_UNIFIED_ATTN, ], ) @pytest.mark.parametrize("n_layers", [6]) diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index 1d807e07586b..12bb16e1409f 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -195,7 +195,9 @@ def ops_in_model_after(self): TestSiluMulGroupFp8QuantModel, False, None, - skipif=not current_platform.is_cuda(), + marks=pytest.mark.skipif( + not current_platform.is_cuda(), reason="CUDA only" + ), ), # GroupFP8Quant fusion only works with AITER on ROCm. # and the enable_quant_fp8_custom_op must be True. 
@@ -203,7 +205,9 @@ def ops_in_model_after(self): TestSiluMulGroupFp8QuantModel, True, None, - skipif=not current_platform.is_rocm(), + marks=pytest.mark.skipif( + not current_platform.is_rocm(), reason="ROCm only" + ), ), ], ) From 97102c3eed9748fe5c32501eebcf48099eaf6475 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 24 Feb 2026 10:40:55 +0000 Subject: [PATCH 39/45] remove rocm branching in model defination Signed-off-by: vllmellm --- tests/compile/fusions_e2e/models.py | 172 +++++--------------- tests/compile/fusions_e2e/test_tp1_quant.py | 7 + 2 files changed, 47 insertions(+), 132 deletions(-) diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py index b7f6ac3f89c9..e18bc1ee5652 100644 --- a/tests/compile/fusions_e2e/models.py +++ b/tests/compile/fusions_e2e/models.py @@ -47,73 +47,33 @@ # Models llama3_8b = ModelFusionInfo( model_name="meta-llama/Llama-3.1-8B-Instruct", - matches=( - lambda n_layers: Matches( - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, ), ) llama3_8b_fp8 = ModelFusionInfo( model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8", - matches=( - lambda n_layers: Matches( - rms_quant_fusion=n_layers * 2, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - rms_quant_fusion=n_layers * 2, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not 
supported - ) + matches=lambda n_layers: Matches( + rms_quant_fusion=n_layers * 2, + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, ), ) llama3_8b_fp4 = ModelFusionInfo( model_name="nvidia/Llama-3.1-8B-Instruct-FP4", - matches=( - lambda n_layers: Matches( - rms_quant_fusion=0, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 4, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - rms_quant_fusion=0, - act_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + act_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 4, ), ) @@ -125,96 +85,44 @@ llama4_scout_fp8 = ModelFusionInfo( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8", hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}}, - matches=( - lambda n_layers: Matches( - rms_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - rms_quant_fusion=n_layers, - attn_quant_fusion=n_layers, - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + rms_quant_fusion=n_layers, + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, ), ) llama4_scout_fp4 = ModelFusionInfo( model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4", hf_overrides=lambda n_layers: 
{"text_config": {"num_hidden_layers": n_layers}}, - matches=( - lambda n_layers: Matches( - rms_quant_fusion=0, - attn_quant_fusion=n_layers, - ar_rms_fusion=n_layers * 2, - sequence_parallel=n_layers * 2, - async_tp=n_layers * 2 - 1, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - rms_quant_fusion=0, - attn_quant_fusion=n_layers, - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + attn_quant_fusion=n_layers, + ar_rms_fusion=n_layers * 2, + sequence_parallel=n_layers * 2, + async_tp=n_layers * 2 - 1, ), ) qwen3_a3b = ModelFusionInfo( model_name="Qwen/Qwen3-30B-A3B", - matches=( - lambda n_layers: Matches( - norm_rope_fusion=n_layers, - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - norm_rope_fusion=n_layers, - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + norm_rope_fusion=n_layers, + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, ), ) qwen3_a3b_fp8 = ModelFusionInfo( model_name="Qwen/Qwen3-30B-A3B-FP8", - matches=( - lambda n_layers: Matches( - rms_quant_fusion=n_layers, - norm_rope_fusion=n_layers, - attn_quant_fusion=0, # attn + group quant not supported - ar_rms_fusion=n_layers * 2 + 1, - sequence_parallel=n_layers * 2 + 1, - async_tp=n_layers * 2, - ) - ) - if current_platform.is_cuda() - # ROCm matches - else ( - lambda n_layers: Matches( - aiter_rms_quant_fusion=n_layers, - rms_quant_fusion=0, - norm_rope_fusion=n_layers, - ar_rms_fusion=0, # Not supported - sequence_parallel=0, # Not supported - async_tp=0, # Not supported - ) + matches=lambda n_layers: Matches( + rms_quant_fusion=n_layers, + norm_rope_fusion=n_layers, + attn_quant_fusion=0, # attn + group quant not 
supported + ar_rms_fusion=n_layers * 2 + 1, + sequence_parallel=n_layers * 2 + 1, + async_tp=n_layers * 2, ), ) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 8037d55cdb56..ed252b6c033b 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -104,6 +104,13 @@ def test_tp1_fp8_fusions( use_aiter = current_platform.is_rocm() and ("qwen" in model_name.lower()) + if not current_platform.is_cuda(): + matches = matches._replace(ar_rms_fusion=0, sequence_parallel=0, async_tp=0) + if "qwen" in model_name.lower(): + matches = matches._replace( + rms_quant_fusion=0, aiter_rms_quant_fusion=n_layers + ) + matches_check = [ "rms_quant_fusion", "act_quant_fusion", From 0a42a79dde4c967e9de108342deeea3cdbc7a3a8 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 00:59:54 +0000 Subject: [PATCH 40/45] remove todo Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 37e5a84a07e6..73117cd82d28 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1532,7 +1532,7 @@ steps: # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated. 
- pytest -v -s tests/distributed/test_context_parallel.py - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - # TODO: this test is not supported on ROCm + # this test is not supported on ROCm # - pytest -v -s tests/v1/distributed/test_dbo.py ##### B200 test ##### From 3204c5cede6673932290ac0b6859e30faef458c0 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 01:32:53 +0000 Subject: [PATCH 41/45] remove unnecessary test_tp1_quant.py Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp1_quant.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 9cce547f89f6..159de38979fa 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -107,13 +107,6 @@ def test_tp1_fp8_fusions( use_aiter = current_platform.is_rocm() and ("qwen" in model_name.lower()) - if not current_platform.is_cuda(): - matches = matches._replace(ar_rms_fusion=0, sequence_parallel=0, async_tp=0) - if "qwen" in model_name.lower(): - matches = matches._replace( - rms_quant_fusion=0, aiter_rms_quant_fusion=n_layers - ) - matches_check = [ "rms_quant_fusion", "act_quant_fusion", @@ -123,14 +116,12 @@ def test_tp1_fp8_fusions( if use_aiter: matches_check[0] = "aiter_rms_quant_fusion" - - # TODO: enable the `norm_rope_fusion` test, - # On ROCm norm_rope_fusion is only supported without - # enabling AITER. - # when we are running the tests in - # tests/compile/fusions_e2e/test_tp1_quant.py - # we are enabling AITER, so no fusion happens. - if "qwen" in model_name.lower(): + # TODO: enable the `norm_rope_fusion` test, + # On ROCm norm_rope_fusion is only supported without + # enabling AITER. 
+ # when we are running the tests in + # tests/compile/fusions_e2e/test_tp1_quant.py + # we are enabling AITER, so no fusion happens. matches_check.remove("norm_rope_fusion") run_e2e_fusion_test( From de42cfbd07af805f7d30847b13d6b121ec87610e Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Thu, 26 Feb 2026 11:20:47 +0000 Subject: [PATCH 42/45] apply reviewer feedback Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp1_quant.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 159de38979fa..22823de9870a 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -116,6 +116,8 @@ def test_tp1_fp8_fusions( if use_aiter: matches_check[0] = "aiter_rms_quant_fusion" + + matches = matches._replace(aiter_rms_quant_fusion=matches.rms_quant_fusion) # TODO: enable the `norm_rope_fusion` test, # On ROCm norm_rope_fusion is only supported without # enabling AITER. From a168f7bd3684baee01062a6a048779403fc93ae1 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Mon, 2 Mar 2026 06:56:36 +0000 Subject: [PATCH 43/45] remove comment Signed-off-by: tjtanaa --- tests/compile/fusions_e2e/test_tp1_quant.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py index 22823de9870a..917116515f89 100644 --- a/tests/compile/fusions_e2e/test_tp1_quant.py +++ b/tests/compile/fusions_e2e/test_tp1_quant.py @@ -121,9 +121,6 @@ def test_tp1_fp8_fusions( # TODO: enable the `norm_rope_fusion` test, # On ROCm norm_rope_fusion is only supported without # enabling AITER. - # when we are running the tests in - # tests/compile/fusions_e2e/test_tp1_quant.py - # we are enabling AITER, so no fusion happens. 
matches_check.remove("norm_rope_fusion") run_e2e_fusion_test( From 53d253daebc243f7ea480a73e91d0cb6c65c1a12 Mon Sep 17 00:00:00 2001 From: vllmellm Date: Tue, 3 Mar 2026 05:27:39 +0000 Subject: [PATCH 44/45] fix SiluMulGroupQaunt Signed-off-by: vllmellm --- tests/compile/passes/test_silu_mul_quant_fusion.py | 3 +-- vllm/compilation/passes/fusion/rocm_aiter_fusion.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py index d31cca1ef129..a77b4e6de7bd 100644 --- a/tests/compile/passes/test_silu_mul_quant_fusion.py +++ b/tests/compile/passes/test_silu_mul_quant_fusion.py @@ -182,9 +182,8 @@ def ops_in_model_after(self): "model_class, enable_quant_fp8_custom_op, force_kernel", list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS)) + [ - (TestSiluMulNvfp4QuantModel, False, None), pytest.param( - TestSiluMulGroupFp8QuantModel, + TestSiluMulNvfp4QuantModel, False, None, marks=pytest.mark.skipif( diff --git a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py index 7d67f1bb8c01..59c94db5e812 100644 --- a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py +++ b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py @@ -14,7 +14,7 @@ GroupShape, QuantKey, ScaleDesc, - kFp8Static128BlockSym, + kFp8Dynamic128Sym, ) from vllm.platforms import current_platform @@ -337,7 +337,7 @@ class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern): def __init__(self) -> None: self.silu_and_mul_matcher = MatcherSiluAndMul() self.quant_matcher = MatcherQuantFP8( - quant_key=kFp8Static128BlockSym, match_rocm_aiter=True + quant_key=kFp8Dynamic128Sym, match_rocm_aiter=True ) def get_inputs(self) -> list[torch.Tensor]: @@ -350,7 +350,7 @@ def pattern( input: torch.Tensor, ) -> tuple[torch.Tensor, torch.Tensor]: at1 = self.silu_and_mul_matcher(input) - at2 = self.quant_matcher(at1, 128) + 
at2 = self.quant_matcher(at1) return at2[0], at2[1] def replacement( From 8374509a5a442c118f79c088022029ec3d546545 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 3 Mar 2026 07:51:56 +0000 Subject: [PATCH 45/45] comment out redundant tests Signed-off-by: tjtanaa --- .buildkite/test-amd.yaml | 56 +++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 73117cd82d28..d4dbe7232342 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -1650,33 +1650,35 @@ steps: # which affects fusion passes on ROCm. So we have to # enable them as as soon as possible. -# corresponds to .buildkite/test_areas/compile.yaml -- label: Fusion and Compile Unit Tests (2xMI325 GPUs) - timeout_in_minutes: 20 - working_dir: "/vllm-workspace/" - mirror_hardwares: [amdexperimental, amdproduction, tj] - agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs - source_file_dependencies: - - csrc/quantization/fp4/ - - vllm/model_executor/layers/quantization/ - - vllm/model_executor/layers/layernorm.py - - vllm/model_executor/layers/activation.py - - vllm/model_executor/layers/attention/attention.py - - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes - - tests/compile/test_fusion_attn.py - - tests/compile/test_silu_mul_quant_fusion.py - - tests/compile/distributed/test_fusion_all_reduce.py - - tests/compile/fullgraph/test_full_graph.py - commands: - - rocm-smi - # we run all backend tests on ROCm - - "pytest -v -s tests/compile/passes/test_fusion_attn.py" - - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" - # TODO: this test is not supported on ROCm, there are aiter kernels for this. 
- # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py - # TODO: find out more details - # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile +## TODO: Enable the test in this group +# # corresponds to .buildkite/test_areas/compile.yaml +# - label: Fusion and Compile Unit Tests (2xMI325 GPUs) +# timeout_in_minutes: 20 +# working_dir: "/vllm-workspace/" +# mirror_hardwares: [amdexperimental, amdproduction, tj] +# agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs +# source_file_dependencies: +# - csrc/quantization/fp4/ +# - vllm/model_executor/layers/quantization/ +# - vllm/model_executor/layers/layernorm.py +# - vllm/model_executor/layers/activation.py +# - vllm/model_executor/layers/attention/attention.py +# - vllm/v1/attention/backends/flashinfer.py +# - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes +# - tests/compile/test_fusion_attn.py +# - tests/compile/test_silu_mul_quant_fusion.py +# - tests/compile/distributed/test_fusion_all_reduce.py +# - tests/compile/fullgraph/test_full_graph.py +# commands: +# - rocm-smi +# # we run all backend tests on ROCm +# # These two tests are covered in "PyTorch Compilation Passes Unit Tests" +# # - "pytest -v -s tests/compile/passes/test_fusion_attn.py" +# # - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py" +# # TODO: this test is not supported on ROCm, there are aiter kernels for this. +# # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py +# # TODO: find out more details +# # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile # corresponds to .buildkite/test_areas/compile.yaml - label: Fusion E2E Quick (MI325)