From 230246d9349db65878aee63f7c4c8b4a920fa821 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Mon, 9 Feb 2026 08:28:20 +0000
Subject: [PATCH 01/16] try to enable new fusion pass test for ROCm

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml                      | 174 +++++++++++++-----
 tests/compile/fullgraph/test_full_graph.py    |  34 +++-
 tests/compile/fusions_e2e/models.py           |  83 +++++++--
 tests/compile/fusions_e2e/test_tp1_quant.py   |  15 +-
 tests/compile/passes/test_fusion_attn.py      |   2 +-
 .../passes/test_silu_mul_quant_fusion.py      |  15 +-
 .../layers/quantization/input_quant_fp8.py    |   1 +
 7 files changed, 245 insertions(+), 79 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 503b3a76f941..986708e37641 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -570,9 +570,11 @@ steps:
       --ignore=lora/test_qwen3moe_tp.py
   parallelism: 4
 
+##### .buildkite/test_areas/pytorch.yaml #####
+# corresponds to .buildkite/test_areas/pytorch.yaml
 - label: PyTorch Compilation Unit Tests # 15min
   timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdproduction]
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
   agent_pool: mi325_1
   # grade: Blocking
   torch_nightly: true
@@ -586,10 +588,14 @@ steps:
   # Use `find` to launch multiple instances of pytest so that
   # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
   - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+  # TODO: clean up this comment if not needed. It is used to 
+  # keep track of the tests changes during vLLM IR Ops refactoring.
+  - pytest -s -v compile/passes --ignore compile/passes/distributed
 
+# corresponds to .buildkite/test_areas/pytorch.yaml
 - label: PyTorch Fullgraph Smoke Test # 15min
-  timeout_in_minutes: 30
-  mirror_hardwares: [amdexperimental, amdproduction]
+  timeout_in_minutes: 35
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
   agent_pool: mi325_1
   # grade: Blocking
   torch_nightly: true
@@ -603,9 +609,10 @@ steps:
   # they do not suffer from https://github.com/vllm-project/vllm/issues/28965
   - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"
 
-- label: PyTorch Fullgraph Test # 27min
-  timeout_in_minutes: 40
-  mirror_hardwares: [amdexperimental, amdproduction]
+# corresponds to .buildkite/test_areas/pytorch.yaml
+- label: PyTorch Fullgraph # 27min
+  timeout_in_minutes: 30
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
   agent_pool: mi325_1
   # grade: Blocking
   torch_nightly: true
@@ -1176,41 +1183,6 @@ steps:
     - pytest -v -s tests/kernels/moe/test_flashinfer.py
     - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py
 
-- label: Blackwell Fusion and Compile Tests # 30 min
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/v1/worker/
-  - vllm/v1/cudagraph_dispatcher.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/passes/test_fusion_attn.py
-  - tests/compile/passes/test_silu_mul_quant_fusion.py
-  - tests/compile/passes/distributed/test_fusion_all_reduce.py
-  - tests/compile/fullgraph/test_full_graph.py
-  commands:
-    - nvidia-smi
-    - pytest -v -s tests/compile/passes/test_fusion_attn.py
-    - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
-    # this runner has 2 GPUs available even though num_gpus=2 is not set
-    - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
-
-    # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # # Wrap with quotes to escape yaml
-    # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
-    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
-    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
-
-    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
-
 - label: Blackwell GPT-OSS Eval
   timeout_in_minutes: 60
   working_dir: "/vllm-workspace/"
@@ -1334,7 +1306,6 @@ steps:
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-  - pytest -v -s distributed/test_sequence_parallel.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
 
@@ -1558,17 +1529,20 @@ steps:
   num_gpus: 2
   commands:
     - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
+    # ================= 24 passed, 11 warnings in 192.85s (0:03:12) ==================
     - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
+    # ================== 48 passed, 8 warnings in 386.41s (0:06:26) ==================
     - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+    # ======================== 8 skipped, 9 warnings in 2.08s ========================
     #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
     # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
     # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
     # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
-
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
     - pytest -v -s tests/distributed/test_context_parallel.py
+    # ======================== 4 passed, 3 warnings in 30.45s ========================
     - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
     - pytest -v -s tests/v1/distributed/test_dbo.py
+    # ======================== 2 skipped, 3 warnings in 1.97s ========================
 
 ##### B200 test #####
 - label: Distributed Tests (B200) # optional
@@ -1692,3 +1666,115 @@ steps:
   working_dir: "/vllm-workspace"
   commands:
   - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
+
+##### .buildkite/test_areas/compile.yaml #####
+# Slowly setting up the tests so that it is also easier for the 
+# CI team to review and upstream to the pipelinev2.
+# The following tests are important for vLLM IR Ops refactoring,
+# which affects fusion passes on ROCm. So we have to 
+# enable them as as soon as possible.
+
+# corresponds to .buildkite/test_areas/compile.yaml
+- label: Sequence Parallel Correctness Tests (2xMI325 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
+  agent_pool: mi325_2
+  num_devices: 2
+  source_file_dependencies:
+  - vllm/model_executor/layers/
+  - vllm/compilation/
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - tests/compile/correctness_e2e/test_sequence_parallel.py
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
+
+# corresponds to .buildkite/test_areas/compile.yaml
+- label: Fusion and Compile Unit Tests (2xMI325 GPUs)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
+  agent_pool: mi325_2
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/attention/attention.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
+  - tests/compile/test_fusion_attn.py
+  - tests/compile/test_silu_mul_quant_fusion.py
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/fullgraph/test_full_graph.py
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/attention/attention.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
+  - tests/compile/passes/test_fusion_attn.py
+  - tests/compile/passes/test_silu_mul_quant_fusion.py
+  - tests/compile/passes/distributed/test_fusion_all_reduce.py
+  - tests/compile/fullgraph/test_full_graph.py
+  commands:
+    - rocm-smi
+    # we run all backend tests on ROCm
+    - pytest -v -s tests/compile/passes/test_fusion_attn.py
+    - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
+    # TODO: this test is not supported on ROCm, there are aiter kernels for this.
+    # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
+    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    # TODO: this test is not supported on ROCm
+    # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+
+# corresponds to .buildkite/test_areas/compile.yaml
+- label: Fusion E2E Quick (MI325)
+  timeout_in_minutes: 15
+  working_dir: "/vllm-workspace/"
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
+  agent_pool: mi325_1
+  num_devices: 1
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - rocm-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
+
+# corresponds to .buildkite/test_areas/compile.yaml
+- label: Fusion E2E Config Sweep (MI325)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
+  agent_pool: mi325_1
+  num_devices: 1
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - rocm-smi
+    # Run just llama3 (fp8) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
+
+# corresponds to .buildkite/test_areas/kernels.yaml
+# Skip the following tests as they are not supported on ROCm
+# - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
+# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
+# - label: Fusion E2E TP2 (B200)
diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
index ed4c92d90ff7..733ec22c98d6 100644
--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -194,13 +194,31 @@ def test_custom_compile_config(
 )
 @pytest.mark.parametrize(
     "model, backend",
-    [
-        ("Qwen/Qwen2-0.5B", None),  # Standard attention model
-        (
-            "deepseek-ai/DeepSeek-V2-Lite",
-            AttentionBackendEnum.FLASHINFER_MLA,
-        ),  # MLA (Multi-head Latent Attention) model
-    ],
+    (
+        [
+            ("Qwen/Qwen2-0.5B", None),  # Standard attention model
+            (
+                "deepseek-ai/DeepSeek-V2-Lite",
+                AttentionBackendEnum.FLASHINFER_MLA,
+            ),  # MLA (Multi-head Latent Attention) model
+        ]
+        if current_platform.is_cuda()
+        else [
+            # ("Qwen/Qwen2-0.5B", None),  # Standard attention model
+            # (
+            #     "deepseek-ai/DeepSeek-V2-Lite",
+            #     AttentionBackendEnum.TRITON_MLA,
+            # ),  # MLA (Multi-head Latent Attention) model
+            (
+                "deepseek-ai/DeepSeek-V2-Lite",
+                AttentionBackendEnum.ROCM_AITER_MLA,
+            ),  # MLA (Multi-head Latent Attention) model
+            (
+                "deepseek-ai/DeepSeek-V2-Lite",
+                AttentionBackendEnum.ROCM_AITER_TRITON_MLA,
+            ),  # MLA (Multi-head Latent Attention) model
+        ]
+    ),
 )
 def test_fp8_kv_scale_compile(
     compilation_mode: int,
@@ -209,7 +227,7 @@ def test_fp8_kv_scale_compile(
 ):
     model_kwargs = {
         "quantization": "fp8",
-        "kv_cache_dtype": "fp8_e4m3",
+        "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8",
         "calculate_kv_scales": True,
         "max_model_len": 512,
     }
diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py
index f54f617c64d4..525ed1b515bc 100644
--- a/tests/compile/fusions_e2e/models.py
+++ b/tests/compile/fusions_e2e/models.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 
+from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 
@@ -24,37 +25,83 @@
     AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN"
 )
 
+# ROCm backends
+ROCM_ATTN = pytest.param(
+    AttentionBackendCase(backend=AttentionBackendEnum.ROCM_ATTN), id="ROCM_ATTN"
+)
+
+ROCM_AITER_UNIFIED_ATTN = pytest.param(
+    AttentionBackendCase(backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN),
+    id="ROCM_AITER_UNIFIED_ATTN",
+)
+
 # Models
 llama3_8b = ModelFusionInfo(
     model_name="meta-llama/Llama-3.1-8B-Instruct",
-    matches=lambda n_layers: Matches(
-        ar_rms_fusion=n_layers * 2 + 1,
-        sequence_parallel=n_layers * 2 + 1,
-        async_tp=n_layers * 4,
+    matches=(
+        lambda n_layers: Matches(
+            ar_rms_fusion=n_layers * 2 + 1,
+            sequence_parallel=n_layers * 2 + 1,
+            async_tp=n_layers * 4,
+        ),
+    )
+    if current_platform.is_cuda()
+    else (  # ROCm matches
+        lambda n_layers: Matches(
+            ar_rms_fusion=0,
+            sequence_parallel=0,
+            async_tp=0,
+        ),
     ),
 )
 
 llama3_8b_fp8 = ModelFusionInfo(
     model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
-    matches=lambda n_layers: Matches(
-        rms_quant_fusion=n_layers * 2,
-        act_quant_fusion=n_layers,
-        attn_quant_fusion=n_layers,
-        ar_rms_fusion=n_layers * 2 + 1,
-        sequence_parallel=n_layers * 2 + 1,
-        async_tp=n_layers * 4,
+    matches=(
+        lambda n_layers: Matches(
+            rms_quant_fusion=n_layers * 2,
+            act_quant_fusion=n_layers,
+            attn_quant_fusion=n_layers,
+            ar_rms_fusion=n_layers * 2 + 1,
+            sequence_parallel=n_layers * 2 + 1,
+            async_tp=n_layers * 4,
+        )
+    )
+    if current_platform.is_cuda()
+    else (  # ROCm matches
+        lambda n_layers: Matches(
+            rms_quant_fusion=n_layers * 2,
+            act_quant_fusion=n_layers,
+            attn_quant_fusion=n_layers,
+            ar_rms_fusion=0,
+            sequence_parallel=0,
+            async_tp=0,
+        ),
     ),
 )
 
 llama3_8b_fp4 = ModelFusionInfo(
     model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
-    matches=lambda n_layers: Matches(
-        rms_quant_fusion=0,
-        act_quant_fusion=n_layers,
-        attn_quant_fusion=n_layers,
-        ar_rms_fusion=n_layers * 2 + 1,
-        sequence_parallel=n_layers * 2 + 1,
-        async_tp=n_layers * 4,
+    matches=(
+        lambda n_layers: Matches(
+            rms_quant_fusion=0,
+            act_quant_fusion=n_layers,
+            attn_quant_fusion=n_layers,
+            ar_rms_fusion=n_layers * 2 + 1,
+            sequence_parallel=n_layers * 2 + 1,
+            async_tp=n_layers * 4,
+        )
+    )
+    if current_platform.is_cuda()
+    else (  # ROCm matches
+        lambda n_layers: Matches(
+            rms_quant_fusion=0,
+            act_quant_fusion=n_layers,
+            attn_quant_fusion=n_layers,
+            ar_rms_fusion=0,
+            sequence_parallel=0,
+            async_tp=0,
+        ),
     ),
 )
 
diff --git a/tests/compile/fusions_e2e/test_tp1_quant.py b/tests/compile/fusions_e2e/test_tp1_quant.py
index 03f102794f85..25a607051a89 100644
--- a/tests/compile/fusions_e2e/test_tp1_quant.py
+++ b/tests/compile/fusions_e2e/test_tp1_quant.py
@@ -5,6 +5,7 @@
 import pytest
 
 from vllm.config import PassConfig
+from vllm.platforms import current_platform
 
 from .common import (
     INDUCTOR_GRAPH_PARTITION,
@@ -15,11 +16,12 @@
 )
 from .models import (
     FLASHINFER_ATTN,
+    ROCM_AITER_UNIFIED_ATTN,
+    ROCM_ATTN,
     TRITON_ATTN,
     llama3_8b_fp4,
     llama3_8b_fp8,
     llama4_scout_fp4,
-    llama4_scout_fp8,
     qwen3_a3b_fp8,
 )
 
@@ -28,12 +30,17 @@
     "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm",
     [
         (*llama3_8b_fp8, False),
-        (*llama4_scout_fp8, False),
+        # (*llama4_scout_fp8, False),
         (*qwen3_a3b_fp8, False),
-        (*qwen3_a3b_fp8, True),
+        # (*qwen3_a3b_fp8, True),
     ],
 )
-@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
+@pytest.mark.parametrize(
+    "attn_backend",
+    [TRITON_ATTN, FLASHINFER_ATTN]
+    if current_platform.is_cuda()
+    else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN],
+)
 @pytest.mark.parametrize("n_layers", [6])
 @pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
 @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
diff --git a/tests/compile/passes/test_fusion_attn.py b/tests/compile/passes/test_fusion_attn.py
index 75d5c42f0731..a35db7bb21ff 100644
--- a/tests/compile/passes/test_fusion_attn.py
+++ b/tests/compile/passes/test_fusion_attn.py
@@ -267,7 +267,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
     PATTERN_TEST_MODELS_FP8 = [
         ("amd/Llama-3.1-8B-Instruct-FP8-KV", TestAttentionFp8StaticQuantPatternModel)
     ]
-    BACKENDS = [
+    BACKENDS_FP8 = [
         AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
         AttentionBackendEnum.ROCM_ATTN,
         AttentionBackendEnum.TRITON_ATTN,
diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py
index c5ef015015ce..64aad53525a5 100644
--- a/tests/compile/passes/test_silu_mul_quant_fusion.py
+++ b/tests/compile/passes/test_silu_mul_quant_fusion.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
+from typing import Any
 
 import pytest
 import torch
@@ -148,6 +149,9 @@ def __init__(self, hidden_size: int, **kwargs):
             weight_group_shape=GroupShape(128, 128),
             act_quant_group_shape=GroupShape(1, 128),
             cutlass_block_fp8_supported=False,
+            # this parameter cannot always be True,
+            # it depends on the VLLM_ROCM_USE_AITER
+            # and VLLM_ROCM_USE_AITER_LINEAR environment variables
             use_aiter_and_is_supported=True,
         )
         self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t()
@@ -181,6 +185,12 @@ def ops_in_model_after(self):
 ]
 TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS
 
+EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [
+    (TestSiluMulGroupFp8QuantModel, False, None),
+]
+if current_platform.is_cuda():
+    EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None))
+
 
 @pytest.mark.parametrize("num_tokens", [32, 64])
 @pytest.mark.parametrize("hidden_size", [128, 256])
@@ -189,10 +199,7 @@ def ops_in_model_after(self):
 @pytest.mark.parametrize(
     "model_class, enable_quant_fp8_custom_op, force_kernel",
     list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS))
-    + [
-        (TestSiluMulNvfp4QuantModel, False, None),
-        (TestSiluMulGroupFp8QuantModel, False, None),
-    ],
+    + EXTENDED_TESTCASES,
 )
 @pytest.mark.skipif(
     envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm"
diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py
index 5bc78afa43b0..ed3b981cf183 100644
--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -171,6 +171,7 @@ def forward_native(
         x: torch.Tensor,
         scale: torch.Tensor | None = None,
         scale_ub: torch.Tensor | None = None,
+        **kwargs,
     ):
         if self.is_group_quant and not self.static:
             assert scale is None, "Dynamic group quantization does not use scale"

From 1c9552affe7f37454cde6b496a64b26e0d859ce0 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Mon, 9 Feb 2026 13:21:08 +0000
Subject: [PATCH 02/16] fix silu-mul-groupquant fusion test

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 .../passes/test_silu_mul_quant_fusion.py       | 10 +++++++---
 .../passes/fusion/rocm_aiter_fusion.py         | 18 +++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py
index 64aad53525a5..687bb9aa6bfe 100644
--- a/tests/compile/passes/test_silu_mul_quant_fusion.py
+++ b/tests/compile/passes/test_silu_mul_quant_fusion.py
@@ -10,7 +10,7 @@
 from tests.compile.backend import TestBackend
 from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
 from tests.utils import TestFP8Layer
-from vllm._aiter_ops import IS_AITER_FOUND
+from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.compilation.passes.fusion.act_quant_fusion import (
     FUSED_OPS,
@@ -186,7 +186,7 @@ def ops_in_model_after(self):
 TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS
 
 EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [
-    (TestSiluMulGroupFp8QuantModel, False, None),
+    (TestSiluMulGroupFp8QuantModel, True, None),
 ]
 if current_platform.is_cuda():
     EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None))
@@ -216,6 +216,7 @@ def test_fusion_silu_and_mul_quant(
     enable_silu_mul_custom_op: bool,
     enable_quant_fp8_custom_op: bool,
     force_kernel: FP8ScaledMMLinearKernel | None,
+    monkeypatch: pytest.MonkeyPatch,
 ):
     if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported():
         pytest.skip("NVFP4 is not supported on this GPU.")
@@ -242,13 +243,16 @@ def test_fusion_silu_and_mul_quant(
         ),
     )
 
-    with set_current_vllm_config(config):
+    with set_current_vllm_config(config), monkeypatch.context() as m:
         fusion_passes = [ActivationQuantFusionPass(config)]
         if IS_AITER_FOUND:
             from vllm.compilation.passes.fusion.rocm_aiter_fusion import (
                 RocmAiterSiluMulFp8GroupQuantFusionPass,
             )
 
+            m.setenv("VLLM_ROCM_USE_AITER", "1")
+            rocm_aiter_ops.refresh_env_variables()
+
             fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)]
 
         passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)]
diff --git a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py
index d8131ce952d2..99278365c5db 100644
--- a/vllm/compilation/passes/fusion/rocm_aiter_fusion.py
+++ b/vllm/compilation/passes/fusion/rocm_aiter_fusion.py
@@ -5,7 +5,6 @@
 import torch._inductor.pattern_matcher as pm
 from torch import fx
 from torch._inductor.pattern_matcher import PatternMatcherPass
-from torch._ops import OpOverload
 
 import vllm.model_executor.layers.quantization.utils.fp8_utils  # noqa: F401
 from vllm._aiter_ops import rocm_aiter_ops
@@ -15,6 +14,7 @@
     GroupShape,
     QuantKey,
     ScaleDesc,
+    kFp8Static128BlockSym,
 )
 from vllm.platforms import current_platform
 
@@ -332,9 +332,11 @@ class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern):
 
     FUSED_SILU_MUL_QUANT_OP = rocm_aiter_ops.get_act_mul_fused_fp8_group_quant_op()
 
-    def __init__(self, quant_op: OpOverload) -> None:
+    def __init__(self) -> None:
         self.silu_and_mul_matcher = MatcherSiluAndMul()
-        self.quant_op = quant_op
+        self.quant_matcher = MatcherQuantFP8(
+            quant_key=kFp8Static128BlockSym, match_rocm_aiter=True
+        )
 
     def get_inputs(self) -> list[torch.Tensor]:
         return [
@@ -346,7 +348,7 @@ def pattern(
             input: torch.Tensor,
         ) -> tuple[torch.Tensor, torch.Tensor]:
             at1 = self.silu_and_mul_matcher(input)
-            at2 = self.quant_op(at1, 128)
+            at2 = self.quant_matcher(at1, 128)
             return at2[0], at2[1]
 
         def replacement(
@@ -370,11 +372,6 @@ class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass):
     https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980
     """
 
-    AITER_GROUP_FP8_QUANT_OP = rocm_aiter_ops.get_group_quant_op()
-    TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default
-
-    QUANT_OPS = [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]
-
     @enable_fake_mode
     def __init__(self, config: VllmConfig) -> None:
         super().__init__(config)
@@ -383,8 +380,7 @@ def __init__(self, config: VllmConfig) -> None:
             pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass"
         )
 
-        for quant_op in self.QUANT_OPS:
-            AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns)
+        AiterSiluMulFp8GroupQuantPattern().register(self.patterns)
 
         self.dump_patterns(config, self.patterns)
 

From bffe1814354d8ed1055c6a02ecdde905f3f61549 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Tue, 10 Feb 2026 07:42:59 +0000
Subject: [PATCH 03/16] fix full graph test

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 tests/compile/fullgraph/test_full_graph.py    | 27 ++++++++++++-------
 .../layers/attention/mla_attention.py         |  1 +
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
index 733ec22c98d6..921f57cea0a6 100644
--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -10,6 +10,7 @@
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
+from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
@@ -224,17 +225,25 @@ def test_fp8_kv_scale_compile(
     compilation_mode: int,
     model: str,
     backend: AttentionBackendEnum | None,
+    monkeypatch: pytest.MonkeyPatch,
 ):
-    model_kwargs = {
-        "quantization": "fp8",
-        "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8",
-        "calculate_kv_scales": True,
-        "max_model_len": 512,
-    }
-    if backend:
-        model_kwargs["attention_config"] = {"backend": backend.name}
+    with monkeypatch.context() as m:
+        model_kwargs = {
+            "quantization": "fp8",
+            "kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8",
+            "calculate_kv_scales": True,
+            "max_model_len": 512,
+        }
+        if backend:
+            model_kwargs["attention_config"] = {"backend": backend.name}
+            if current_platform.is_rocm():
+                m.setenv("VLLM_ROCM_USE_AITER", "1")
+                # Disable Aiter MOE as some shapes are not supported
+                m.setenv("VLLM_ROCM_USE_AITER_MOE", "0")
+
+                rocm_aiter_ops.refresh_env_variables()
 
-    run_model(compilation_mode, model, **model_kwargs)
+        run_model(compilation_mode, model, **model_kwargs)
 
 
 def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
diff --git a/vllm/model_executor/layers/attention/mla_attention.py b/vllm/model_executor/layers/attention/mla_attention.py
index c31aa7b41d0d..ed31a2d176e3 100644
--- a/vllm/model_executor/layers/attention/mla_attention.py
+++ b/vllm/model_executor/layers/attention/mla_attention.py
@@ -403,6 +403,7 @@ def __init__(
         self.is_aiter_triton_fp4_bmm_enabled = (
             rocm_aiter_ops.is_fp4bmm_enabled()
             and self.kv_b_proj.weight.dtype == torch.bfloat16
+            and current_platform.has_device_capability(95)  # gfx950 and above
         )
 
         # Attributes for forward_impl method

From 28ed03f12de4842576366a34efbf31d44cb2a34a Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Tue, 10 Feb 2026 07:52:31 +0000
Subject: [PATCH 04/16] clearer test case for silu mul and group quant test

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 tests/compile/passes/test_silu_mul_quant_fusion.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py
index 687bb9aa6bfe..c6794a156240 100644
--- a/tests/compile/passes/test_silu_mul_quant_fusion.py
+++ b/tests/compile/passes/test_silu_mul_quant_fusion.py
@@ -184,10 +184,11 @@ def ops_in_model_after(self):
     PerTensorTorchFP8ScaledMMLinearKernel,
 ]
 TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS
+EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = []
+# SiluMulGroupFp8Quant is only supported on ROCm
+if current_platform.is_rocm():
+    EXTENDED_TESTCASES.append((TestSiluMulGroupFp8QuantModel, True, None))
 
-EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = [
-    (TestSiluMulGroupFp8QuantModel, True, None),
-]
 if current_platform.is_cuda():
     EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None))
 
@@ -245,7 +246,7 @@ def test_fusion_silu_and_mul_quant(
 
     with set_current_vllm_config(config), monkeypatch.context() as m:
         fusion_passes = [ActivationQuantFusionPass(config)]
-        if IS_AITER_FOUND:
+        if current_platform.is_rocm() and IS_AITER_FOUND:
             from vllm.compilation.passes.fusion.rocm_aiter_fusion import (
                 RocmAiterSiluMulFp8GroupQuantFusionPass,
             )

From 5628eb9b272da81a81da4477cb5774fdf7143632 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Tue, 10 Feb 2026 08:40:13 +0000
Subject: [PATCH 05/16] fix e2e fusion tests

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 .buildkite/test-amd.yaml                      |  71 +++++++++-
 tests/compile/fusions_e2e/models.py           | 125 ++++++++++++------
 .../compile/fusions_e2e/test_tp2_async_tp.py  |  14 +-
 3 files changed, 164 insertions(+), 46 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 986708e37641..64a878baa774 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1691,6 +1691,17 @@ steps:
   - export VLLM_TEST_CLEAN_GPU_MEMORY=1
   - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
 
+# corresponds to .buildkite/test_areas/compile.yaml
+- label: AsyncTP Correctness Tests  (2xMI325 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: mi325_2
+  optional: true
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
+
 # corresponds to .buildkite/test_areas/compile.yaml
 - label: Fusion and Compile Unit Tests (2xMI325 GPUs)
   timeout_in_minutes: 20
@@ -1750,7 +1761,8 @@ steps:
     # Run all models and attn backends but only Inductor partition and native custom ops
     - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
     # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
+    # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet.
+    # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
 
 # corresponds to .buildkite/test_areas/compile.yaml
 - label: Fusion E2E Config Sweep (MI325)
@@ -1771,10 +1783,57 @@ steps:
   commands:
     - rocm-smi
     # Run just llama3 (fp8) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
     - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
 
-# corresponds to .buildkite/test_areas/kernels.yaml
-# Skip the following tests as they are not supported on ROCm
-# - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
-# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
-# - label: Fusion E2E TP2 (B200)
+- label: Fusion E2E TP2 Quick (MI325)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: mi325_1
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - rocm-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+
+# corresponds to .buildkite/test_areas/compile.yaml
+- label: Fusion E2E TP2 AsyncTP Config Sweep (MI325)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: mi325_2
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - rocm-smi
+    # Run just llama3 (fp8 & bf16) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
+
+- label: Fusion E2E TP2 (MI325)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: mi325_2
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - rocm-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    - pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py
index 525ed1b515bc..77cb1b4d3ad9 100644
--- a/tests/compile/fusions_e2e/models.py
+++ b/tests/compile/fusions_e2e/models.py
@@ -43,15 +43,16 @@
             ar_rms_fusion=n_layers * 2 + 1,
             sequence_parallel=n_layers * 2 + 1,
             async_tp=n_layers * 4,
-        ),
+        )
     )
     if current_platform.is_cuda()
-    else (  # ROCm matches
+    # ROCm matches
+    else (
         lambda n_layers: Matches(
             ar_rms_fusion=0,
-            sequence_parallel=0,
-            async_tp=0,
-        ),
+            sequence_parallel=n_layers * 2 + 1,
+            async_tp=n_layers * 4,
+        )
     ),
 )
 
@@ -68,15 +69,16 @@
         )
     )
     if current_platform.is_cuda()
-    else (  # ROCm matches
+    # ROCm matches
+    else (
         lambda n_layers: Matches(
             rms_quant_fusion=n_layers * 2,
             act_quant_fusion=n_layers,
             attn_quant_fusion=n_layers,
             ar_rms_fusion=0,
-            sequence_parallel=0,
-            async_tp=0,
-        ),
+            sequence_parallel=n_layers * 2 + 1,
+            async_tp=n_layers * 4,
+        )
     ),
 )
 
@@ -93,15 +95,16 @@
         )
     )
     if current_platform.is_cuda()
-    else (  # ROCm matches
+    # ROCm matches
+    else (
         lambda n_layers: Matches(
             rms_quant_fusion=0,
             act_quant_fusion=n_layers,
             attn_quant_fusion=n_layers,
             ar_rms_fusion=0,
-            sequence_parallel=0,
-            async_tp=0,
-        ),
+            sequence_parallel=n_layers * 2 + 1,
+            async_tp=n_layers * 4,
+        )
     ),
 )
 
@@ -113,45 +116,93 @@
 llama4_scout_fp8 = ModelFusionInfo(
     model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
     hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
-    matches=lambda n_layers: Matches(
-        rms_quant_fusion=n_layers,
-        attn_quant_fusion=n_layers,
-        ar_rms_fusion=n_layers * 2,
-        sequence_parallel=n_layers * 2,
-        async_tp=n_layers * 2 - 1,
+    matches=(
+        lambda n_layers: Matches(
+            rms_quant_fusion=n_layers,
+            attn_quant_fusion=n_layers,
+            ar_rms_fusion=n_layers * 2,
+            sequence_parallel=n_layers * 2,
+            async_tp=n_layers * 2 - 1,
+        )
+    )
+    if current_platform.is_cuda()
+    # ROCm matches
+    else (
+        lambda n_layers: Matches(
+            rms_quant_fusion=n_layers,
+            attn_quant_fusion=n_layers,
+            sequence_parallel=n_layers * 2,
+        )
     ),
 )
 
 llama4_scout_fp4 = ModelFusionInfo(
     model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4",
     hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
-    matches=lambda n_layers: Matches(
-        rms_quant_fusion=0,
-        attn_quant_fusion=n_layers,
-        ar_rms_fusion=n_layers * 2,
-        sequence_parallel=n_layers * 2,
-        async_tp=n_layers * 2 - 1,
+    matches=(
+        lambda n_layers: Matches(
+            rms_quant_fusion=0,
+            attn_quant_fusion=n_layers,
+            ar_rms_fusion=n_layers * 2,
+            sequence_parallel=n_layers * 2,
+            async_tp=n_layers * 2 - 1,
+        )
+    )
+    if current_platform.is_cuda()
+    # ROCm matches
+    else (
+        lambda n_layers: Matches(
+            rms_quant_fusion=0,
+            attn_quant_fusion=n_layers,
+            sequence_parallel=n_layers * 2,
+            async_tp=n_layers * 2 - 1,
+        )
     ),
 )
 
 qwen3_a3b = ModelFusionInfo(
     model_name="Qwen/Qwen3-30B-A3B",
-    matches=lambda n_layers: Matches(
-        norm_rope_fusion=n_layers,
-        ar_rms_fusion=n_layers * 2 + 1,
-        sequence_parallel=n_layers * 2 + 1,
-        async_tp=n_layers * 2,
+    matches=(
+        lambda n_layers: Matches(
+            norm_rope_fusion=n_layers,
+            ar_rms_fusion=n_layers * 2 + 1,
+            sequence_parallel=n_layers * 2 + 1,
+            async_tp=n_layers * 2,
+        )
+    )
+    if current_platform.is_cuda()
+    # ROCm matches
+    else (
+        lambda n_layers: Matches(
+            norm_rope_fusion=n_layers,
+            ar_rms_fusion=0,
+            sequence_parallel=n_layers * 2 + 1,
+            async_tp=n_layers * 2,
+        )
     ),
 )
 
 qwen3_a3b_fp8 = ModelFusionInfo(
     model_name="Qwen/Qwen3-30B-A3B-FP8",
-    matches=lambda n_layers: Matches(
-        rms_quant_fusion=n_layers,
-        norm_rope_fusion=n_layers,
-        attn_quant_fusion=0,  # attn + group quant not supported
-        ar_rms_fusion=n_layers * 2 + 1,
-        sequence_parallel=n_layers * 2 + 1,
-        async_tp=n_layers * 2,
+    matches=(
+        lambda n_layers: Matches(
+            rms_quant_fusion=n_layers,
+            norm_rope_fusion=n_layers,
+            attn_quant_fusion=0,  # attn + group quant not supported
+            ar_rms_fusion=n_layers * 2 + 1,
+            sequence_parallel=n_layers * 2 + 1,
+            async_tp=n_layers * 2,
+        )
+    )
+    if current_platform.is_cuda()
+    # ROCm matches
+    else (
+        lambda n_layers: Matches(
+            rms_quant_fusion=n_layers,
+            norm_rope_fusion=n_layers,
+            ar_rms_fusion=0,
+            sequence_parallel=n_layers * 2 + 1,
+            async_tp=n_layers * 2,
+        )
     ),
 )
diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py
index 4769ca1e0b63..fb743c1ba7d3 100644
--- a/tests/compile/fusions_e2e/test_tp2_async_tp.py
+++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py
@@ -5,6 +5,7 @@
 import pytest
 
 from vllm.config import PassConfig
+from vllm.platforms import current_platform
 
 from ...utils import multi_gpu_test
 from .common import (
@@ -16,6 +17,8 @@
 )
 from .models import (
     FLASHINFER_ATTN,
+    ROCM_AITER_UNIFIED_ATTN,
+    ROCM_ATTN,
     TRITON_ATTN,
     llama3_8b,
     llama3_8b_fp8,
@@ -29,9 +32,14 @@
     "model_name, matches_fn, model_kwargs, hf_overrides",
     [llama3_8b_fp8, llama4_scout_fp8],
 )
-@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
+@pytest.mark.parametrize(
+    "attn_backend",
+    [TRITON_ATTN, FLASHINFER_ATTN]
+    if current_platform.is_cuda()
+    else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN],
+)
 @pytest.mark.parametrize("n_layers", [4])
-@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
+@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("quant_fp8", "rms_norm")))
 @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
 def test_tp2_async_tp_fp8_fusions(
     model_name: str,
@@ -96,7 +104,7 @@ def test_tp2_async_tp_fp8_fusions(
 )
 @pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
 @pytest.mark.parametrize("n_layers", [4])
-@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("rms_norm")))
 @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
 def test_tp2_async_tp_fusions(
     model_name: str,

From 119b4b01b345e17264c84d37f2acd060e6a5448a Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Tue, 10 Feb 2026 10:18:50 +0000
Subject: [PATCH 06/16] fix tests in fusion silu_mul and tidy up buildkite config

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 .buildkite/test-amd.yaml                      |  4 ++++
 .../passes/test_silu_mul_quant_fusion.py      | 19 +++++++------------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 64a878baa774..33ba6689faa5 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1695,6 +1695,7 @@ steps:
 - label: AsyncTP Correctness Tests  (2xMI325 GPUs)
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/"
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
   device: mi325_2
   optional: true
   num_devices: 2
@@ -1789,6 +1790,7 @@ steps:
 - label: Fusion E2E TP2 Quick (MI325)
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
   device: mi325_1
   num_devices: 2
   source_file_dependencies:
@@ -1806,6 +1808,7 @@ steps:
 - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325)
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
   device: mi325_2
   num_devices: 2
   source_file_dependencies:
@@ -1823,6 +1826,7 @@ steps:
     - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
 
 - label: Fusion E2E TP2 (MI325)
+  mirror_hardwares: [amdexperimental, amdproduction, tj]
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   device: mi325_2
diff --git a/tests/compile/passes/test_silu_mul_quant_fusion.py b/tests/compile/passes/test_silu_mul_quant_fusion.py
index c6794a156240..abd32c38ca04 100644
--- a/tests/compile/passes/test_silu_mul_quant_fusion.py
+++ b/tests/compile/passes/test_silu_mul_quant_fusion.py
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Any
 
 import pytest
 import torch
@@ -10,7 +9,7 @@
 from tests.compile.backend import TestBackend
 from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor
 from tests.utils import TestFP8Layer
-from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops
+from vllm._aiter_ops import IS_AITER_FOUND
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
 from vllm.compilation.passes.fusion.act_quant_fusion import (
     FUSED_OPS,
@@ -184,13 +183,6 @@ def ops_in_model_after(self):
     PerTensorTorchFP8ScaledMMLinearKernel,
 ]
 TEST_KERNELS = ROCM_KERNELS if current_platform.is_rocm() else CUDA_KERNELS
-EXTENDED_TESTCASES: list[tuple[type[Any], bool, None]] = []
-# SiluMulGroupFp8Quant is only supported on ROCm
-if current_platform.is_rocm():
-    EXTENDED_TESTCASES.append((TestSiluMulGroupFp8QuantModel, True, None))
-
-if current_platform.is_cuda():
-    EXTENDED_TESTCASES.append((TestSiluMulNvfp4QuantModel, False, None))
 
 
 @pytest.mark.parametrize("num_tokens", [32, 64])
@@ -200,7 +192,10 @@ def ops_in_model_after(self):
 @pytest.mark.parametrize(
     "model_class, enable_quant_fp8_custom_op, force_kernel",
     list(itertools.product([TestSiluMulFp8QuantModel], [True, False], TEST_KERNELS))
-    + EXTENDED_TESTCASES,
+    + [
+        (TestSiluMulNvfp4QuantModel, False, None),
+        (TestSiluMulGroupFp8QuantModel, True, None),
+    ],
 )
 @pytest.mark.skipif(
     envs.VLLM_TARGET_DEVICE not in ["cuda", "rocm"], reason="Only test on CUDA and ROCm"
@@ -246,14 +241,14 @@ def test_fusion_silu_and_mul_quant(
 
     with set_current_vllm_config(config), monkeypatch.context() as m:
         fusion_passes = [ActivationQuantFusionPass(config)]
-        if current_platform.is_rocm() and IS_AITER_FOUND:
+        if IS_AITER_FOUND and model_class is TestSiluMulGroupFp8QuantModel:
+            from vllm._aiter_ops import rocm_aiter_ops
             from vllm.compilation.passes.fusion.rocm_aiter_fusion import (
                 RocmAiterSiluMulFp8GroupQuantFusionPass,
             )
 
             m.setenv("VLLM_ROCM_USE_AITER", "1")
             rocm_aiter_ops.refresh_env_variables()
-
             fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)]
 
         passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)]

From 218fcfb221df513d2c000e3e133e27f9e6f1d010 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Tue, 10 Feb 2026 10:25:11 +0000
Subject: [PATCH 07/16] remove unnecessary change

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 vllm/model_executor/layers/quantization/input_quant_fp8.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py
index ed3b981cf183..5bc78afa43b0 100644
--- a/vllm/model_executor/layers/quantization/input_quant_fp8.py
+++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py
@@ -171,7 +171,6 @@ def forward_native(
         x: torch.Tensor,
         scale: torch.Tensor | None = None,
         scale_ub: torch.Tensor | None = None,
-        **kwargs,
     ):
         if self.is_group_quant and not self.static:
             assert scale is None, "Dynamic group quantization does not use scale"

From befaba1bc95b823ac36b0091632b206b8c6faa76 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Tue, 10 Feb 2026 13:46:06 +0000
Subject: [PATCH 08/16] remove duplicate

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 33ba6689faa5..149767bdd06b 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1721,18 +1721,6 @@ steps:
   - tests/compile/test_silu_mul_quant_fusion.py
   - tests/compile/distributed/test_fusion_all_reduce.py
   - tests/compile/fullgraph/test_full_graph.py
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/attention/attention.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
-  - tests/compile/passes/test_fusion_attn.py
-  - tests/compile/passes/test_silu_mul_quant_fusion.py
-  - tests/compile/passes/distributed/test_fusion_all_reduce.py
-  - tests/compile/fullgraph/test_full_graph.py
   commands:
     - rocm-smi
     # we run all backend tests on ROCm

From ca801a13e3d9dc9c37bb1e429a7134cc4e28e58a Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Tue, 10 Feb 2026 15:16:30 +0000
Subject: [PATCH 09/16] need to add quote

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 149767bdd06b..8d9c6eb62db0 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -620,7 +620,7 @@ steps:
   - vllm/
   - tests/compile
   commands:
-  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
+  - "pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'"
     # # Limit to no custom ops to reduce running time
     # # Wrap with quotes to escape yaml and avoid starting -k string with a -
     # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
@@ -1689,7 +1689,7 @@ steps:
   - tests/compile/correctness_e2e/test_sequence_parallel.py
   commands:
   - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
+  - "pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py"
 
 # corresponds to .buildkite/test_areas/compile.yaml
 - label: AsyncTP Correctness Tests  (2xMI325 GPUs)
@@ -1701,7 +1701,7 @@ steps:
   num_devices: 2
   commands:
   - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-  - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py
+  - "pytest -v -s tests/compile/correctness_e2e/test_async_tp.py"
 
 # corresponds to .buildkite/test_areas/compile.yaml
 - label: Fusion and Compile Unit Tests (2xMI325 GPUs)
@@ -1724,8 +1724,8 @@ steps:
   commands:
     - rocm-smi
     # we run all backend tests on ROCm
-    - pytest -v -s tests/compile/passes/test_fusion_attn.py
-    - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
+    - "pytest -v -s tests/compile/passes/test_fusion_attn.py"
+    - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py"
     # TODO: this test is not supported on ROCm, there are aiter kernels for this.
     # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
     # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
@@ -1748,7 +1748,7 @@ steps:
   commands:
     - rocm-smi
     # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"'
     # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
     # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet.
     # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
@@ -1772,8 +1772,8 @@ steps:
   commands:
     - rocm-smi
     # Run just llama3 (fp8) for all config combinations
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
-    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
+    - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"'
+    - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"'
 
 - label: Fusion E2E TP2 Quick (MI325)
   timeout_in_minutes: 40
@@ -1790,7 +1790,7 @@ steps:
   commands:
     - rocm-smi
     # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"'
 
 # corresponds to .buildkite/test_areas/compile.yaml
 - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325)
@@ -1811,7 +1811,7 @@ steps:
   commands:
     - rocm-smi
     # Run just llama3 (fp8 & bf16) for all config combinations
-    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
+    - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"'
 
 - label: Fusion E2E TP2 (MI325)
   mirror_hardwares: [amdexperimental, amdproduction, tj]
@@ -1828,4 +1828,4 @@ steps:
   commands:
     - rocm-smi
     # Run all models and attn backends but only Inductor partition and native custom ops
-    - pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - 'pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"'

From 0b65174b5ad3074344ae519852d28561bd155f46 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Tue, 10 Feb 2026 15:47:57 +0000
Subject: [PATCH 10/16] fix syntax

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 8d9c6eb62db0..439075c6a843 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1748,7 +1748,7 @@ steps:
   commands:
     - rocm-smi
     # Run all models and attn backends but only Inductor partition and native custom ops
-    - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"'
+    - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
     # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
     # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet.
     # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
@@ -1772,8 +1772,8 @@ steps:
   commands:
     - rocm-smi
     # Run just llama3 (fp8) for all config combinations
-    - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"'
-    - 'pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"'
+    - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'"
+    - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'"
 
 - label: Fusion E2E TP2 Quick (MI325)
   timeout_in_minutes: 40
@@ -1790,7 +1790,7 @@ steps:
   commands:
     - rocm-smi
     # Run all models and attn backends but only Inductor partition and native custom ops
-    - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"'
+    - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
 
 # corresponds to .buildkite/test_areas/compile.yaml
 - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325)
@@ -1811,7 +1811,7 @@ steps:
   commands:
     - rocm-smi
     # Run just llama3 (fp8 & bf16) for all config combinations
-    - 'pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"'
+    - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'"
 
 - label: Fusion E2E TP2 (MI325)
   mirror_hardwares: [amdexperimental, amdproduction, tj]
@@ -1828,4 +1828,4 @@ steps:
   commands:
     - rocm-smi
     # Run all models and attn backends but only Inductor partition and native custom ops
-    - 'pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"'
+    - "pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"

From be40a224ccc64d072f2664d15b58eee9fe46c4b2 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Tue, 10 Feb 2026 16:49:52 +0000
Subject: [PATCH 11/16] fix Fusion E2E TP2 (MI325) path

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 439075c6a843..407a3d671803 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1814,7 +1814,7 @@ steps:
     - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'"
 
 - label: Fusion E2E TP2 (MI325)
-  mirror_hardwares: [amdexperimental, amdproduction, tj]
+  mirror_hardwares: [amdexperimental, amdproduction, tj, tj2]
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   device: mi325_2
@@ -1828,4 +1828,4 @@ steps:
   commands:
     - rocm-smi
     # Run all models and attn backends but only Inductor partition and native custom ops
-    - "pytest -v -s vllm/tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
+    - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"

From d8d071254ffbafcabb69fd9221e08a755298e7e5 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 11 Feb 2026 02:48:27 +0000
Subject: [PATCH 12/16] fix test-amd syntax

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 407a3d671803..8d469a39b042 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1696,7 +1696,7 @@ steps:
   timeout_in_minutes: 50
   working_dir: "/vllm-workspace/"
   mirror_hardwares: [amdexperimental, amdproduction, tj]
-  device: mi325_2
+  agent_pool: mi325_2
   optional: true
   num_devices: 2
   commands:
@@ -1750,7 +1750,7 @@ steps:
     # Run all models and attn backends but only Inductor partition and native custom ops
     - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
     # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
-    # TODO: Qwen uses group quantizatio which the pattern matcher on ROCm is not supported yet.
+    # TODO: Qwen uses group quantization which the pattern matcher on ROCm is not supported yet.
     # - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
 
 # corresponds to .buildkite/test_areas/compile.yaml
@@ -1779,7 +1779,7 @@ steps:
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   mirror_hardwares: [amdexperimental, amdproduction, tj]
-  device: mi325_1
+  agent_pool: mi325_1
   num_devices: 2
   source_file_dependencies:
     - csrc/quantization/
@@ -1790,14 +1790,14 @@ steps:
   commands:
     - rocm-smi
     # Run all models and attn backends but only Inductor partition and native custom ops
-    - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
+    - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
 
 # corresponds to .buildkite/test_areas/compile.yaml
 - label: Fusion E2E TP2 AsyncTP Config Sweep (MI325)
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
   mirror_hardwares: [amdexperimental, amdproduction, tj]
-  device: mi325_2
+  agent_pool: mi325_2
   num_devices: 2
   source_file_dependencies:
     - csrc/quantization/
@@ -1811,13 +1811,13 @@ steps:
   commands:
     - rocm-smi
     # Run just llama3 (fp8 & bf16) for all config combinations
-    - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'"
+    - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'"
 
 - label: Fusion E2E TP2 (MI325)
   mirror_hardwares: [amdexperimental, amdproduction, tj, tj2]
   timeout_in_minutes: 40
   working_dir: "/vllm-workspace/"
-  device: mi325_2
+  agent_pool: mi325_2
   num_devices: 2
   source_file_dependencies:
     - csrc/quantization/
@@ -1828,4 +1828,4 @@ steps:
   commands:
     - rocm-smi
     # Run all models and attn backends but only Inductor partition and native custom ops
-    - "pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
+    - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"

From 5c879d18a64b06dfdfbe21021c0ae348d052bf0f Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Wed, 11 Feb 2026 08:02:54 +0000
Subject: [PATCH 13/16] remove unsupported test cases

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 tests/compile/fullgraph/test_full_graph.py |  8 +++-----
 tests/compile/passes/test_fusion.py        | 15 ++++++++++-----
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/tests/compile/fullgraph/test_full_graph.py b/tests/compile/fullgraph/test_full_graph.py
index 921f57cea0a6..37aec789d62e 100644
--- a/tests/compile/fullgraph/test_full_graph.py
+++ b/tests/compile/fullgraph/test_full_graph.py
@@ -205,11 +205,9 @@ def test_custom_compile_config(
         ]
         if current_platform.is_cuda()
         else [
-            # ("Qwen/Qwen2-0.5B", None),  # Standard attention model
-            # (
-            #     "deepseek-ai/DeepSeek-V2-Lite",
-            #     AttentionBackendEnum.TRITON_MLA,
-            # ),  # MLA (Multi-head Latent Attention) model
+            # TRITON_MLA does not support FP8 KV cache,
+            # so we can skip the standard attention model
+            # test.
             (
                 "deepseek-ai/DeepSeek-V2-Lite",
                 AttentionBackendEnum.ROCM_AITER_MLA,
diff --git a/tests/compile/passes/test_fusion.py b/tests/compile/passes/test_fusion.py
index a2128150f701..aa733a744db3 100644
--- a/tests/compile/passes/test_fusion.py
+++ b/tests/compile/passes/test_fusion.py
@@ -87,9 +87,16 @@
     (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN),
     # ChannelWiseTorchFP8ScaledMMLinearKernel only supports per-token
     (ChannelWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN),
+    # 
     # Blockwise group shapes (no kernel abstraction)
-    (None, GroupShape(1, 128)),
-    (None, GroupShape(1, 64)),
+    # (None, GroupShape(1, 128)),
+    # (None, GroupShape(1, 64)),
+    #     
+    # ^ This is not supported yet: See 
+    # PR https://github.com/vllm-project/vllm/pull/30845
+    # TODO: enable Blockwise group shapes
+    # 
+
 ]
 
 KERNEL_GROUPSHAPE_COMBINATIONS = (
@@ -99,9 +106,7 @@
 )
 
 # For Aiter tests we toggle use_aiter_quant_op
-AITER_KERNEL_GROUPSHAPE_COMBINATIONS = [
-    # Per-token with ROCmFP8ScaledMMLinearKernel
-    (ROCmFP8ScaledMMLinearKernel, GroupShape.PER_TENSOR, False),
+AITER_KERNEL_GROUPSHAPE_COMBINATIONS = [ 
     # Per-token with RowWiseTorchFP8ScaledMMLinearKernel
     (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN, True),
     (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN, False),

From 727ff29127228e3fba417caec031e6c85c32e1e7 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Wed, 11 Feb 2026 08:06:12 +0000
Subject: [PATCH 14/16] add fp8 kv cache test to amd ci

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 .buildkite/test-amd.yaml            |  5 ++---
 tests/compile/passes/test_fusion.py | 11 +++++------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 0507c617745a..5e0cd7eabd8c 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1726,11 +1726,10 @@ steps:
     # we run all backend tests on ROCm
     - "pytest -v -s tests/compile/passes/test_fusion_attn.py"
     - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py"
+    - "pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile"
     # TODO: this test is not supported on ROCm, there are aiter kernels for this.
     # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
-    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
-    # TODO: this test is not supported on ROCm
-    # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
+    
 
 # corresponds to .buildkite/test_areas/compile.yaml
 - label: Fusion E2E Quick (MI325)
diff --git a/tests/compile/passes/test_fusion.py b/tests/compile/passes/test_fusion.py
index aa733a744db3..05f766b1e9e8 100644
--- a/tests/compile/passes/test_fusion.py
+++ b/tests/compile/passes/test_fusion.py
@@ -87,16 +87,15 @@
     (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN),
     # ChannelWiseTorchFP8ScaledMMLinearKernel only supports per-token
     (ChannelWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN),
-    # 
+    #
     # Blockwise group shapes (no kernel abstraction)
     # (None, GroupShape(1, 128)),
     # (None, GroupShape(1, 64)),
-    #     
-    # ^ This is not supported yet: See 
+    #
+    # ^ This is not supported yet: See
     # PR https://github.com/vllm-project/vllm/pull/30845
     # TODO: enable Blockwise group shapes
-    # 
-
+    #
 ]
 
 KERNEL_GROUPSHAPE_COMBINATIONS = (
@@ -106,7 +105,7 @@
 )
 
 # For Aiter tests we toggle use_aiter_quant_op
-AITER_KERNEL_GROUPSHAPE_COMBINATIONS = [ 
+AITER_KERNEL_GROUPSHAPE_COMBINATIONS = [
     # Per-token with RowWiseTorchFP8ScaledMMLinearKernel
     (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN, True),
     (RowWiseTorchFP8ScaledMMLinearKernel, GroupShape.PER_TOKEN, False),

From 0aa05b1e5f3f2cbdfc0747b75acd4de072261813 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Wed, 11 Feb 2026 08:17:25 +0000
Subject: [PATCH 15/16] run PyTorch Compilation Unit Tests in multiple
 instances

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 5e0cd7eabd8c..2a41df731e05 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -590,7 +590,9 @@ steps:
   - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
   # TODO: clean up this comment if not needed. It is used to 
   # keep track of the tests changes during vLLM IR Ops refactoring.
-  - pytest -s -v compile/passes --ignore compile/passes/distributed
+  # Use `find` to launch multiple instances of pytest.
+  - "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
+
 
 # corresponds to .buildkite/test_areas/pytorch.yaml
 - label: PyTorch Fullgraph Smoke Test # 15min

From 5f5f4423ece39ea3ed940fdeef17063abccb8945 Mon Sep 17 00:00:00 2001
From: vllmellm <vllm.ellm@embeddedllm.com>
Date: Wed, 11 Feb 2026 08:33:48 +0000
Subject: [PATCH 16/16] remove async tp tests for AMD ci

Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
---
 .buildkite/test-amd.yaml                      | 54 -------------------
 tests/compile/fusions_e2e/models.py           | 30 +++++------
 .../compile/fusions_e2e/test_tp2_async_tp.py  | 14 ++---
 3 files changed, 18 insertions(+), 80 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 2a41df731e05..5d986207d354 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1776,57 +1776,3 @@ steps:
     - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'"
     - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'"
 
-- label: Fusion E2E TP2 Quick (MI325)
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  mirror_hardwares: [amdexperimental, amdproduction, tj]
-  agent_pool: mi325_1
-  num_devices: 2
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/model_executor/
-    - vllm/v1/attention/
-    - vllm/compilation/
-    - tests/compile/fusions_e2e/
-  commands:
-    - rocm-smi
-    # Run all models and attn backends but only Inductor partition and native custom ops
-    - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
-
-# corresponds to .buildkite/test_areas/compile.yaml
-- label: Fusion E2E TP2 AsyncTP Config Sweep (MI325)
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  mirror_hardwares: [amdexperimental, amdproduction, tj]
-  agent_pool: mi325_2
-  num_devices: 2
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/compilation/
-    # can affect pattern matching
-    - vllm/model_executor/layers/layernorm.py
-    - vllm/model_executor/layers/activation.py
-    - vllm/model_executor/layers/attention/attention.py
-    - vllm/model_executor/layers/quantization/input_quant_fp8.py
-    - tests/compile/fusions_e2e/
-  commands:
-    - rocm-smi
-    # Run just llama3 (fp8 & bf16) for all config combinations
-    - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'llama-3'"
-
-- label: Fusion E2E TP2 (MI325)
-  mirror_hardwares: [amdexperimental, amdproduction, tj, tj2]
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  agent_pool: mi325_2
-  num_devices: 2
-  source_file_dependencies:
-    - csrc/quantization/
-    - vllm/model_executor/
-    - vllm/v1/attention/
-    - vllm/compilation/
-    - tests/compile/fusions_e2e/
-  commands:
-    - rocm-smi
-    # Run all models and attn backends but only Inductor partition and native custom ops
-    - "pytest -v -rsx tests/compile/fusions_e2e/test_tp2_async_tp.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
diff --git a/tests/compile/fusions_e2e/models.py b/tests/compile/fusions_e2e/models.py
index 77cb1b4d3ad9..c31cfe97426c 100644
--- a/tests/compile/fusions_e2e/models.py
+++ b/tests/compile/fusions_e2e/models.py
@@ -75,9 +75,9 @@
             rms_quant_fusion=n_layers * 2,
             act_quant_fusion=n_layers,
             attn_quant_fusion=n_layers,
-            ar_rms_fusion=0,
-            sequence_parallel=n_layers * 2 + 1,
-            async_tp=n_layers * 4,
+            ar_rms_fusion=0,  # Not supported
+            sequence_parallel=0,  # Not supported
+            async_tp=0,  # Not supported
         )
     ),
 )
@@ -101,9 +101,9 @@
             rms_quant_fusion=0,
             act_quant_fusion=n_layers,
             attn_quant_fusion=n_layers,
-            ar_rms_fusion=0,
-            sequence_parallel=n_layers * 2 + 1,
-            async_tp=n_layers * 4,
+            ar_rms_fusion=0,  # Not supported
+            sequence_parallel=0,  # Not supported
+            async_tp=0,  # Not supported
         )
     ),
 )
@@ -131,7 +131,7 @@
         lambda n_layers: Matches(
             rms_quant_fusion=n_layers,
             attn_quant_fusion=n_layers,
-            sequence_parallel=n_layers * 2,
+            sequence_parallel=0,  # Not supported
         )
     ),
 )
@@ -154,8 +154,8 @@
         lambda n_layers: Matches(
             rms_quant_fusion=0,
             attn_quant_fusion=n_layers,
-            sequence_parallel=n_layers * 2,
-            async_tp=n_layers * 2 - 1,
+            sequence_parallel=0,  # Not supported
+            async_tp=0,  # Not supported
         )
     ),
 )
@@ -175,9 +175,9 @@
     else (
         lambda n_layers: Matches(
             norm_rope_fusion=n_layers,
-            ar_rms_fusion=0,
-            sequence_parallel=n_layers * 2 + 1,
-            async_tp=n_layers * 2,
+            ar_rms_fusion=0,  # Not supported
+            sequence_parallel=0,  # Not supported
+            async_tp=0,  # Not supported
         )
     ),
 )
@@ -200,9 +200,9 @@
         lambda n_layers: Matches(
             rms_quant_fusion=n_layers,
             norm_rope_fusion=n_layers,
-            ar_rms_fusion=0,
-            sequence_parallel=n_layers * 2 + 1,
-            async_tp=n_layers * 2,
+            ar_rms_fusion=0,  # Not supported
+            sequence_parallel=0,  # Not supported
+            async_tp=0,  # Not supported
         )
     ),
 )
diff --git a/tests/compile/fusions_e2e/test_tp2_async_tp.py b/tests/compile/fusions_e2e/test_tp2_async_tp.py
index fb743c1ba7d3..4769ca1e0b63 100644
--- a/tests/compile/fusions_e2e/test_tp2_async_tp.py
+++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py
@@ -5,7 +5,6 @@
 import pytest
 
 from vllm.config import PassConfig
-from vllm.platforms import current_platform
 
 from ...utils import multi_gpu_test
 from .common import (
@@ -17,8 +16,6 @@
 )
 from .models import (
     FLASHINFER_ATTN,
-    ROCM_AITER_UNIFIED_ATTN,
-    ROCM_ATTN,
     TRITON_ATTN,
     llama3_8b,
     llama3_8b_fp8,
@@ -32,14 +29,9 @@
     "model_name, matches_fn, model_kwargs, hf_overrides",
     [llama3_8b_fp8, llama4_scout_fp8],
 )
-@pytest.mark.parametrize(
-    "attn_backend",
-    [TRITON_ATTN, FLASHINFER_ATTN]
-    if current_platform.is_cuda()
-    else [TRITON_ATTN, ROCM_ATTN, ROCM_AITER_UNIFIED_ATTN],
-)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
 @pytest.mark.parametrize("n_layers", [4])
-@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("quant_fp8", "rms_norm")))
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
 @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
 def test_tp2_async_tp_fp8_fusions(
     model_name: str,
@@ -104,7 +96,7 @@ def test_tp2_async_tp_fp8_fusions(
 )
 @pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
 @pytest.mark.parametrize("n_layers", [4])
-@pytest.mark.parametrize("custom_ops", list(custom_ops_combos("rms_norm")))
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
 @pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
 def test_tp2_async_tp_fusions(
     model_name: str,