174 changes: 129 additions & 45 deletions .buildkite/test-amd.yaml
@@ -570,9 +570,11 @@ steps:
--ignore=lora/test_qwen3moe_tp.py
parallelism: 4

##### .buildkite/test_areas/pytorch.yaml #####
# corresponds to .buildkite/test_areas/pytorch.yaml
- label: PyTorch Compilation Unit Tests # 15min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -586,10 +588,16 @@ steps:
# Use `find` to launch multiple instances of pytest so that
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"
# TODO: clean up this comment if not needed. It is used to
# keep track of the test changes during the vLLM IR Ops refactoring.
# Use `find` to launch multiple instances of pytest.
- "find compile/passes -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;"


# corresponds to .buildkite/test_areas/pytorch.yaml
- label: PyTorch Fullgraph Smoke Test # 15min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction]
timeout_in_minutes: 35
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
@@ -603,17 +611,18 @@ steps:
# they do not suffer from https://github.com/vllm-project/vllm/issues/28965
- "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"

- label: PyTorch Fullgraph Test # 27min
timeout_in_minutes: 40
mirror_hardwares: [amdexperimental, amdproduction]
# corresponds to .buildkite/test_areas/pytorch.yaml
- label: PyTorch Fullgraph # 27min
timeout_in_minutes: 30
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_1
# grade: Blocking
torch_nightly: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
- "pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'"
# # Limit to no custom ops to reduce running time
# # Wrap with quotes to escape yaml and avoid starting -k string with a -
# - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
@@ -1176,41 +1185,6 @@ steps:
- pytest -v -s tests/kernels/moe/test_flashinfer.py
- pytest -v -s tests/kernels/moe/test_cutedsl_moe.py

- label: Blackwell Fusion and Compile Tests # 30 min
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
gpu: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/passes/test_fusion_attn.py
- tests/compile/passes/test_silu_mul_quant_fusion.py
- tests/compile/passes/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- nvidia-smi
- pytest -v -s tests/compile/passes/test_fusion_attn.py
- pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_gpus=2 is not set
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py

# # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# # Wrap with quotes to escape yaml
# - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell GPT-OSS Eval
timeout_in_minutes: 60
working_dir: "/vllm-workspace/"
@@ -1334,7 +1308,6 @@ steps:
- pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- pytest -v -s distributed/test_sequence_parallel.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -1558,17 +1531,20 @@ steps:
num_gpus: 2
commands:
- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
# ================= 24 passed, 11 warnings in 192.85s (0:03:12) ==================
- pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
# ================== 48 passed, 8 warnings in 386.41s (0:06:26) ==================
- pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
# ======================== 8 skipped, 9 warnings in 2.08s ========================
#- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
# - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
# Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

- VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py
- pytest -v -s tests/distributed/test_context_parallel.py
# ======================== 4 passed, 3 warnings in 30.45s ========================
- HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
- pytest -v -s tests/v1/distributed/test_dbo.py
# ======================== 2 skipped, 3 warnings in 1.97s ========================

##### B200 test #####
- label: Distributed Tests (B200) # optional
@@ -1692,3 +1668,111 @@ steps:
working_dir: "/vllm-workspace"
commands:
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

##### .buildkite/test_areas/compile.yaml #####
# Setting up the tests incrementally so that it is easier for the
# CI team to review and upstream to pipeline v2.
# The following tests are important for the vLLM IR Ops refactoring,
# which affects fusion passes on ROCm, so we have to
# enable them as soon as possible.

# corresponds to .buildkite/test_areas/compile.yaml
- label: Sequence Parallel Correctness Tests (2xMI325 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_2
num_devices: 2
source_file_dependencies:
- vllm/model_executor/layers/
- vllm/compilation/
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- tests/compile/correctness_e2e/test_sequence_parallel.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- "pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py"

# corresponds to .buildkite/test_areas/compile.yaml
- label: AsyncTP Correctness Tests (2xMI325 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_2
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- "pytest -v -s tests/compile/correctness_e2e/test_async_tp.py"

# corresponds to .buildkite/test_areas/compile.yaml
- label: Fusion and Compile Unit Tests (2xMI325 GPUs)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_2
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/fullgraph/test_full_graph.py
commands:
- rocm-smi
# we run all backend tests on ROCm
- "pytest -v -s tests/compile/passes/test_fusion_attn.py"
- "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py"
- "pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile"
# TODO: this test is not supported on ROCm; there are aiter kernels for this.
# - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py


# corresponds to .buildkite/test_areas/compile.yaml
- label: Fusion E2E Quick (MI325)
timeout_in_minutes: 15
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_1
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- rocm-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# TODO: Qwen uses group quantization, which the pattern matcher on ROCm does not support yet.
# - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

# corresponds to .buildkite/test_areas/compile.yaml
- label: Fusion E2E Config Sweep (MI325)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
mirror_hardwares: [amdexperimental, amdproduction, tj]
agent_pool: mi325_1
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- rocm-smi
# Run just llama3 (fp8) for all config combinations
- "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'llama-3'"
- "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8' -k 'inductor_partition and not +rms_norm and +quant_fp8 and qwen3' -k 'llama-3'"

59 changes: 42 additions & 17 deletions tests/compile/fullgraph/test_full_graph.py
@@ -10,6 +10,7 @@

from tests.quantization.utils import is_quant_method_supported
from vllm import LLM, SamplingParams
from vllm._aiter_ops import rocm_aiter_ops
from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
from vllm.platforms import current_platform
from vllm.utils.torch_utils import is_torch_equal_or_newer
@@ -194,29 +195,53 @@ def test_custom_compile_config(
)
@pytest.mark.parametrize(
"model, backend",
[
("Qwen/Qwen2-0.5B", None), # Standard attention model
(
"deepseek-ai/DeepSeek-V2-Lite",
AttentionBackendEnum.FLASHINFER_MLA,
), # MLA (Multi-head Latent Attention) model
],
(
[
("Qwen/Qwen2-0.5B", None), # Standard attention model
(
"deepseek-ai/DeepSeek-V2-Lite",
AttentionBackendEnum.FLASHINFER_MLA,
), # MLA (Multi-head Latent Attention) model
]
if current_platform.is_cuda()
else [
# TRITON_MLA does not support FP8 KV cache,
# so we skip the standard attention model test.
(
"deepseek-ai/DeepSeek-V2-Lite",
AttentionBackendEnum.ROCM_AITER_MLA,
), # MLA (Multi-head Latent Attention) model
(
"deepseek-ai/DeepSeek-V2-Lite",
AttentionBackendEnum.ROCM_AITER_TRITON_MLA,
), # MLA (Multi-head Latent Attention) model
]
),
)
def test_fp8_kv_scale_compile(
compilation_mode: int,
model: str,
backend: AttentionBackendEnum | None,
monkeypatch: pytest.MonkeyPatch,
):
model_kwargs = {
"quantization": "fp8",
"kv_cache_dtype": "fp8_e4m3",
"calculate_kv_scales": True,
"max_model_len": 512,
}
if backend:
model_kwargs["attention_config"] = {"backend": backend.name}

run_model(compilation_mode, model, **model_kwargs)
with monkeypatch.context() as m:
model_kwargs = {
"quantization": "fp8",
"kv_cache_dtype": "fp8_e4m3" if current_platform.is_cuda() else "fp8",
"calculate_kv_scales": True,
"max_model_len": 512,
}
if backend:
model_kwargs["attention_config"] = {"backend": backend.name}
if current_platform.is_rocm():
m.setenv("VLLM_ROCM_USE_AITER", "1")
# Disable Aiter MOE as some shapes are not supported
m.setenv("VLLM_ROCM_USE_AITER_MOE", "0")

rocm_aiter_ops.refresh_env_variables()

run_model(compilation_mode, model, **model_kwargs)


def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
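To close, a minimal sketch of the cache-then-refresh pattern the new ROCm branch of test_fp8_kv_scale_compile relies on. Everything here except monkeypatch and the VLLM_ROCM_USE_AITER* variable names is a hypothetical stand-in: FeatureFlags plays the role of vLLM's rocm_aiter_ops, whose refresh_env_variables() the test calls so that flags cached before the env vars were patched get re-read.

import os

import pytest


class FeatureFlags:
    """Caches env-derived switches once, as rocm_aiter_ops is assumed to do."""

    def __init__(self) -> None:
        self.refresh_env_variables()

    def refresh_env_variables(self) -> None:
        # Re-read the environment; without this, values set after the cache
        # was built (e.g. via monkeypatch) are never observed.
        self.use_aiter = os.environ.get("VLLM_ROCM_USE_AITER", "0") == "1"
        self.use_aiter_moe = os.environ.get("VLLM_ROCM_USE_AITER_MOE", "1") == "1"


flags = FeatureFlags()  # built at import time, before any test runs


def test_env_refresh_pattern(monkeypatch: pytest.MonkeyPatch) -> None:
    with monkeypatch.context() as m:
        m.setenv("VLLM_ROCM_USE_AITER", "1")
        m.setenv("VLLM_ROCM_USE_AITER_MOE", "0")
        flags.refresh_env_variables()  # pick up the patched values
        assert flags.use_aiter and not flags.use_aiter_moe
    # monkeypatch.context() restores the environment on exit; refresh again
    # afterwards if later code depends on the original flag values.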