From 7aded98e8e0ef6f9a1f3681c034d79189832f4a5 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Tue, 3 Mar 2026 13:02:28 -0600
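Subject: [PATCH 1/5] Fix quoting of pytest marker expressions

re_quote_pytest_markers() wraps a collected marker expression in single
quotes. If the expression still carries surrounding double quotes from
the upstream single-to-double conversion, the result is '"expr"' with
literal double-quote characters. Strip the surrounding double quotes
before wrapping, both in the mid-stream flush and in the trailing flush.

Signed-off-by: Andreas Karatzas <akaratza@amd.com>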
---
 .buildkite/scripts/hardware_ci/run-amd-test.sh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 8895771f0a40..70eef322bd19 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -205,6 +205,13 @@ re_quote_pytest_markers() {
       esac
 
       if $is_boundary; then
+        # Strip surrounding double quotes if present (from upstream
+        # single-to-double conversion); without this, wrapping below
+        # would produce '"expr"' with literal double-quote characters.
+        if [[ "$marker_buf" == '"'*'"' ]]; then
+          marker_buf="${marker_buf#\"}"
+          marker_buf="${marker_buf%\"}"
+        fi
         # Flush the collected marker expression
         if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
           output+="'${marker_buf}' "
@@ -242,6 +249,11 @@ re_quote_pytest_markers() {
 
   # Flush any trailing marker expression (marker at end of command)
   if $collecting && [[ -n "$marker_buf" ]]; then
+    # Strip surrounding double quotes (see mid-stream flush comment)
+    if [[ "$marker_buf" == '"'*'"' ]]; then
+      marker_buf="${marker_buf#\"}"
+      marker_buf="${marker_buf%\"}"
+    fi
     if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
       output+="'${marker_buf}'"
     else

From 09d0124c76d2a159457f1d9b07df4e8cf6b59a8b Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Tue, 3 Mar 2026 18:51:05 -0600
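Subject: [PATCH 2/5] Adding more mirrors and testing new orchestrator

Add AMD mirrors (device mi325_1, depending on image-build-amd) to two
steps in expert_parallelism.yaml, two in misc.yaml, two in
models_language.yaml and four in models_multimodal.yaml.

Also pass BUILDKITE_PARALLEL_JOB and BUILDKITE_PARALLEL_JOB_COUNT
through the `-e` environment flags in run-amd-test.sh, since some of
the mirrored commands shard on those variables.

Signed-off-by: Andreas Karatzas <akaratza@amd.com>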
---
 .../scripts/hardware_ci/run-amd-test.sh       |  2 ++
 .buildkite/test_areas/expert_parallelism.yaml | 10 ++++++++++
 .buildkite/test_areas/misc.yaml               | 10 ++++++++++
 .buildkite/test_areas/models_language.yaml    | 10 ++++++++++
 .buildkite/test_areas/models_multimodal.yaml  | 20 +++++++++++++++++++
 5 files changed, 52 insertions(+)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 70eef322bd19..1c43c404d247 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -504,6 +504,8 @@ else
     -e HF_TOKEN \
     -e AWS_ACCESS_KEY_ID \
     -e AWS_SECRET_ACCESS_KEY \
+    -e BUILDKITE_PARALLEL_JOB \
+    -e BUILDKITE_PARALLEL_JOB_COUNT \
     -v "${HF_CACHE}:${HF_MOUNT}" \
     -e "HF_HOME=${HF_MOUNT}" \
     -e "PYTHONPATH=${MYPYTHONPATH}" \
diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml
index 1443d847eaf5..5c9f2f5a23e6 100644
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -10,6 +10,11 @@ steps:
   - tests/distributed/test_eplb_algo.py
   commands:
   - pytest -v -s distributed/test_eplb_algo.py
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: EPLB Execution
   timeout_in_minutes: 20
@@ -21,6 +26,11 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
   - pytest -v -s distributed/test_eplb_spec_decode.py
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Elastic EP Scaling Test
   timeout_in_minutes: 20
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index d8957c217755..21b9522ae6b5 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -57,6 +57,11 @@ steps:
   - pip install modelscope
   - pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Examples
   timeout_in_minutes: 45
@@ -122,6 +127,11 @@ steps:
   - pytest -v -s detokenizer
   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
   depends_on: 
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
index a3bd21ccff3c..bf1742f037c0 100644
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@@ -12,6 +12,11 @@ steps:
     # Test standard language models, excluding a subset of slow tests
     - pip freeze | grep -E 'torch'
     - pytest -v -s models/language -m 'core_model and (not slow_test)'
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Language Models Tests (Extra Standard) %N
   timeout_in_minutes: 45
@@ -42,6 +47,11 @@ steps:
     # Shard hybrid language model tests
     - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
   parallelism: 2
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Language Models Test (Extended Generation) # 80min
   timeout_in_minutes: 110
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
index a1194c229866..cd58d47a1649 100644
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -45,6 +45,11 @@ steps:
   - vllm/v1/core/
   commands:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Multi-Modal Models (Extended) 1
   optional: true
@@ -54,6 +59,11 @@ steps:
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Multi-Modal Models (Extended) 2
   optional: true
@@ -63,6 +73,11 @@ steps:
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Multi-Modal Models (Extended) 3
   optional: true
@@ -72,3 +87,8 @@ steps:
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd

From 1e8105b4f859fa19b59f559639da7d2390ac4153 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Tue, 3 Mar 2026 23:23:53 -0600
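Subject: [PATCH 3/5] Remove standard language-model mirror until fixed

Remove the AMD mirror from the standard language-model step
(`-m 'core_model and (not slow_test)'`) and from the misc.yaml step
that ends with `pytest -v -s utils_`.

Give the hybrid language-model mirror its own command list, and add an
AMD mirror to the multimodal test_tensor_schema.py step.

Signed-off-by: Andreas Karatzas <akaratza@amd.com>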
---
 .buildkite/test_areas/misc.yaml              |  5 -----
 .buildkite/test_areas/models_language.yaml   | 11 ++++++-----
 .buildkite/test_areas/models_multimodal.yaml |  5 +++++
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index 21b9522ae6b5..c1596adbef23 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -127,11 +127,6 @@ steps:
   - pytest -v -s detokenizer
   - pytest -v -s -m 'not cpu_test' multimodal
   - pytest -v -s utils_
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: Async Engine, Inputs, Utils, Worker, Config (CPU)
   depends_on: 
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
index bf1742f037c0..d0c235800c35 100644
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@@ -12,11 +12,6 @@ steps:
     # Test standard language models, excluding a subset of slow tests
     - pip freeze | grep -E 'torch'
     - pytest -v -s models/language -m 'core_model and (not slow_test)'
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: Language Models Tests (Extra Standard) %N
   timeout_in_minutes: 45
@@ -52,6 +47,12 @@ steps:
       device: mi325_1
       depends_on:
       - image-build-amd
+      commands:
+      - pytest -v -s v1/e2e
+      - pytest -v -s v1/engine
+      - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
+      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
+      - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
 
 - label: Language Models Test (Extended Generation) # 80min
   timeout_in_minutes: 110
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
index cd58d47a1649..538aa69a3270 100644
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -35,6 +35,11 @@ steps:
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Multi-Modal Accuracy Eval (Small Models) # 50min
   timeout_in_minutes: 70

From d5ea9f6c199b170f7d2097a29c05e085729c5342 Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Thu, 5 Mar 2026 16:53:37 -0600
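Subject: [PATCH 4/5] [ROCm][CI] Gating and mirroring more TGs - stage D

Revert the run-amd-test.sh changes made in patches 1/5 and 2/5 (the
double-quote stripping and the BUILDKITE_PARALLEL_JOB* passthrough);
the script returns to blob 8895771f0a40.

Add AMD mirrors to steps in distributed.yaml, lora.yaml,
models_language.yaml and quantization.yaml, and remove the mirrors
previously added to expert_parallelism.yaml, misc.yaml and two
models_multimodal.yaml steps.

Signed-off-by: Andreas Karatzas <akaratza@amd.com>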
---
 .../scripts/hardware_ci/run-amd-test.sh       | 14 ----
 .buildkite/test_areas/distributed.yaml        | 67 ++++++++++++++++++-
 .buildkite/test_areas/expert_parallelism.yaml | 10 ---
 .buildkite/test_areas/lora.yaml               |  6 +-
 .buildkite/test_areas/misc.yaml               |  5 --
 .buildkite/test_areas/models_language.yaml    | 18 ++++-
 .buildkite/test_areas/models_multimodal.yaml  | 10 ---
 .buildkite/test_areas/quantization.yaml       |  5 ++
 8 files changed, 92 insertions(+), 43 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 1c43c404d247..8895771f0a40 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -205,13 +205,6 @@ re_quote_pytest_markers() {
       esac
 
       if $is_boundary; then
-        # Strip surrounding double quotes if present (from upstream
-        # single-to-double conversion); without this, wrapping below
-        # would produce '"expr"' with literal double-quote characters.
-        if [[ "$marker_buf" == '"'*'"' ]]; then
-          marker_buf="${marker_buf#\"}"
-          marker_buf="${marker_buf%\"}"
-        fi
         # Flush the collected marker expression
         if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
           output+="'${marker_buf}' "
@@ -249,11 +242,6 @@ re_quote_pytest_markers() {
 
   # Flush any trailing marker expression (marker at end of command)
   if $collecting && [[ -n "$marker_buf" ]]; then
-    # Strip surrounding double quotes (see mid-stream flush comment)
-    if [[ "$marker_buf" == '"'*'"' ]]; then
-      marker_buf="${marker_buf#\"}"
-      marker_buf="${marker_buf%\"}"
-    fi
     if [[ "$marker_buf" == *" "* || "$marker_buf" == *"("* ]]; then
       output+="'${marker_buf}'"
     else
@@ -504,8 +492,6 @@ else
     -e HF_TOKEN \
     -e AWS_ACCESS_KEY_ID \
     -e AWS_SECRET_ACCESS_KEY \
-    -e BUILDKITE_PARALLEL_JOB \
-    -e BUILDKITE_PARALLEL_JOB_COUNT \
     -v "${HF_CACHE}:${HF_MOUNT}" \
     -e "HF_HOME=${HF_MOUNT}" \
     -e "PYTHONPATH=${MYPYTHONPATH}" \
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 64911983f5a8..8a62f15f9f8b 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -49,6 +49,27 @@ steps:
   - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+  mirror:
+    amd:
+      device: mi325_2
+      depends_on:
+      - image-build-amd
+      commands:
+        # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+        # TODO: Remove when the bug is fixed in a future ROCm release
+        - export TORCH_NCCL_BLOCKING_WAIT=1
+        # NOTE: The rest is in complete parity with CUDA tests
+        - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+        - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+        - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+        - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+        - pytest -v -s entrypoints/llm/test_collective_rpc.py
+        - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+        - pytest -v -s ./compile/test_wrapper.py
+        - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+        - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+        - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+        - pytest -v -s v1/worker/test_worker_memory_snapshot.py
 
 - label: Distributed Tests (4 GPUs)
   timeout_in_minutes: 50
@@ -105,6 +126,40 @@ steps:
   - cd new_weight_syncing
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
+  mirror:
+    amd:
+      device: mi325_4
+      depends_on:
+      - image-build-amd
+      commands:
+      # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+      # TODO: Remove when the bug is fixed in a future ROCm release
+      - export TORCH_NCCL_BLOCKING_WAIT=1
+      # NOTE: The rest is in complete parity with CUDA tests
+      - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+      - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+      - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+      - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+      - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+      - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+      - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+      - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+      - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+      - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+      - pytest -v -s distributed/test_utils.py
+      - pytest -v -s compile/fullgraph/test_basic_correctness.py
+      - pytest -v -s distributed/test_pynccl.py
+      - pytest -v -s distributed/test_events.py
+      - pytest -v -s distributed/test_symm_mem_allreduce.py
+      - cd ../examples/offline_inference
+      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+      - cd new_weight_syncing
+      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
+      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
 
 - label: Distributed Tests (8 GPUs)(H100)
   timeout_in_minutes: 10
@@ -138,7 +193,7 @@ steps:
   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s -x lora/test_mixtral.py
 
-- label: Distributed Tests (2 GPUs)(H100)
+- label: Distributed Tests (2 GPUs)(H100-MI325)
   timeout_in_minutes: 15
   device: h100
   optional: true
@@ -149,6 +204,16 @@ steps:
     # - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py --- failing, need to re-enable
     - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
     - pytest -v -s tests/v1/distributed/test_dbo.py
+  mirror:
+    amd:
+      device: mi325_2
+      depends_on:
+      - image-build-amd
+      commands:
+      - pytest -v -s tests/distributed/test_context_parallel.py
+      - python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
+      - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
+      - pytest -v -s tests/v1/distributed/test_dbo.py
 
 - label: Distributed Tests (2 GPUs)(B200)
   device: b200
diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml
index 5c9f2f5a23e6..1443d847eaf5 100644
--- a/.buildkite/test_areas/expert_parallelism.yaml
+++ b/.buildkite/test_areas/expert_parallelism.yaml
@@ -10,11 +10,6 @@ steps:
   - tests/distributed/test_eplb_algo.py
   commands:
   - pytest -v -s distributed/test_eplb_algo.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: EPLB Execution
   timeout_in_minutes: 20
@@ -26,11 +21,6 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
   - pytest -v -s distributed/test_eplb_spec_decode.py
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: Elastic EP Scaling Test
   timeout_in_minutes: 20
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
index f034175cc1b8..713e13e87263 100644
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -10,7 +10,11 @@ steps:
   commands:
     - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
   parallelism: 4
-
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: LoRA TP (Distributed)
   timeout_in_minutes: 30
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index e7047d449516..dd14a1eac5a4 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -57,11 +57,6 @@ steps:
   - pip install modelscope
   - pytest -v -s test_regression.py
   working_dir: "/vllm-workspace/tests" # optional
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: Examples
   timeout_in_minutes: 45
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
index d0c235800c35..9ba4b52871b5 100644
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@@ -12,6 +12,11 @@ steps:
     # Test standard language models, excluding a subset of slow tests
     - pip freeze | grep -E 'torch'
     - pytest -v -s models/language -m 'core_model and (not slow_test)'
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Language Models Tests (Extra Standard) %N
   timeout_in_minutes: 45
@@ -27,6 +32,16 @@ steps:
     - pip freeze | grep -E 'torch'
     - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
   parallelism: 2
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
+      commands:
+      - export TORCH_NCCL_BLOCKING_WAIT=1
+      # NOTE: The rest is in complete parity with CUDA tests
+      - pip freeze | grep -E 'torch'
+      - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
 
 - label: Language Models Tests (Hybrid) %N
   timeout_in_minutes: 75
@@ -48,9 +63,8 @@ steps:
       depends_on:
       - image-build-amd
       commands:
-      - pytest -v -s v1/e2e
-      - pytest -v -s v1/engine
       - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
+      # NOTE: The rest is in complete parity with CUDA tests
       - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
       - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
 
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
index 4a91234890d8..a18b6b39112d 100644
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -55,11 +55,6 @@ steps:
   - vllm/v1/core/
   commands:
   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
 
 - label: Multi-Modal Models (Extended) 1
   optional: true
@@ -97,8 +92,3 @@ steps:
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
index 5ee2e5186966..9e25df9c3c57 100644
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -44,3 +44,8 @@ steps:
   - tests/models/quantization
   commands:
     - pytest -v -s models/quantization
+  mirror:
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd

From 9f46b90cfc5f531752fff2bdef1b97376482897e Mon Sep 17 00:00:00 2001
From: Andreas Karatzas <akaratza@amd.com>
Date: Sat, 14 Mar 2026 21:29:06 -0500
Subject: [PATCH 5/5] Redistributing tests to different architectures

Move the AMD mirrors touched here from mi325 devices to mi250 devices
(the quantization mirror moves to mi355_1 instead), and drop the AMD
mirror of the hybrid language-model step.

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
---
 .buildkite/test_areas/basic_correctness.yaml |  2 +-
 .buildkite/test_areas/distributed.yaml       |  6 +++---
 .buildkite/test_areas/engine.yaml            |  2 +-
 .buildkite/test_areas/entrypoints.yaml       |  4 ++--
 .buildkite/test_areas/lora.yaml              |  2 +-
 .buildkite/test_areas/misc.yaml              |  2 +-
 .buildkite/test_areas/models_basic.yaml      |  2 +-
 .buildkite/test_areas/models_language.yaml   | 14 ++------------
 .buildkite/test_areas/models_multimodal.yaml |  2 +-
 .buildkite/test_areas/plugins.yaml           |  2 +-
 .buildkite/test_areas/quantization.yaml      |  2 +-
 .buildkite/test_areas/samplers.yaml          |  2 +-
 12 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml
index 5259a66a3c9e..4498bebaf47e 100644
--- a/.buildkite/test_areas/basic_correctness.yaml
+++ b/.buildkite/test_areas/basic_correctness.yaml
@@ -16,6 +16,6 @@ steps:
   - pytest -v -s basic_correctness/test_cpu_offload.py
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 8a62f15f9f8b..daa3d42fbc62 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -51,7 +51,7 @@ steps:
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
   mirror:
     amd:
-      device: mi325_2
+      device: mi250_2
       depends_on:
       - image-build-amd
       commands:
@@ -128,7 +128,7 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
   mirror:
     amd:
-      device: mi325_4
+      device: mi250_4
       depends_on:
       - image-build-amd
       commands:
@@ -206,7 +206,7 @@ steps:
     - pytest -v -s tests/v1/distributed/test_dbo.py
   mirror:
     amd:
-      device: mi325_2
+      device: mi250_2
       depends_on:
       - image-build-amd
       commands:
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
index b5b3eeb6d728..afd4822a84d1 100644
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -49,7 +49,7 @@ steps:
     - pytest -v -s v1/e2e/test_spec_decode.py -k "tensor_parallelism"
   mirror:
     amd:
-      device: mi325_2
+      device: mi250_2
       depends_on:
       - image-build-amd
 
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
index 5796036f3361..bdb7c2adc1f4 100644
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -62,7 +62,7 @@ steps:
   - pytest -v -s tool_use
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
 
@@ -99,7 +99,7 @@ steps:
     - pytest -v -s v1/entrypoints
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
 
diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
index 713e13e87263..85af12faacaf 100644
--- a/.buildkite/test_areas/lora.yaml
+++ b/.buildkite/test_areas/lora.yaml
@@ -12,7 +12,7 @@ steps:
   parallelism: 4
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
 
diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
index dd14a1eac5a4..79b390715602 100644
--- a/.buildkite/test_areas/misc.yaml
+++ b/.buildkite/test_areas/misc.yaml
@@ -89,7 +89,7 @@ steps:
     - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
 
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
index de0f3994dd10..f71fee7afbbb 100644
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -38,7 +38,7 @@ steps:
     - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
     
diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
index 9ba4b52871b5..488b5f7cf986 100644
--- a/.buildkite/test_areas/models_language.yaml
+++ b/.buildkite/test_areas/models_language.yaml
@@ -34,7 +34,7 @@ steps:
   parallelism: 2
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
       commands:
@@ -57,16 +57,6 @@ steps:
     # Shard hybrid language model tests
     - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
   parallelism: 2
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-      commands:
-      - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
-      # NOTE: The rest is in complete parity with CUDA tests
-      - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-      - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
 
 - label: Language Models Test (Extended Generation) # 80min
   timeout_in_minutes: 110
@@ -109,7 +99,7 @@ steps:
     - pytest -v -s models/language/pooling -m 'not core_model'
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
 
diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
index a18b6b39112d..c52d1a2aaade 100644
--- a/.buildkite/test_areas/models_multimodal.yaml
+++ b/.buildkite/test_areas/models_multimodal.yaml
@@ -80,7 +80,7 @@ steps:
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
 
diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml
index 34747a2350db..3b703efd6c65 100644
--- a/.buildkite/test_areas/plugins.yaml
+++ b/.buildkite/test_areas/plugins.yaml
@@ -41,6 +41,6 @@ steps:
   - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
   mirror:
     amd:
-      device: mi325_2
+      device: mi250_2
       depends_on:
       - image-build-amd
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
index 9e25df9c3c57..5465c0bc0fe5 100644
--- a/.buildkite/test_areas/quantization.yaml
+++ b/.buildkite/test_areas/quantization.yaml
@@ -46,6 +46,6 @@ steps:
     - pytest -v -s models/quantization
   mirror:
     amd:
-      device: mi325_1
+      device: mi355_1
       depends_on:
       - image-build-amd
diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml
index 2052a379827a..b782f188e220 100644
--- a/.buildkite/test_areas/samplers.yaml
+++ b/.buildkite/test_areas/samplers.yaml
@@ -14,7 +14,7 @@ steps:
     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1
       depends_on:
       - image-build-amd
       commands: