vllm-project · AndreasKaratzas · Mar 3, 2026 · Mar 4, 2026 · Mar 4, 2026 · Mar 4, 2026
diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml
@@ -14,3 +14,8 @@ steps:
   - pytest -v -s basic_correctness/test_cumem.py
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi250_1
+      depends_on:
+      - image-build-amd
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
@@ -49,6 +49,27 @@ steps:
   - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
   - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+  mirror:  # amd: must be nested under this key (a sibling amd: leaves mirror empty)
+    amd:
+      device: mi250_2
+      depends_on:
+      - image-build-amd
+      commands:  # AMD-specific command list: HIP workaround first, then the test commands
+        # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+        # TODO: Remove when the bug is fixed in a future ROCm release
+        - export TORCH_NCCL_BLOCKING_WAIT=1
+        # NOTE: The rest is in complete parity with CUDA tests
+        - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+        - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+        - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+        - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+        - pytest -v -s entrypoints/llm/test_collective_rpc.py
+        - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
+        - pytest -v -s ./compile/test_wrapper.py
+        - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+        - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+        - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+        - pytest -v -s v1/worker/test_worker_memory_snapshot.py
 
 - label: Distributed Torchrun + Examples (4 GPUs)
   timeout_in_minutes: 30
@@ -87,6 +108,40 @@ steps:
   - cd new_weight_syncing
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
+  mirror:  # amd: must be nested under this key (a sibling amd: leaves mirror empty)
+    amd:
+      device: mi250_4
+      depends_on:
+      - image-build-amd
+      commands:  # AMD-specific command list: HIP workaround first, then the test commands
+      # Work around HIP bug tracked here: https://github.com/ROCm/hip/issues/3876
+      # TODO: Remove when the bug is fixed in a future ROCm release
+      - export TORCH_NCCL_BLOCKING_WAIT=1
+      # NOTE: The rest is in complete parity with CUDA tests
+      - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+      - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+      - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+      - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+      - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+      - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+      - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
+      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
+      - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+      - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+      - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
+      - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
+      - pytest -v -s distributed/test_utils.py
+      - pytest -v -s compile/fullgraph/test_basic_correctness.py
+      - pytest -v -s distributed/test_pynccl.py
+      - pytest -v -s distributed/test_events.py
+      - pytest -v -s distributed/test_symm_mem_allreduce.py
+      - cd ../examples/offline_inference
+      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+      - cd new_weight_syncing
+      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_nccl.py
+      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf_ipc.py
 
 - label: Distributed DP Tests (4 GPUs)
   timeout_in_minutes: 30
@@ -161,7 +216,7 @@ steps:
   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   - pytest -v -s -x lora/test_mixtral.py
 
-- label: Distributed Tests (2 GPUs)(H100)
+- label: Distributed Tests (2 GPUs)(H100-MI325)  # "H100" matches device below; "MI325" presumably names the AMD mirror target — confirm
   timeout_in_minutes: 15
   device: h100
   optional: true
@@ -172,6 +227,16 @@ steps:
     - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
     - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
     - pytest -v -s tests/v1/distributed/test_dbo.py
+  mirror:
+    amd:
+      device: mi250_2  # NOTE(review): step label says "H100-MI325" — confirm mi250_2 is intended
+      depends_on:
+      - image-build-amd
+      commands:  # AMD variant: no DeepGEMM/deepep backend; otherwise mirrors the CUDA commands
+      - pytest -v -s tests/distributed/test_context_parallel.py
+      - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 examples/offline_inference/new_weight_syncing/rlhf_async_new_apis.py
+      - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
+      - pytest -v -s tests/v1/distributed/test_dbo.py
 
 - label: Distributed Tests (2 GPUs)(B200)
   device: b200

diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
@@ -22,6 +22,11 @@ steps:
   commands:
     - pytest -v -s v1/engine/test_preprocess_error_handling.py
     - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: e2e Scheduling (1 GPU)
   timeout_in_minutes: 30
@@ -30,6 +35,11 @@ steps:
     - tests/v1/e2e/general/
   commands:
     - pytest -v -s v1/e2e/general/test_async_scheduling.py
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: e2e Core (1 GPU)
   timeout_in_minutes: 30
@@ -38,6 +48,11 @@ steps:
     - tests/v1/e2e/general/
   commands:
     - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: V1 e2e (2 GPUs)
   timeout_in_minutes: 60 # TODO: Fix timeout after we have more confidence in the test stability
@@ -51,7 +66,7 @@ steps:
     - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
   mirror:
     amd:
-      device: mi325_2
+      device: mi250_2  # NOTE(review): retargeted from mi325_2 — confirm the suite passes on mi250_2
       depends_on:
       - image-build-amd
 

diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
@@ -55,6 +55,11 @@ steps:
   - pytest -v -s entrypoints/instrumentator
   - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
   - pytest -v -s tool_use
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi250_1
+      depends_on:
+      - image-build-amd
 
 - label: Entrypoints Integration (Pooling)
   timeout_in_minutes: 50
@@ -84,7 +89,7 @@ steps:
     - pytest -v -s v1/entrypoints
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1  # NOTE(review): retargeted from mi325_1 — confirm the suite passes on mi250_1
       depends_on:
       - image-build-amd
 

diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml
@@ -10,7 +10,11 @@ steps:
   commands:
     - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py
   parallelism: 4
-
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the sharded step command — confirm)
+    amd:
+      device: mi250_1
+      depends_on:
+      - image-build-amd
 
 - label: LoRA TP (Distributed)
   timeout_in_minutes: 30

diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml
@@ -88,6 +88,11 @@ steps:
     - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
     # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
     - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi250_1
+      depends_on:
+      - image-build-amd
 
 - label: Metrics, Tracing (2 GPUs)
   timeout_in_minutes: 20

diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
@@ -38,7 +38,7 @@ steps:
     - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1  # NOTE(review): retargeted from mi325_1 — confirm the suite passes on mi250_1
       depends_on:
       - image-build-amd
 

diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml
@@ -12,6 +12,11 @@ steps:
     # Test standard language models, excluding a subset of slow tests
     - pip freeze | grep -E 'torch'
     - pytest -v -s models/language -m 'core_model and (not slow_test)'
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Language Models Tests (Extra Standard) %N
   timeout_in_minutes: 45
@@ -27,6 +32,16 @@ steps:
     - pip freeze | grep -E 'torch'
     - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
   parallelism: 2
+  mirror:
+    amd:
+      device: mi250_1
+      depends_on:
+      - image-build-amd
+      commands:  # AMD-specific command list; differs from the step's commands only by the export below
+      - export TORCH_NCCL_BLOCKING_WAIT=1  # presumably the HIP workaround (ROCm/hip#3876) also used in distributed.yaml — confirm
+      # NOTE: The rest is in complete parity with CUDA tests
+      - pip freeze | grep -E 'torch'
+      - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB
 
 - label: Language Models Tests (Hybrid) %N
   timeout_in_minutes: 75
@@ -84,7 +99,7 @@ steps:
     - pytest -v -s models/language/pooling -m 'not core_model'
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1  # NOTE(review): retargeted from mi325_1 — confirm the suite passes on mi250_1
       depends_on:
       - image-build-amd
 

diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml
@@ -84,6 +84,11 @@ steps:
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/processing/test_tensor_schema.py
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi325_1
+      depends_on:
+      - image-build-amd
 
 - label: Multi-Modal Accuracy Eval (Small Models) # 50min
   timeout_in_minutes: 70
@@ -117,6 +122,11 @@ steps:
   commands:
     - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
     - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi250_1
+      depends_on:
+      - image-build-amd
 
 - label: Multi-Modal Models (Extended) 3
   optional: true

diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml
@@ -39,3 +39,8 @@ steps:
   - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi250_2
+      depends_on:
+      - image-build-amd
diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml
@@ -44,3 +44,8 @@ steps:
   - tests/models/quantization
   commands:
     - pytest -v -s models/quantization
+  mirror:  # AMD mirror; lists no commands of its own (presumably reuses the step's — confirm)
+    amd:
+      device: mi355_1
+      depends_on:
+      - image-build-amd
diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml
@@ -14,7 +14,7 @@ steps:
     - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
   mirror:
     amd:
-      device: mi325_1
+      device: mi250_1  # NOTE(review): retargeted from mi325_1 — confirm the suite passes on mi250_1
       depends_on:
       - image-build-amd
       commands: