vllm-project · hsliuustc0106 · Mar 19, 2026 · Mar 3, 2026 · Mar 3, 2026 · Mar 3, 2026
@@ -90,10 +90,18 @@ upload_pipeline() {
     FAIL_FAST=$(fail_fast)
 
     cd .buildkite
+
+    # Select test definition file: merge suite for main, ready suite for PRs
+    if [[ $BUILDKITE_BRANCH == "main" ]]; then
+        TEST_YAML="test-amd-merge.yml"
+    else
+        TEST_YAML="test-amd-ready.yaml"
+    fi
+
     (
         set -x
         # Output pipeline.yaml with all blank lines removed
-        minijinja-cli test-template.j2 test-amd.yaml \
+        minijinja-cli test-template.j2 "$TEST_YAML" \
             -D branch="$BUILDKITE_BRANCH" \
             -D list_file_diff="$LIST_FILE_DIFF" \
             -D run_all="$RUN_ALL" \

@@ -87,7 +87,11 @@ HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 
-commands=$@
+if [[ -n "${TEST_COMMAND:-}" ]]; then
+    commands="$TEST_COMMAND"
+else
+    commands="$@"
+fi
 echo "Commands:$commands"
 
 PARALLEL_JOB_COUNT=8
@@ -102,6 +106,7 @@ if [[ -z "$render_gid" ]]; then
 fi
 
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
+# TODO: @tjtanaa reenable to run VLLM_ROCM_USE_AITER=1 when AITER is shipped with prebuilt kernels.
 if [[ $commands == *"--shard-id="* ]]; then
   # assign job count as the number of shards used
   commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
@@ -118,7 +123,7 @@ if [[ $commands == *"--shard-id="* ]]; then
         --rm \
         -e MIOPEN_DEBUG_CONV_DIRECT=0 \
         -e MIOPEN_DEBUG_CONV_GEMM=0 \
-        -e VLLM_ROCM_USE_AITER=1 \
+        -e VLLM_ROCM_USE_AITER=0 \
         -e HIP_VISIBLE_DEVICES="${GPU}" \
         -e HF_TOKEN \
         -e AWS_ACCESS_KEY_ID \
@@ -153,7 +158,7 @@ else
           --rm \
           -e MIOPEN_DEBUG_CONV_DIRECT=0 \
           -e MIOPEN_DEBUG_CONV_GEMM=0 \
-          -e VLLM_ROCM_USE_AITER=1 \
+          -e VLLM_ROCM_USE_AITER=0 \
           -e HF_TOKEN \
           -e AWS_ACCESS_KEY_ID \
           -e AWS_SECRET_ACCESS_KEY \

@@ -0,0 +1,197 @@
+steps:
+
+- label: "Simple Unit Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_ROCM_USE_AITER=0
+    - "timeout 20m pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml"
+
+- label: "Diffusion Model Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "advanced_model and diffusion" --run-level "advanced_model"
+
+- label: "Diffusion Images API LoRA E2E"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
+
+- label: "Diffusion Model CPU offloading Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - |
+      timeout 20m bash -c '
+        set +e
+        pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+        EXIT1=\$?
+        pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
+        EXIT2=\$?
+        exit \$((EXIT1 | EXIT2))
+      '
+
+## ISSUE depends on `diffusers` package: https://github.com/huggingface/diffusers/issues/13274
+# - label: "Audio Generation Model Test"
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+
+- label: "Diffusion Cache Backend Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 15m pytest -s -v -m "core_model and cache and diffusion and not distributed_cuda and L4"
+
+- label: "Diffusion Sequence Parallelism Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
+
+# merge-only tests
+- label: "Diffusion Tensor Parallelism Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - pytest -s -v tests/e2e/offline_inference/test_zimage_parallelism.py
+
+- label: "Diffusion GPU Worker Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py
+
+- label: "Benchmark & Engine Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - |
+      timeout 20m bash -c '
+        set +e
+        pytest -s -v tests/benchmarks/test_serve_cli.py
+        EXIT1=\$?
+        pytest -s -v tests/engine/test_async_omni_engine_abort.py
+        EXIT2=\$?
+        exit \$((EXIT1 | EXIT2))
+      '
+
+- label: "Omni Model Test Qwen2-5-Omni"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
+
+- label: "Omni Model Test Qwen3-Omni"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
+
+- label: "Qwen3-TTS E2E Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py
+
+- label: "Diffusion Image Edit Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
+
+# split Bagel Model Test with H100 (Real Weights) into three tests
+- label: "Bagel Text2Img Model Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" -k "rocm"
+
+- label: "Bagel Img2Img Model Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model" -k "rocm"
+
+- label: "Bagel Online Serving Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - export VLLM_IMAGE_FETCH_TIMEOUT=60
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" -k "rocm"