diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml
index 60ba0d9d41..b6f2037d18 100644
--- a/.buildkite/test-amd-merge.yml
+++ b/.buildkite/test-amd-merge.yml
@@ -32,7 +32,6 @@ steps:
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - export GPU_ARCHS=gfx942
    - export VLLM_LOGGING_LEVEL=DEBUG
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - |
@@ -63,13 +62,12 @@ steps:
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - export GPU_ARCHS=gfx942
    - export VLLM_LOGGING_LEVEL=DEBUG
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - timeout 15m pytest -s -v -m "core_model and cache and diffusion and not distributed_cuda and L4"

-- label: "Diffusion Sequence Parallelism Test"
-  agent_pool: mi325_2
+- label: "Diffusion Sequence Parallelism Test (Needs 4 GPUs)"
+  agent_pool: mi325_4
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
@@ -77,6 +75,7 @@ steps:
    - export VLLM_LOGGING_LEVEL=DEBUG
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
+    - timeout 20m pytest -s -v tests/diffusion/distributed/test_ulysses_uaa_perf.py

 # merge-only tests
 - label: "Diffusion Tensor Parallelism Test"
@@ -95,22 +94,14 @@ steps:
   commands:
    - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py

-- label: "Benchmark & Engine Test"
-  agent_pool: mi325_2
+- label: "Engine Test"
+  agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - |
-      timeout 20m bash -c '
-      set +e
-      pytest -s -v tests/benchmarks/test_serve_cli.py
-      EXIT1=\$?
-      pytest -s -v tests/engine/test_async_omni_engine_abort.py
-      EXIT2=\$?
-      exit \$((EXIT1 | EXIT2))
-      '
+    - timeout 20m pytest -s -v tests/engine/test_async_omni_engine_abort.py

 - label: "Omni Model Test Qwen2-5-Omni"
   agent_pool: mi325_2
@@ -121,6 +112,7 @@ steps:
    - export VLLM_LOGGING_LEVEL=DEBUG
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model"

 - label: "Omni Model Test Qwen3-Omni"
   agent_pool: mi325_2
@@ -131,11 +123,10 @@ steps:
    - export VLLM_LOGGING_LEVEL=DEBUG
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
-    - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
+    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model"

 - label: "Qwen3-TTS CustomVoice E2E Test"
-  agent_pool: mi325_2
+  agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
@@ -145,21 +136,21 @@ steps:
      export VLLM_LOGGING_LEVEL=DEBUG
      export VLLM_WORKER_MULTIPROC_METHOD=spawn
      export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-      pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_customvoice.py
+      pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py tests/e2e/offline_inference/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model"
      '

 - label: "Qwen3-TTS Base E2E Test"
-  agent_pool: mi325_2
+  agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
    - |
-      timeout 20m bash -c '
+      timeout 30m bash -c '
      export VLLM_LOGGING_LEVEL=DEBUG
      export VLLM_WORKER_MULTIPROC_METHOD=spawn
      export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-      pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_base.py
+      pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py tests/e2e/offline_inference/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model"
      '

 - label: "Diffusion Image Edit Test"
@@ -173,43 +164,58 @@ steps:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py

-# split Bagel Model Test with H100 (Real Weights) into three tests
-- label: "Bagel Text2Img Model Test"
-  agent_pool: mi325_1
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
-    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" -k "rocm"
+# TODO: Bagel test on ROCm is very unstable. @tjtanaa
+# Need to debug before re-enabling: numerical results change across large PRs.
+# # split Bagel Model Test with H100 (Real Weights) into three tests
+# - label: "Bagel Text2Img Model Test (1/3)"
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export VLLM_ROCM_USE_AITER_RMSNORM=0
+#     - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory" -k "rocm"

-- label: "Bagel Img2Img Model Test"
-  agent_pool: mi325_1
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
-    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model" -k "rocm"
+# - label: "Bagel Img2Img Model Test (2/3)"
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export VLLM_ROCM_USE_AITER_RMSNORM=0
+#     - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model" -k "rocm"
+
+# - label: "Bagel Online Serving Test (3/3)"
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+#     - export VLLM_IMAGE_FETCH_TIMEOUT=60
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export VLLM_ROCM_USE_AITER_RMSNORM=0
+#     - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" -k "rocm"

-- label: "Bagel Online Serving Test"
+- label: "Voxtral-TTS E2E Test"
   agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    - export VLLM_IMAGE_FETCH_TIMEOUT=60
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
-    - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" -k "rocm"
+    - |
+      timeout 20m bash -c '
+      export VLLM_LOGGING_LEVEL=DEBUG
+      export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
+      '
diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml
index 6e31163acc..ced91635c2 100644
--- a/.buildkite/test-amd-ready.yaml
+++ b/.buildkite/test-amd-ready.yaml
@@ -9,13 +9,37 @@ steps:
    - export VLLM_ROCM_USE_AITER=0
    - "timeout 20m pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml"

+- label: "Voxtral TTS CUDA Unit Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - timeout 10m pytest -s -v tests/model_executor/models/voxtral_tts/test_cuda_graph_acoustic_transformer.py
+
 - label: "Diffusion Model Test"
-  agent_pool: mi325_2
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model"
+
+- label: "Diffusion Batching Test"
+  agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model"
+    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen_image_diffusion_batching.py -m "core_model and diffusion" --run-level "core_model"
+
+- label: "Custom Pipeline Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - timeout 20m pytest -s -v tests/e2e/offline_inference/custom_pipeline/ -m "core_model"

 - label: "Diffusion Model CPU offloading Test"
   agent_pool: mi325_1
@@ -23,7 +47,6 @@ steps:
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - export GPU_ARCHS=gfx942
    - export VLLM_LOGGING_LEVEL=DEBUG
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - |
@@ -77,47 +100,58 @@ steps:
   commands:
    - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py

-- label: "Benchmark & Engine Test"
-  agent_pool: mi325_2
+- label: "Engine Test"
+  agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - |
-      timeout 30m bash -c '
-      set +e
-      pytest -s -v tests/benchmarks/test_serve_cli.py
-      EXIT1=\$?
-      pytest -s -v tests/engine/test_async_omni_engine_abort.py
-      EXIT2=\$?
-      exit \$((EXIT1 | EXIT2))
+      timeout 15m bash -c '
+      pytest -s -v tests/engine/test_async_omni_engine_abort.py
      '

-- label: "Omni Model Test Qwen2-5-Omni"
-  agent_pool: mi325_2
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - timeout 17m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py

-- label: "Omni Model Test Qwen3-Omni"
-  agent_pool: mi325_2
+# NOTE: This test does not run anything: it is skipped and deselected.
+# Current result: 1 skipped, 1 deselected, 17 warnings in 0.03s
+# - label: "Omni Model Test Qwen2-5-Omni"
+#   agent_pool: mi325_2
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py -m "core_model" --run-level "core_model"
+
+# - label: "Omni Model Test Qwen3-Omni"
+#   agent_pool: mi325_2
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+#     - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
+#     - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model"
+
+- label: "MiMo-Audio E2E Test"
+  agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
-    - timeout 10m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model"
+    - |
+      timeout 30m bash -c '
+      export VLLM_LOGGING_LEVEL=DEBUG
+      export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      pytest -s -v tests/e2e/online_serving/test_mimo_audio.py -m "core_model" --run-level "core_model"
+      '

 - label: "Qwen3-TTS E2E Test"
-  agent_pool: mi325_2
+  agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
@@ -125,55 +159,82 @@ steps:
    - export VLLM_LOGGING_LEVEL=DEBUG
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
    - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-    - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "core_model" --run-level "core_model"
+    - timeout 30m pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "core_model" --run-level "core_model"

-- label: "Diffusion Image Edit Test"
+- label: "Voxtral-TTS E2E Test"
   agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
+    - |
+      timeout 20m bash -c '
+      export VLLM_LOGGING_LEVEL=DEBUG
+      export VLLM_WORKER_MULTIPROC_METHOD=spawn
+      pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
+      pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
+      '

-- label: "Bagel Text2Img Model Test"
+- label: "Diffusion Image Edit Test"
   agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
    - export GPU_ARCHS=gfx942
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
    - export VLLM_LOGGING_LEVEL=DEBUG
    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
-    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model" -k "rocm"
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py

-- label: "Bagel Img2Img Model Test"
-  agent_pool: mi325_1
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
-    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model" -k "rocm"
+# TODO: Bagel test on ROCm is very unstable. @tjtanaa
+# Need to debug before re-enabling: numerical results change across large PRs.
+# - label: "Bagel Text2Img Model Test"
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export VLLM_ROCM_USE_AITER_RMSNORM=0
+#     - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model" -k "rocm"
+
+# - label: "Bagel Img2Img Model Test"
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export VLLM_ROCM_USE_AITER_RMSNORM=0
+#     - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model" -k "rocm"

-- label: "Bagel Online Serving Test"
+# - label: "Bagel Online Serving Test"
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+#     - export VLLM_IMAGE_FETCH_TIMEOUT=60
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export VLLM_ROCM_USE_AITER_RMSNORM=0
+#     - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model" -k "rocm"
+
+- label: "CosyVoice3-TTS E2E Test"
   agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    - export VLLM_IMAGE_FETCH_TIMEOUT=60
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
-    - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model" -k "rocm"
+    - |
+      timeout 20m bash -c '
+      pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "core_model" --run-level "core_model"
+      '
diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2
index 8dc91a1172..f4c386a5fe 100644
--- a/.buildkite/test-template-amd-omni.j2
+++ b/.buildkite/test-template-amd-omni.j2
@@ -48,6 +48,9 @@
          DOCKER_BUILDKIT: "1"
      TEST_COMMAND: |-
        (command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe }}
+{% if "mi250" in step.agent_pool %}
+        python3 -m pip uninstall -y amd-aiter
+{% endif %}
        {{ indented_cmd | safe }}
      priority: 100
 {% if step.grade and step.grade == "Blocking" %}
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index bfbb060bcb..ce3f0aa3b5 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -39,6 +39,24 @@ RUN if [ "${USE_NIGHTLY_BUILD}" = "1" ]; then \
 # Step 3: Copy vllm-omni code and install without uv
 RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni
 COPY . ${COMMON_WORKDIR}/vllm-omni
+
+# This is a workaround to ensure pytest exits with the correct status code in CI tests.
+RUN printf '%s\n' \
+    'import os' \
+    '' \
+    '_exit_code = 1' \
+    '' \
+    'def pytest_sessionfinish(session, exitstatus):' \
+    '    global _exit_code' \
+    '    _exit_code = int(exitstatus)' \
+    '' \
+    'def pytest_unconfigure(config):' \
+    '    import sys' \
+    '    sys.stdout.flush()' \
+    '    sys.stderr.flush()' \
+    '    os._exit(_exit_code)' \
+    > ${COMMON_WORKDIR}/vllm-omni/conftest.py
+
 RUN cd ${COMMON_WORKDIR}/vllm-omni && uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]" --no-build-isolation
 RUN ln -sf /usr/bin/python3 /usr/bin/python
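For readability, here is the `conftest.py` that the `printf` above writes, rendered as a plain Python file (identical to the quoted arguments, with two explanatory comments added here). `pytest_sessionfinish` records pytest's exit status once the session ends, and `pytest_unconfigure` flushes stdio and hard-exits with that status via `os._exit`, so lingering worker processes or non-daemon threads cannot stall interpreter shutdown or mask the result; the `_exit_code = 1` default means a session that dies before finishing still reports failure.

```python
import os

_exit_code = 1


def pytest_sessionfinish(session, exitstatus):
    # Capture the real pytest exit status before shutdown begins.
    global _exit_code
    _exit_code = int(exitstatus)


def pytest_unconfigure(config):
    # Flush output, then exit immediately so nothing can override the status.
    import sys

    sys.stdout.flush()
    sys.stderr.flush()
    os._exit(_exit_code)
```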
diff --git a/tests/e2e/offline_inference/test_t2i_model.py b/tests/e2e/offline_inference/test_t2i_model.py
index 77b2b3aaf2..55a154f61b 100644
--- a/tests/e2e/offline_inference/test_t2i_model.py
+++ b/tests/e2e/offline_inference/test_t2i_model.py
@@ -26,17 +26,12 @@
 # TODO: When NPU support is ready, remove this branch.
 if current_omni_platform.is_npu():
     models = ["Tongyi-MAI/Z-Image-Turbo", "Qwen/Qwen-Image"]
-elif current_omni_platform.is_rocm():
-    # TODO: When ROCm support is ready, remove this branch.
-    # Current upstream vLLM has issues running riverclouds/qwen_image_random
-    # on ROCm
-    models = ["Tongyi-MAI/Z-Image-Turbo"]


 @pytest.mark.core_model
 @pytest.mark.advanced_model
 @pytest.mark.diffusion
-@hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 1, "rocm": 2, "xpu": 2})
+@hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 1, "rocm": 1, "xpu": 2})
 @pytest.mark.parametrize("model_name", models)
 def test_diffusion_model(model_name: str, run_level):
     if run_level == "core_model" and model_name != "riverclouds/qwen_image_random":
diff --git a/tests/e2e/offline_inference/test_zimage_parallelism.py b/tests/e2e/offline_inference/test_zimage_parallelism.py
index 9d9db16a40..b685704ae4 100644
--- a/tests/e2e/offline_inference/test_zimage_parallelism.py
+++ b/tests/e2e/offline_inference/test_zimage_parallelism.py
@@ -159,8 +159,8 @@ def _run_zimage_generate(
 @pytest.mark.parallel
 @hardware_test(res={"cuda": "L4", "rocm": "MI325"}, num_cards={"cuda": 4, "rocm": 2})
 def test_zimage_tensor_parallel_tp2(tmp_path: Path):
-    if current_omni_platform.is_npu() or current_omni_platform.is_rocm():
-        pytest.skip("Z-Image TP e2e test is only supported on CUDA for now.")
+    if current_omni_platform.is_npu():
+        pytest.skip("Z-Image TP e2e test is only supported on CUDA and ROCm for now.")

     if not current_omni_platform.is_available() or current_omni_platform.device_count() < 2:
         pytest.skip("Z-Image TP=2 requires >= 2 devices.")
@@ -211,7 +211,9 @@ def test_zimage_tensor_parallel_tp2(tmp_path: Path):
     )

     print(f"Z-Image TP perf (lower is better): tp1_time_s={tp1_time_s:.6f}, tp2_time_s={tp2_time_s:.6f}")
-    assert tp2_time_s < tp1_time_s, f"Expected TP=2 to be faster than TP=1 (tp1={tp1_time_s}, tp2={tp2_time_s})"
+    # ROCm is not yet optimized; TP=2 can be slower than TP=1.
+    if not current_omni_platform.is_rocm():
+        assert tp2_time_s < tp1_time_s, f"Expected TP=2 to be faster than TP=1 (tp1={tp1_time_s}, tp2={tp2_time_s})"

     print(f"Z-Image TP peak memory (MB): tp1_peak_mem={tp1_peak_mem:.2f}, tp2_peak_mem={tp2_peak_mem:.2f}")
     assert tp2_peak_mem < tp1_peak_mem, (
@@ -221,8 +223,8 @@ def test_zimage_tensor_parallel_tp2(tmp_path: Path):

 @pytest.mark.integration
 def test_zimage_vae_patch_parallel_tp2(tmp_path: Path):
-    if current_omni_platform.is_npu() or current_omni_platform.is_rocm():
-        pytest.skip("Z-Image VAE patch parallel e2e test is only supported on CUDA for now.")
+    if current_omni_platform.is_npu():
+        pytest.skip("Z-Image VAE patch parallel e2e test is only supported on CUDA and ROCm for now.")

     if not current_omni_platform.is_available() or current_omni_platform.device_count() < 2:
         pytest.skip("Z-Image VAE patch parallel TP=2 requires >= 2 devices.")
diff --git a/vllm_omni/engine/stage_init_utils.py b/vllm_omni/engine/stage_init_utils.py
index e6f603d2a9..277da09602 100644
--- a/vllm_omni/engine/stage_init_utils.py
+++ b/vllm_omni/engine/stage_init_utils.py
@@ -138,6 +138,21 @@ def extract_stage_metadata(stage_config: Any) -> StageMetadata:
     stage_id: int = stage_config.stage_id
     stage_type: Literal["llm", "diffusion"] = getattr(stage_config, "stage_type", "llm")
     engine_args = stage_config.engine_args
+
+    if current_omni_platform.is_rocm():
+        if engine_args.get("attention_backend") is None:
+            from vllm._aiter_ops import rocm_aiter_ops
+
+            if rocm_aiter_ops.is_enabled():
+                engine_args["attention_backend"] = "ROCM_AITER_FA"
+            else:
+                # Before vLLM v0.19.0, the default attention backend is TRITON_ATTN for ROCm.
+                # Since vLLM v0.19.0, the default attention backend is ROCM_ATTN for ROCm.
+                # However, the compatibility of ROCM_ATTN with Omni is not guaranteed.
+                # Therefore, we still use TRITON_ATTN as the default attention backend
+                # when the selected_backend is not specified.
+                engine_args["attention_backend"] = "TRITON_ATTN"
+
     runtime_cfg = getattr(stage_config, "runtime", {})
     engine_input_source: list[int] = getattr(stage_config, "engine_input_source", [])
     final_output: bool = getattr(stage_config, "final_output", false)
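The hunk above encodes a small decision rule; here is a minimal self-contained sketch of it for reference. The helper name and the `aiter_enabled` parameter are ours for illustration (the real code calls `rocm_aiter_ops.is_enabled()` and mutates `engine_args` in place):

```python
def resolve_attention_backend(engine_args: dict, is_rocm: bool, aiter_enabled: bool):
    """Mirror of the ROCm default-backend rule in extract_stage_metadata."""
    if not is_rocm or engine_args.get("attention_backend") is not None:
        # Non-ROCm platforms and explicit user choices are left untouched.
        return engine_args.get("attention_backend")
    # ROCm with no explicit choice: prefer the AITER flash-attention backend
    # when AITER is enabled; otherwise fall back to TRITON_ATTN, since
    # ROCM_ATTN (vLLM's default since v0.19.0) is not validated with Omni.
    return "ROCM_AITER_FA" if aiter_enabled else "TRITON_ATTN"


assert resolve_attention_backend({}, is_rocm=True, aiter_enabled=True) == "ROCM_AITER_FA"
assert resolve_attention_backend({}, is_rocm=True, aiter_enabled=False) == "TRITON_ATTN"
assert resolve_attention_backend({"attention_backend": "ROCM_ATTN"}, is_rocm=True, aiter_enabled=False) == "ROCM_ATTN"
```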
diff --git a/vllm_omni/platforms/rocm/platform.py b/vllm_omni/platforms/rocm/platform.py
index 4479e54f2a..7b0e09c128 100644
--- a/vllm_omni/platforms/rocm/platform.py
+++ b/vllm_omni/platforms/rocm/platform.py
@@ -16,6 +16,34 @@ class RocmOmniPlatform(OmniPlatform, RocmPlatform):

     Inherits all ROCm-specific implementations from vLLM's RocmPlatform,
     and adds Omni-specific interfaces from OmniPlatform.
+
+
+    NOTE: AR Attention Backend Overriding Logic
+    -------------------------------------------
+    Since vLLM v0.19.0, the default attention backend is ROCM_ATTN for ROCm.
+    However, the compatibility of ROCM_ATTN with Omni is not guaranteed.
+    Therefore, we still use TRITON_ATTN as the default attention backend
+    when the selected_backend is not specified.
+
+    The attention backend overriding logic currently lives in
+    extract_stage_metadata in `vllm_omni/engine/stage_init_utils.py`:
+
+    ```
+    if current_omni_platform.is_rocm():
+        if engine_args.get("attention_backend") is None:
+            from vllm._aiter_ops import rocm_aiter_ops
+
+            if rocm_aiter_ops.is_enabled():
+                engine_args["attention_backend"] = "ROCM_AITER_FA"
+            else:
+                # Before vLLM v0.19.0, the default attention backend is TRITON_ATTN for ROCm.
+                # Since vLLM v0.19.0, the default attention backend is ROCM_ATTN for ROCm.
+                # However, the compatibility of ROCM_ATTN with Omni is not guaranteed.
+                # Therefore, we still use TRITON_ATTN as the default attention backend
+                # when the selected_backend is not specified.
+                engine_args["attention_backend"] = "TRITON_ATTN"
+    ```
+
     """

     _omni_enum = OmniPlatformEnum.ROCM
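Worth noting: both the docstring and the implementation only override when `attention_backend` is absent, so a stage can always pin a backend explicitly in its `engine_args` and bypass the default selection entirely. A hypothetical example:

```python
# Hypothetical stage engine_args: an explicit value makes the ROCm
# default-selection branch in extract_stage_metadata a no-op.
engine_args = {"attention_backend": "ROCM_AITER_FA"}
assert engine_args.get("attention_backend") is not None  # override is skipped
```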