diff --git a/.buildkite/bootstrap-amd-omni.sh b/.buildkite/bootstrap-amd-omni.sh
index a38b7622011..9e7021493c5 100755
--- a/.buildkite/bootstrap-amd-omni.sh
+++ b/.buildkite/bootstrap-amd-omni.sh
@@ -90,10 +90,18 @@ upload_pipeline() {
     FAIL_FAST=$(fail_fast)
 
     cd .buildkite
+
+    # Pick the test definition file from the branch being built:
+    # the merge suite runs on main, the ready suite on every other branch.
+    case "$BUILDKITE_BRANCH" in
+        main) TEST_YAML="test-amd-merge.yml" ;;
+        *) TEST_YAML="test-amd-ready.yaml" ;;
+    esac
+
     (
         set -x
         # Output pipeline.yaml with all blank lines removed
-        minijinja-cli test-template.j2 test-amd.yaml \
+        minijinja-cli test-template.j2 "$TEST_YAML" \
             -D branch="$BUILDKITE_BRANCH" \
             -D list_file_diff="$LIST_FILE_DIFF" \
             -D run_all="$RUN_ALL" \
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index f86b4b5d958..a06cf96bff2 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -87,7 +87,11 @@ HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
 
-commands=$@
+if [[ -n "${TEST_COMMAND:-}" ]]; then  # a non-empty TEST_COMMAND env var takes precedence
+    commands="$TEST_COMMAND"
+else  # otherwise use the positional arguments, joined into a single string
+    commands="$*"  # "$*" rather than "$@": the target is a scalar, not an array
+fi
 echo "Commands:$commands"
 
 PARALLEL_JOB_COUNT=8
@@ -102,6 +106,7 @@ if [[ -z "$render_gid" ]]; then
 fi
 
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
+# TODO(tjtanaa): re-enable VLLM_ROCM_USE_AITER=1 once AITER ships with prebuilt kernels.
 if [[ $commands == *"--shard-id="* ]]; then
   # assign job count as the number of shards used
   commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
@@ -118,7 +123,7 @@ if [[ $commands == *"--shard-id="* ]]; then
         --rm \
         -e MIOPEN_DEBUG_CONV_DIRECT=0 \
         -e MIOPEN_DEBUG_CONV_GEMM=0 \
-        -e VLLM_ROCM_USE_AITER=1 \
+        -e VLLM_ROCM_USE_AITER=0 \
         -e HIP_VISIBLE_DEVICES="${GPU}" \
         -e HF_TOKEN \
         -e AWS_ACCESS_KEY_ID \
@@ -153,7 +158,7 @@ else
           --rm \
           -e MIOPEN_DEBUG_CONV_DIRECT=0 \
           -e MIOPEN_DEBUG_CONV_GEMM=0 \
-          -e VLLM_ROCM_USE_AITER=1 \
+          -e VLLM_ROCM_USE_AITER=0 \
           -e HF_TOKEN \
           -e AWS_ACCESS_KEY_ID \
           -e AWS_SECRET_ACCESS_KEY \
diff --git a/.buildkite/test-amd-merge.yml b/.buildkite/test-amd-merge.yml
new file mode 100644
index 00000000000..78d910b2194
--- /dev/null
+++ b/.buildkite/test-amd-merge.yml
@@ -0,0 +1,197 @@
+steps:
+
+- label: "Simple Unit Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_ROCM_USE_AITER=0
+    - "timeout 20m pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml"
+
+- label: "Diffusion Model Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "advanced_model and diffusion" --run-level "advanced_model"
+
+- label: "Diffusion Images API LoRA E2E"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
+
+- label: "Diffusion Model CPU offloading Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - |
+      timeout 20m bash -c '
+        set +e
+        pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+        EXIT1=\$?
+        pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
+        EXIT2=\$?
+        exit \$((EXIT1 | EXIT2))
+      '
+
+# Disabled: depends on an open issue in the `diffusers` package: https://github.com/huggingface/diffusers/issues/13274
+# - label: "Audio Generation Model Test"
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+
+- label: "Diffusion Cache Backend Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 15m pytest -s -v -m "core_model and cache and diffusion and not distributed_cuda and L4"
+
+- label: "Diffusion Sequence Parallelism Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
+
+# merge-only tests
+- label: "Diffusion Tensor Parallelism Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - pytest -s -v tests/e2e/offline_inference/test_zimage_parallelism.py
+
+- label: "Diffusion GPU Worker Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py
+
+- label: "Benchmark & Engine Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - |
+      timeout 20m bash -c '
+        set +e
+        pytest -s -v tests/benchmarks/test_serve_cli.py
+        EXIT1=\$?
+        pytest -s -v tests/engine/test_async_omni_engine_abort.py
+        EXIT2=\$?
+        exit \$((EXIT1 | EXIT2))
+      '
+
+- label: "Omni Model Test Qwen2-5-Omni"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
+
+- label: "Omni Model Test Qwen3-Omni"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
+
+- label: "Qwen3-TTS E2E Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py
+
+- label: "Diffusion Image Edit Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
+
+# split Bagel Model Test with H100 (Real Weights) into three tests
+- label: "Bagel Text2Img Model Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:  # pytest keeps only the last -k it is given, so both filters are combined into one expression
+    - export GPU_ARCHS=gfx942
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory and rocm"
+
+- label: "Bagel Img2Img Model Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model" -k "rocm"
+
+- label: "Bagel Online Serving Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - export VLLM_IMAGE_FETCH_TIMEOUT=60
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model" -k "rocm"
diff --git a/.buildkite/test-amd-ready.yaml b/.buildkite/test-amd-ready.yaml
new file mode 100644
index 00000000000..b09fe2ebffa
--- /dev/null
+++ b/.buildkite/test-amd-ready.yaml
@@ -0,0 +1,178 @@
+steps:
+
+- label: "Simple Unit Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_ROCM_USE_AITER=0
+    - "timeout 20m pytest -v -s -m 'core_model and cpu' --cov=vllm_omni --cov-branch --cov-report=term-missing --cov-report=html --cov-report=xml"
+
+- label: "Diffusion Model Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_t2i_model.py -m "core_model and diffusion" --run-level "core_model"
+
+- label: "Diffusion Model CPU offloading Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - |
+      timeout 20m bash -c '
+        set +e
+        pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+        EXIT1=\$?
+        pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
+        EXIT2=\$?
+        exit \$((EXIT1 | EXIT2))
+      '
+
+# Disabled: depends on an open issue in the `diffusers` package: https://github.com/huggingface/diffusers/issues/13274
+# - label: "Audio Generation Model Test"
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - timeout 20m pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+
+- label: "Diffusion Cache Backend Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 15m pytest -s -v -m "core_model and cache and diffusion and not distributed_cuda and L4"
+
+- label: "Diffusion Sequence Parallelism Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py -m core_model
+
+- label: "Diffusion GPU Worker Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - timeout 20m pytest -s -v tests/diffusion/test_diffusion_worker.py
+
+- label: "Benchmark & Engine Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - |
+      timeout 30m bash -c '
+        set +e
+        pytest -s -v tests/benchmarks/test_serve_cli.py
+        EXIT1=\$?
+        pytest -s -v tests/engine/test_async_omni_engine_abort.py
+        EXIT2=\$?
+        exit \$((EXIT1 | EXIT2))
+      '
+
+- label: "Omni Model Test Qwen2-5-Omni"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 17m pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
+
+- label: "Omni Model Test Qwen3-Omni"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - timeout 10m pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
+    - timeout 10m pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model"
+
+- label: "Qwen3-TTS E2E Test"
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py
+
+- label: "Diffusion Image Edit Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - timeout 20m pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
+
+- label: "Bagel Text2Img Model Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "core_model" --run-level "core_model" -k "rocm"
+
+- label: "Bagel Img2Img Model Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - timeout 30m pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "core_model" --run-level "core_model" -k "rocm"
+
+- label: "Bagel Online Serving Test"
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+    - export VLLM_IMAGE_FETCH_TIMEOUT=60
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - timeout 40m pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "core_model" --run-level "core_model" -k "rocm"
diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2
index 291ed0a9ade..4612f8ccd5b 100644
--- a/.buildkite/test-template-amd-omni.j2
+++ b/.buildkite/test-template-amd-omni.j2
@@ -40,9 +40,14 @@
           {% else %}
           queue: amd_mi325_1
           {% endif %}
-        command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" && ")) | safe }}"
+{% set cmd_body = (step.command or (step.commands | join("\n"))) | trim %}{# commands are newline-joined, so TEST_COMMAND starts with "set -e" to keep the fail-fast behaviour of an " && " chain #}
+{% set indented_cmd = cmd_body | replace("\n", "\n            ") %}{# re-indent continuation lines so they stay inside the TEST_COMMAND block scalar #}
+        command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh
         env:
           DOCKER_BUILDKIT: "1"
+          TEST_COMMAND: |-
+            set -e; (command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe }}
+            {{ indented_cmd | safe }}
         priority: 100
         {% if step.grade and step.grade == "Blocking" %}
         soft_fail: false
diff --git a/tests/benchmarks/test_serve_cli.py b/tests/benchmarks/test_serve_cli.py
index 8e9a3bfce81..ee371330bc1 100644
--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -12,6 +12,8 @@
 
 if current_omni_platform.is_xpu():
     stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "xpu" / "qwen2_5_omni_ci.yaml")]
+elif current_omni_platform.is_rocm():
+    stage_configs = [str(Path(__file__).parent.parent / "e2e" / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")]
 
 # Create parameter combinations for model and stage config
 test_params = [
@@ -21,7 +23,7 @@
 
 @pytest.mark.core_model
 @pytest.mark.benchmark
-@hardware_test(res={"cuda": "L4", "xpu": "B60"}, num_cards=3)
+@hardware_test(res={"cuda": "L4", "xpu": "B60", "rocm": "MI325"}, num_cards=3)
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 def test_bench_serve_chat(omni_server):
     command = [
diff --git a/tests/e2e/offline_inference/test_bagel_img2img.py b/tests/e2e/offline_inference/test_bagel_img2img.py
index 8c734c6a250..c7df4f91bed 100644
--- a/tests/e2e/offline_inference/test_bagel_img2img.py
+++ b/tests/e2e/offline_inference/test_bagel_img2img.py
@@ -5,7 +5,7 @@
 End-to-end test for Bagel img2img generation.
 
 This test validates that the Bagel model generates images from an input image
-and text prompt that match expected reference pixel values within a ±5 tolerance.
+and text prompt that match expected reference pixel values within a ±10 tolerance.
 
 Equivalent to running:
     python3 examples/offline_inference/bagel/end2end.py \
@@ -25,6 +25,7 @@
 from tests.conftest import modify_stage_config
 from tests.utils import hardware_test
 from vllm_omni.entrypoints.omni import Omni
+from vllm_omni.platforms import current_omni_platform
 
 # Reference pixel data extracted from the known-good output image
 # Generated with seed=52, num_inference_steps=15,
@@ -43,7 +44,21 @@
     {"position": (256, 256), "rgb": (181, 202, 222)},
 ]
 
-PIXEL_TOLERANCE = 5
+if current_omni_platform.is_rocm():
+    REFERENCE_PIXELS = [
+        {"position": (100, 100), "rgb": (156, 172, 215)},
+        {"position": (400, 50), "rgb": (106, 144, 216)},
+        {"position": (700, 100), "rgb": (118, 158, 231)},
+        {"position": (150, 400), "rgb": (183, 23, 48)},
+        {"position": (512, 336), "rgb": (218, 215, 191)},
+        {"position": (700, 400), "rgb": (194, 14, 42)},
+        {"position": (100, 600), "rgb": (105, 10, 16)},
+        {"position": (400, 600), "rgb": (167, 33, 46)},
+        {"position": (700, 600), "rgb": (102, 86, 92)},
+        {"position": (256, 256), "rgb": (181, 201, 220)},
+    ]
+
+PIXEL_TOLERANCE = 10
 
 DEFAULT_PROMPT = "<|fim_middle|><|im_start|>Change the grass color to red<|im_end|>"
 
@@ -191,7 +206,7 @@ def _resolve_stage_config(config_path: str, run_level: str) -> str:
 @pytest.mark.core_model
 @pytest.mark.advanced_model
 @pytest.mark.diffusion
-@hardware_test(res={"cuda": "H100"})
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 def test_bagel_img2img_shared_memory_connector(run_level):
     """Test Bagel img2img with shared memory connector."""
     input_image = _load_input_image()
diff --git a/tests/e2e/offline_inference/test_bagel_text2img.py b/tests/e2e/offline_inference/test_bagel_text2img.py
index ed369aedd92..505e12438d0 100644
--- a/tests/e2e/offline_inference/test_bagel_text2img.py
+++ b/tests/e2e/offline_inference/test_bagel_text2img.py
@@ -31,6 +31,7 @@
 from tests.conftest import modify_stage_config
 from tests.utils import hardware_test
 from vllm_omni.entrypoints.omni import Omni
+from vllm_omni.platforms import current_omni_platform
 
 # Reference pixel data extracted from the known-good output image
 # Each entry contains (x, y) position and expected (R, G, B) values
@@ -49,6 +50,20 @@
     {"position": (256, 256), "rgb": (171, 160, 153)},
 ]
 
+if current_omni_platform.is_rocm():
+    REFERENCE_PIXELS = [
+        {"position": (100, 100), "rgb": (123, 119, 100)},
+        {"position": (400, 50), "rgb": (162, 161, 142)},
+        {"position": (700, 100), "rgb": (171, 156, 127)},
+        {"position": (150, 400), "rgb": (131, 128, 112)},
+        {"position": (512, 512), "rgb": (134, 61, 59)},
+        {"position": (700, 400), "rgb": (204, 107, 43)},
+        {"position": (100, 700), "rgb": (201, 180, 165)},
+        {"position": (400, 700), "rgb": (140, 108, 87)},
+        {"position": (700, 700), "rgb": (247, 205, 145)},
+        {"position": (256, 256), "rgb": (171, 160, 153)},
+    ]
+
 # Maximum allowed difference per color channel
 PIXEL_TOLERANCE = 5
 
@@ -181,7 +196,7 @@ def _resolve_stage_config(config_path: str, run_level: str) -> str:
 @pytest.mark.core_model
 @pytest.mark.advanced_model
 @pytest.mark.diffusion
-@hardware_test(res={"cuda": "H100"})
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 def test_bagel_text2img_shared_memory_connector(run_level):
     """Test Bagel text2img with shared memory connector."""
     config_path = str(Path(__file__).parent / "stage_configs" / "bagel_sharedmemory_ci.yaml")
diff --git a/tests/e2e/online_serving/test_bagel_online.py b/tests/e2e/online_serving/test_bagel_online.py
index a5e26db1ea1..62e79906c03 100644
--- a/tests/e2e/online_serving/test_bagel_online.py
+++ b/tests/e2e/online_serving/test_bagel_online.py
@@ -97,7 +97,7 @@ def test_bagel_text2img_online(omni_server, openai_client) -> None:
 @pytest.mark.core_model
 @pytest.mark.advanced_model
 @pytest.mark.diffusion
-@hardware_test(res={"cuda": "H100"})
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"})
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 def test_bagel_img2img_online(omni_server, openai_client) -> None:
     """Test Bagel img2img via OpenAI-compatible chat completions API."""
diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py
index ef4d40198f0..fcda20ba388 100644
--- a/tests/e2e/online_serving/test_qwen3_omni.py
+++ b/tests/e2e/online_serving/test_qwen3_omni.py
@@ -44,13 +44,9 @@ def get_chunk_config():
     return path
 
 
-# CI stage config for 2xH100-80G GPUs or AMD GPU MI325
-if current_omni_platform.is_rocm():
-    # ROCm stage config optimized for MI325 GPU
-    stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "rocm" / "qwen3_omni_ci.yaml")]
-elif current_omni_platform.is_xpu():
+if current_omni_platform.is_xpu():
     stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "xpu" / "qwen3_omni_ci.yaml")]
-else:
+else:  # MI325 GPU should share the same config as H100
     stage_configs = [get_chunk_config()]
 
 # Create parameter combinations for model and stage config
diff --git a/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml b/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml
index 9fa5930c8ec..b95afd22394 100644
--- a/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml
+++ b/tests/e2e/stage_configs/rocm/qwen2_5_omni_ci.yaml
@@ -13,8 +13,8 @@ stage_args:
       model_arch: Qwen2_5OmniForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      max_model_len: 2400
-      max_num_batched_tokens: 2400
+      max_model_len: 16384
+      max_num_batched_tokens: 16384
       max_num_seqs: 1
       gpu_memory_utilization: 0.8
       skip_mm_profiling: true
@@ -44,8 +44,8 @@ stage_args:
       model_arch: Qwen2_5OmniForConditionalGeneration
       worker_type: ar
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      max_model_len: 2400
-      max_num_batched_tokens: 2400
+      max_model_len: 16384
+      max_num_batched_tokens: 16384
       max_num_seqs: 1
       gpu_memory_utilization: 0.8
       skip_mm_profiling: true
@@ -59,7 +59,7 @@ stage_args:
       temperature: 0.9
       top_p: 0.8
       top_k: 40
-      max_tokens: 128
+      max_tokens: 4096
       seed: 42
       detokenize: True
       repetition_penalty: 1.05
@@ -79,6 +79,8 @@ stage_args:
       trust_remote_code: true
       enable_prefix_caching: false
       engine_output_type: audio
+      max_num_batched_tokens: 4096
+      max_model_len: 4096
     engine_input_source: [1]
     final_output: true
     final_output_type: audio
@@ -86,7 +88,7 @@ stage_args:
       temperature: 0.0
       top_p: 1.0
       top_k: -1
-      max_tokens: 128
+      max_tokens: 4096
       seed: 42
       detokenize: True
       repetition_penalty: 1.1