diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 5f7440a805c..f86b4b5d958 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -116,6 +116,9 @@ if [[ $commands == *"--shard-id="* ]]; then
         --shm-size=16gb \
         --group-add "$render_gid" \
         --rm \
+        -e MIOPEN_DEBUG_CONV_DIRECT=0 \
+        -e MIOPEN_DEBUG_CONV_GEMM=0 \
+        -e VLLM_ROCM_USE_AITER=1 \
         -e HIP_VISIBLE_DEVICES="${GPU}" \
         -e HF_TOKEN \
         -e AWS_ACCESS_KEY_ID \
@@ -148,6 +151,9 @@ else
           --shm-size=16gb \
           --group-add "$render_gid" \
           --rm \
+          -e MIOPEN_DEBUG_CONV_DIRECT=0 \
+          -e MIOPEN_DEBUG_CONV_GEMM=0 \
+          -e VLLM_ROCM_USE_AITER=1 \
           -e HF_TOKEN \
           -e AWS_ACCESS_KEY_ID \
           -e AWS_SECRET_ACCESS_KEY \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 08a3f7a1c34..930ef7a409e 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -8,14 +8,32 @@ steps:
   grade: Blocking
   commands:
     - export GPU_ARCHS=gfx942
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - export VLLM_ROCM_USE_AITER_MHA=1
-    - export VLLM_ROCM_USE_AITER_LINEAR=0
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
 
+- label: "Diffusion Images API LoRA E2E"
+  timeout_in_minutes: 20
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
+
+- label: "Diffusion Model CPU Offloading Test"
+  timeout_in_minutes: 20
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+
 - label: "Diffusion Cache Backend Test"
   timeout_in_minutes: 15
   agent_pool: mi325_1
@@ -26,25 +44,30 @@ steps:
     - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - export VLLM_ROCM_USE_AITER_MHA=1
-    - export VLLM_ROCM_USE_AITER_LINEAR=0
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
 
-- label: "Diffusion Parallelism Test"
-  timeout_in_minutes: 15
+- label: "Diffusion Sequence Parallelism Test"
+  timeout_in_minutes: 20
   agent_pool: mi325_2
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
 
+- label: "Diffusion Tensor Parallelism Test"
+  timeout_in_minutes: 20
+  agent_pool: mi325_2
+  depends_on: amd-build
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
+
 - label: "Diffusion GPU Worker Test"
   timeout_in_minutes: 20
   agent_pool: mi325_2
@@ -52,8 +75,6 @@ steps:
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
     - pytest -s -v tests/diffusion/test_diffusion_worker.py
 
 - label: "Omni Model Test Qwen2-5-Omni"
@@ -66,12 +87,6 @@ steps:
     - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - export VLLM_ROCM_USE_AITER_MHA=1
-    - export VLLM_ROCM_USE_AITER_LINEAR=0
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
 
 - label: "Omni Model Test Qwen3-Omni"
@@ -83,9 +98,10 @@ steps:
   commands:
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py
+    - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+    - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
+    - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
+    - pytest -s -v tests/e2e/online_serving/test_async_omni.py
 
 - label: "Diffusion Image Edit Test"
   timeout_in_minutes: 15
@@ -97,10 +113,4 @@ steps:
     - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - export VLLM_ROCM_USE_AITER_MHA=1
-    - export VLLM_ROCM_USE_AITER_LINEAR=0
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/online_serving/test_i2i_multi_image_input.py