From 664428c6de82689006d65cfaf0eb46e457680514 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 28 Jan 2026 10:48:34 +0000
Subject: [PATCH 1/4] Enable more AMD CI tests and un-skip Qwen3-Omni online tests on ROCm

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml                    | 77 +++++++++++++++++----
 tests/e2e/online_serving/test_qwen3_omni.py |  2 -
 2 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 86d65f15bcf..a570182466c 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -11,11 +11,47 @@ steps:
     - export MIOPEN_DEBUG_CONV_DIRECT=0
     - export MIOPEN_DEBUG_CONV_GEMM=0
     - export VLLM_ROCM_USE_AITER=1
-    - export VLLM_ROCM_USE_AITER_MHA=1
-    - export VLLM_ROCM_USE_AITER_LINEAR=0
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
 
+- label: "Diffusion Images API LoRA E2E"
+  timeout_in_minutes: 20
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
+    - export VLLM_ROCM_USE_AITER=1
+    - pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
+
+- label: "Diffusion Model CPU offloading Test"
+  timeout_in_minutes: 20
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
+    - export VLLM_ROCM_USE_AITER=1
+    - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+  
+- label: "Audio Generation Model Test"
+  timeout_in_minutes: 20
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+
 - label: "Diffusion Cache Backend Test"
   timeout_in_minutes: 15
   agent_pool: mi325_1
@@ -29,22 +65,36 @@ steps:
     - export MIOPEN_DEBUG_CONV_DIRECT=0
     - export MIOPEN_DEBUG_CONV_GEMM=0
     - export VLLM_ROCM_USE_AITER=1
-    - export VLLM_ROCM_USE_AITER_MHA=1
-    - export VLLM_ROCM_USE_AITER_LINEAR=0
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
 
-- label: "Diffusion Parallelism Test"
-  timeout_in_minutes: 15
+- label: "Diffusion Sequence Parallelism Test"
+  timeout_in_minutes: 20
   agent_pool: mi325_2
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - export MIOPEN_DEBUG_CONV_DIRECT=0
     - export MIOPEN_DEBUG_CONV_GEMM=0
+    - export VLLM_ROCM_USE_AITER=1
     - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
 
+- label: "Diffusion Tensor Parallelism Test"
+  timeout_in_minutes: 20
+  agent_pool: mi325_2
+  depends_on: amd-build
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
+    - export VLLM_ROCM_USE_AITER=1
+    - pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
+
 - label: "Diffusion GPU Worker Test"
   timeout_in_minutes: 20
   agent_pool: mi325_2
@@ -69,9 +119,6 @@ steps:
     - export MIOPEN_DEBUG_CONV_DIRECT=0
     - export MIOPEN_DEBUG_CONV_GEMM=0
     - export VLLM_ROCM_USE_AITER=1
-    - export VLLM_ROCM_USE_AITER_MHA=1
-    - export VLLM_ROCM_USE_AITER_LINEAR=0
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
 
 - label: "Omni Model Test Qwen3-Omni"
@@ -85,7 +132,10 @@ steps:
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - export MIOPEN_DEBUG_CONV_DIRECT=0
     - export MIOPEN_DEBUG_CONV_GEMM=0
-    - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py
+    - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+    - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
+    - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
+    - pytest -s -v tests/e2e/online_serving/test_async_omni.py
 
 - label: "Diffusion Image Edit Test"
   timeout_in_minutes: 15
@@ -100,7 +150,4 @@ steps:
     - export MIOPEN_DEBUG_CONV_DIRECT=0
     - export MIOPEN_DEBUG_CONV_GEMM=0
     - export VLLM_ROCM_USE_AITER=1
-    - export VLLM_ROCM_USE_AITER_MHA=1
-    - export VLLM_ROCM_USE_AITER_LINEAR=0
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/online_serving/test_i2i_multi_image_input.py
diff --git a/tests/e2e/online_serving/test_qwen3_omni.py b/tests/e2e/online_serving/test_qwen3_omni.py
index 8564104a03f..c591f9bb329 100644
--- a/tests/e2e/online_serving/test_qwen3_omni.py
+++ b/tests/e2e/online_serving/test_qwen3_omni.py
@@ -144,7 +144,6 @@ def get_max_batch_size(size_type="few"):
     return batch_sizes.get(size_type, 5)
 
 
-@pytest.mark.skipif(is_rocm(), reason="Test skipped on AMD environment due to known output issues")
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> None:
     """
@@ -224,7 +223,6 @@ def test_mix_to_text_audio_001(client: openai.OpenAI, omni_server, request) -> N
     assert similarity > 0.9, "The audio content is not same as the text"
 
 
-@pytest.mark.skipif(is_rocm(), reason="Test skipped on AMD environment due to known output issues")
 @pytest.mark.parametrize("omni_server", test_params, indirect=True)
 def test_text_to_text_audio_001(client: openai.OpenAI, omni_server) -> None:
     """

From 8f40f68ae26269bcebba364d10078e077d60a163 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 28 Jan 2026 15:39:46 +0000
Subject: [PATCH 2/4] Drop load_format: dummy and raise max_tokens in ROCm Qwen3-Omni CI config

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .../online_serving/stage_configs/rocm/qwen3_omni_ci.yaml   | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml
index ef0bae0fadc..d8157881246 100644
--- a/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml
+++ b/tests/e2e/online_serving/stage_configs/rocm/qwen3_omni_ci.yaml
@@ -22,7 +22,6 @@ stage_args:
       enable_prefix_caching: false
       hf_config_name: thinker_config
       tensor_parallel_size: 2
-      load_format: dummy
     final_output: true
     final_output_type: text
     is_comprehension: true
@@ -52,7 +51,6 @@ stage_args:
        enable_prefix_caching: false
        distributed_executor_backend: "mp"
        hf_config_name: talker_config
-       load_format: dummy
     engine_input_source: [0]
     custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
     # final_output: true
@@ -60,7 +58,7 @@ stage_args:
     default_sampling_params:
       temperature: 0.9
       top_k: 50
-      max_tokens: 100
+      max_tokens: 1000
       seed: 42
       detokenize: False
       repetition_penalty: 1.05
@@ -83,7 +81,6 @@ stage_args:
       distributed_executor_backend: "mp"
       max_num_batched_tokens: 1000000
       hf_config_name: thinker_config
-      load_format: dummy
       async_scheduling: false
     engine_input_source: [1]
     custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
@@ -93,7 +90,7 @@ stage_args:
       temperature: 0.0
       top_p: 1.0
       top_k: -1
-      max_tokens: 200
+      max_tokens: 2000
       seed: 42
       detokenize: True
       repetition_penalty: 1.1

From aa887f4ca096afd1a2bed2fa28bdbae66b14d4d5 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 28 Jan 2026 15:41:21 +0000
Subject: [PATCH 3/4] Temporarily run only the Audio Generation and Qwen3-Omni tests for debugging

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 236 +++++++++++++++++++--------------------
 1 file changed, 118 insertions(+), 118 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index a570182466c..7462662166a 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1,47 +1,47 @@
 steps:
 
-- label: "Diffusion Model Test"
-  timeout_in_minutes: 20
-  agent_pool: mi325_2
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
+# - label: "Diffusion Model Test"
+#   timeout_in_minutes: 20
+#   agent_pool: mi325_2
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export MIOPEN_DEBUG_CONV_DIRECT=0
+#     - export MIOPEN_DEBUG_CONV_GEMM=0
+#     - export VLLM_ROCM_USE_AITER=1
+#     - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
 
-- label: "Diffusion Images API LoRA E2E"
-  timeout_in_minutes: 20
-  agent_pool: mi325_1
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
+# - label: "Diffusion Images API LoRA E2E"
+#   timeout_in_minutes: 20
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export MIOPEN_DEBUG_CONV_DIRECT=0
+#     - export MIOPEN_DEBUG_CONV_GEMM=0
+#     - export VLLM_ROCM_USE_AITER=1
+#     - pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
 
-- label: "Diffusion Model CPU offloading Test"
-  timeout_in_minutes: 20
-  agent_pool: mi325_1
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
+# - label: "Diffusion Model CPU offloading Test"
+#   timeout_in_minutes: 20
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export MIOPEN_DEBUG_CONV_DIRECT=0
+#     - export MIOPEN_DEBUG_CONV_GEMM=0
+#     - export VLLM_ROCM_USE_AITER=1
+#     - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
   
 - label: "Audio Generation Model Test"
   timeout_in_minutes: 20
@@ -52,74 +52,74 @@ steps:
   commands:
     - pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
 
-- label: "Diffusion Cache Backend Test"
-  timeout_in_minutes: 15
-  agent_pool: mi325_1
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
+# - label: "Diffusion Cache Backend Test"
+#   timeout_in_minutes: 15
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export MIOPEN_DEBUG_CONV_DIRECT=0
+#     - export MIOPEN_DEBUG_CONV_GEMM=0
+#     - export VLLM_ROCM_USE_AITER=1
+#     - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
 
-- label: "Diffusion Sequence Parallelism Test"
-  timeout_in_minutes: 20
-  agent_pool: mi325_2
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
+# - label: "Diffusion Sequence Parallelism Test"
+#   timeout_in_minutes: 20
+#   agent_pool: mi325_2
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export MIOPEN_DEBUG_CONV_DIRECT=0
+#     - export MIOPEN_DEBUG_CONV_GEMM=0
+#     - export VLLM_ROCM_USE_AITER=1
+#     - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
 
-- label: "Diffusion Tensor Parallelism Test"
-  timeout_in_minutes: 20
-  agent_pool: mi325_2
-  depends_on: amd-build
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
+# - label: "Diffusion Tensor Parallelism Test"
+#   timeout_in_minutes: 20
+#   agent_pool: mi325_2
+#   depends_on: amd-build
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export MIOPEN_DEBUG_CONV_DIRECT=0
+#     - export MIOPEN_DEBUG_CONV_GEMM=0
+#     - export VLLM_ROCM_USE_AITER=1
+#     - pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
 
-- label: "Diffusion GPU Worker Test"
-  timeout_in_minutes: 20
-  agent_pool: mi325_2
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - pytest -s -v tests/diffusion/test_gpu_diffusion_worker.py
+# - label: "Diffusion GPU Worker Test"
+#   timeout_in_minutes: 20
+#   agent_pool: mi325_2
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export MIOPEN_DEBUG_CONV_DIRECT=0
+#     - export MIOPEN_DEBUG_CONV_GEMM=0
+#     - pytest -s -v tests/diffusion/test_gpu_diffusion_worker.py
 
-- label: "Omni Model Test Qwen2-5-Omni"
-  timeout_in_minutes: 15
-  agent_pool: mi325_2
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
+# - label: "Omni Model Test Qwen2-5-Omni"
+#   timeout_in_minutes: 15
+#   agent_pool: mi325_2
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export MIOPEN_DEBUG_CONV_DIRECT=0
+#     - export MIOPEN_DEBUG_CONV_GEMM=0
+#     - export VLLM_ROCM_USE_AITER=1
+#     - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
 
 - label: "Omni Model Test Qwen3-Omni"
   timeout_in_minutes: 15
@@ -137,17 +137,17 @@ steps:
     - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
     - pytest -s -v tests/e2e/online_serving/test_async_omni.py
 
-- label: "Diffusion Image Edit Test"
-  timeout_in_minutes: 15
-  agent_pool: mi325_1
-  depends_on: amd-build
-  mirror_hardwares: [amdproduction]
-  grade: Blocking
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
-    - export VLLM_ROCM_USE_AITER=1
-    - pytest -s -v tests/e2e/online_serving/test_i2i_multi_image_input.py
+# - label: "Diffusion Image Edit Test"
+#   timeout_in_minutes: 15
+#   agent_pool: mi325_1
+#   depends_on: amd-build
+#   mirror_hardwares: [amdproduction]
+#   grade: Blocking
+#   commands:
+#     - export GPU_ARCHS=gfx942
+#     - export VLLM_LOGGING_LEVEL=DEBUG
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - export MIOPEN_DEBUG_CONV_DIRECT=0
+#     - export MIOPEN_DEBUG_CONV_GEMM=0
+#     - export VLLM_ROCM_USE_AITER=1
+#     - pytest -s -v tests/e2e/online_serving/test_i2i_multi_image_input.py

From bdb2342b82fd59c1daa6ffe5f761dc725521f11c Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 28 Jan 2026 16:35:57 +0000
Subject: [PATCH 4/4] Re-enable commented-out AMD tests (dropping the Audio Generation step) and set AITER/MIOpen env defaults in run-amd-test.sh

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .../scripts/hardware_ci/run-amd-test.sh       |   6 +
 .buildkite/test-amd.yaml                      | 209 +++++++-----------
 2 files changed, 92 insertions(+), 123 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 5f7440a805c..f86b4b5d958 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -116,6 +116,9 @@ if [[ $commands == *"--shard-id="* ]]; then
         --shm-size=16gb \
         --group-add "$render_gid" \
         --rm \
+        -e MIOPEN_DEBUG_CONV_DIRECT=0 \
+        -e MIOPEN_DEBUG_CONV_GEMM=0 \
+        -e VLLM_ROCM_USE_AITER=1 \
         -e HIP_VISIBLE_DEVICES="${GPU}" \
         -e HF_TOKEN \
         -e AWS_ACCESS_KEY_ID \
@@ -148,6 +151,9 @@ else
           --shm-size=16gb \
           --group-add "$render_gid" \
           --rm \
+          -e MIOPEN_DEBUG_CONV_DIRECT=0 \
+          -e MIOPEN_DEBUG_CONV_GEMM=0 \
+          -e VLLM_ROCM_USE_AITER=1 \
           -e HF_TOKEN \
           -e AWS_ACCESS_KEY_ID \
           -e AWS_SECRET_ACCESS_KEY \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 7462662166a..7a91c829806 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1,125 +1,93 @@
 steps:
 
-# - label: "Diffusion Model Test"
-#   timeout_in_minutes: 20
-#   agent_pool: mi325_2
-#   depends_on: amd-build
-#   mirror_hardwares: [amdproduction]
-#   grade: Blocking
-#   commands:
-#     - export GPU_ARCHS=gfx942
-#     - export MIOPEN_DEBUG_CONV_DIRECT=0
-#     - export MIOPEN_DEBUG_CONV_GEMM=0
-#     - export VLLM_ROCM_USE_AITER=1
-#     - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
+- label: "Diffusion Model Test"
+  timeout_in_minutes: 20
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
 
-# - label: "Diffusion Images API LoRA E2E"
-#   timeout_in_minutes: 20
-#   agent_pool: mi325_1
-#   depends_on: amd-build
-#   mirror_hardwares: [amdproduction]
-#   grade: Blocking
-#   commands:
-#     - export GPU_ARCHS=gfx942
-#     - export VLLM_LOGGING_LEVEL=DEBUG
-#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - export MIOPEN_DEBUG_CONV_DIRECT=0
-#     - export MIOPEN_DEBUG_CONV_GEMM=0
-#     - export VLLM_ROCM_USE_AITER=1
-#     - pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
+- label: "Diffusion Images API LoRA E2E"
+  timeout_in_minutes: 20
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/online_serving/test_images_generations_lora.py
 
-# - label: "Diffusion Model CPU offloading Test"
-#   timeout_in_minutes: 20
-#   agent_pool: mi325_1
-#   depends_on: amd-build
-#   mirror_hardwares: [amdproduction]
-#   grade: Blocking
-#   commands:
-#     - export GPU_ARCHS=gfx942
-#     - export VLLM_LOGGING_LEVEL=DEBUG
-#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - export MIOPEN_DEBUG_CONV_DIRECT=0
-#     - export MIOPEN_DEBUG_CONV_GEMM=0
-#     - export VLLM_ROCM_USE_AITER=1
-#     - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
-  
-- label: "Audio Generation Model Test"
+- label: "Diffusion Model CPU offloading Test"
   timeout_in_minutes: 20
   agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdproduction]
   grade: Blocking
   commands:
-    - pytest -s -v tests/e2e/offline_inference/test_stable_audio_model.py
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
 
-# - label: "Diffusion Cache Backend Test"
-#   timeout_in_minutes: 15
-#   agent_pool: mi325_1
-#   depends_on: amd-build
-#   mirror_hardwares: [amdproduction]
-#   grade: Blocking
-#   commands:
-#     - export GPU_ARCHS=gfx942
-#     - export VLLM_LOGGING_LEVEL=DEBUG
-#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - export MIOPEN_DEBUG_CONV_DIRECT=0
-#     - export MIOPEN_DEBUG_CONV_GEMM=0
-#     - export VLLM_ROCM_USE_AITER=1
-#     - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
+- label: "Diffusion Cache Backend Test"
+  timeout_in_minutes: 15
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
 
-# - label: "Diffusion Sequence Parallelism Test"
-#   timeout_in_minutes: 20
-#   agent_pool: mi325_2
-#   depends_on: amd-build
-#   mirror_hardwares: [amdproduction]
-#   grade: Blocking
-#   commands:
-#     - export GPU_ARCHS=gfx942
-#     - export VLLM_LOGGING_LEVEL=DEBUG
-#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - export MIOPEN_DEBUG_CONV_DIRECT=0
-#     - export MIOPEN_DEBUG_CONV_GEMM=0
-#     - export VLLM_ROCM_USE_AITER=1
-#     - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
+- label: "Diffusion Sequence Parallelism Test"
+  timeout_in_minutes: 20
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
 
-# - label: "Diffusion Tensor Parallelism Test"
-#   timeout_in_minutes: 20
-#   agent_pool: mi325_2
-#   depends_on: amd-build
-#   commands:
-#     - export GPU_ARCHS=gfx942
-#     - export VLLM_LOGGING_LEVEL=DEBUG
-#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - export MIOPEN_DEBUG_CONV_DIRECT=0
-#     - export MIOPEN_DEBUG_CONV_GEMM=0
-#     - export VLLM_ROCM_USE_AITER=1
-#     - pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
+- label: "Diffusion Tensor Parallelism Test"
+  timeout_in_minutes: 20
+  agent_pool: mi325_2
+  depends_on: amd-build
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/offline_inference/test_zimage_tensor_parallel.py
 
-# - label: "Diffusion GPU Worker Test"
-#   timeout_in_minutes: 20
-#   agent_pool: mi325_2
-#   depends_on: amd-build
-#   mirror_hardwares: [amdproduction]
-#   grade: Blocking
-#   commands:
-#     - export MIOPEN_DEBUG_CONV_DIRECT=0
-#     - export MIOPEN_DEBUG_CONV_GEMM=0
-#     - pytest -s -v tests/diffusion/test_gpu_diffusion_worker.py
+- label: "Diffusion GPU Worker Test"
+  timeout_in_minutes: 20
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - pytest -s -v tests/diffusion/test_gpu_diffusion_worker.py
 
-# - label: "Omni Model Test Qwen2-5-Omni"
-#   timeout_in_minutes: 15
-#   agent_pool: mi325_2
-#   depends_on: amd-build
-#   mirror_hardwares: [amdproduction]
-#   grade: Blocking
-#   commands:
-#     - export GPU_ARCHS=gfx942
-#     - export VLLM_LOGGING_LEVEL=DEBUG
-#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - export MIOPEN_DEBUG_CONV_DIRECT=0
-#     - export MIOPEN_DEBUG_CONV_GEMM=0
-#     - export VLLM_ROCM_USE_AITER=1
-#     - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
+- label: "Omni Model Test Qwen2-5-Omni"
+  timeout_in_minutes: 15
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
 
 - label: "Omni Model Test Qwen3-Omni"
   timeout_in_minutes: 15
@@ -130,24 +98,19 @@ steps:
   commands:
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_DEBUG_CONV_DIRECT=0
-    - export MIOPEN_DEBUG_CONV_GEMM=0
     - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
     - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
     - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py
     - pytest -s -v tests/e2e/online_serving/test_async_omni.py
 
-# - label: "Diffusion Image Edit Test"
-#   timeout_in_minutes: 15
-#   agent_pool: mi325_1
-#   depends_on: amd-build
-#   mirror_hardwares: [amdproduction]
-#   grade: Blocking
-#   commands:
-#     - export GPU_ARCHS=gfx942
-#     - export VLLM_LOGGING_LEVEL=DEBUG
-#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-#     - export MIOPEN_DEBUG_CONV_DIRECT=0
-#     - export MIOPEN_DEBUG_CONV_GEMM=0
-#     - export VLLM_ROCM_USE_AITER=1
-#     - pytest -s -v tests/e2e/online_serving/test_i2i_multi_image_input.py
+- label: "Diffusion Image Edit Test"
+  timeout_in_minutes: 15
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/online_serving/test_i2i_multi_image_input.py