diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index d9a2315953..00823951dc 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -44,11 +44,19 @@ steps:
agents:
queue: "cpu_queue_premerge"
- # L4 Test — main+NIGHTLY=1 (scheduled), or PR with label nightly-test (e.g. add label then Rebuild)
+ # L4 Test — main+NIGHTLY=1 (scheduled), or PR with specific label (e.g. add label then Rebuild)
- label: "Upload Nightly Pipeline"
depends_on: image-build
key: upload-nightly-pipeline
- if: '(build.branch == "main" && build.env("NIGHTLY") == "1") || (build.branch != "main" && build.pull_request.labels includes "nightly-test")'
+ if: >-
+ (build.branch == "main" && build.env("NIGHTLY") == "1") ||
+ (build.branch != "main" && (
+ build.pull_request.labels includes "nightly-test" ||
+ build.pull_request.labels includes "omni-test" ||
+ build.pull_request.labels includes "tts-test" ||
+ build.pull_request.labels includes "diffusion-x2iat-test" ||
+ build.pull_request.labels includes "diffusion-x2v-test"
+ ))
commands:
- buildkite-agent pipeline upload .buildkite/test-nightly.yml
agents:
diff --git a/.buildkite/test-nightly-diffusion.yml b/.buildkite/test-nightly-diffusion.yml
deleted file mode 100644
index b5ba8a117c..0000000000
--- a/.buildkite/test-nightly-diffusion.yml
+++ /dev/null
@@ -1,417 +0,0 @@
-# Nightly diffusion GPU tests — appended to the main nightly build via
-# buildkite-agent pipeline upload .buildkite/test-nightly-diffusion.yml
-# from test-nightly.yml (step key: nightly-diffusion-model-test). Top-level groups are
-# foldable in the Buildkite UI (Other / Wan / Qwen-Image).
-env:
- VLLM_WORKER_MULTIPROC_METHOD: spawn
- HF_HUB_DOWNLOAD_TIMEOUT: 300
- HF_HUB_ETAG_TIMEOUT: 60
-
-steps:
- - group: ":card_index_dividers: Other Model Test"
- key: nightly-other-model-test-group
- steps:
- - label: ":full_moon: Diffusion · Other · Function Test with H100"
- timeout_in_minutes: 120
- # Shared nightly vs PR label conditional; referenced below as *nightly_or_pr_label
- if: &nightly_or_pr_label 'build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"'
- commands:
- - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not test_qwen_image" -m "advanced_model and diffusion and H100" --run-level "advanced_model"
- agents:
- queue: "mithril-h100-pool"
- plugins:
- - kubernetes:
- podSpec:
- containers:
- - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- resources:
- limits:
- nvidia.com/gpu: 2
- volumeMounts:
- - name: devshm
- mountPath: /dev/shm
- - name: hf-cache
- mountPath: /root/.cache/huggingface
- env:
- - name: HF_HOME
- value: /root/.cache/huggingface
- - name: HF_TOKEN
- valueFrom:
- secretKeyRef:
- name: hf-token-secret
- key: token
- nodeSelector:
- node.kubernetes.io/instance-type: gpu-h100-sxm
- volumes:
- - name: devshm
- emptyDir:
- medium: Memory
- - name: hf-cache
- hostPath:
- path: /mnt/hf-cache
- type: DirectoryOrCreate
-
- - label: ":full_moon: Diffusion · Other · Function Test with L4"
- timeout_in_minutes: 60
- if: *nightly_or_pr_label
- commands:
- - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and L4" --run-level "advanced_model"
- agents:
- queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
- plugins:
- - docker#v5.2.0:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- always-pull: true
- propagate-environment: true
- shm-size: "8gb"
- environment:
- - "HF_HOME=/fsx/hf_cache"
- - "HF_TOKEN"
- volumes:
- - "/fsx/hf_cache:/fsx/hf_cache"
-
- - label: ":full_moon: Diffusion · Other · Doc Test"
- timeout_in_minutes: 60
- if: *nightly_or_pr_label
- commands:
- - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
- - pytest -s -v tests/examples/online_serving/test_text_to_image.py tests/examples/offline_inference/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model"
- agents:
- queue: "mithril-h100-pool"
- plugins:
- - kubernetes:
- podSpec:
- containers:
- - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- resources:
- limits:
- nvidia.com/gpu: 2
- volumeMounts:
- - name: devshm
- mountPath: /dev/shm
- - name: hf-cache
- mountPath: /root/.cache/huggingface
- env:
- - name: HF_HOME
- value: /root/.cache/huggingface
- - name: HF_TOKEN
- valueFrom:
- secretKeyRef:
- name: hf-token-secret
- key: token
- nodeSelector:
- node.kubernetes.io/instance-type: gpu-h100-sxm
- volumes:
- - name: devshm
- emptyDir:
- medium: Memory
- - name: hf-cache
- hostPath:
- path: /mnt/hf-cache
- type: DirectoryOrCreate
-
- - group: ":card_index_dividers: Wan Series Model Test"
- key: nightly-wan-model-test-group
- steps:
- - label: ":full_moon: Diffusion · Wan · Function Test"
- timeout_in_minutes: 90
- if: *nightly_or_pr_label
- commands:
- - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py -m "advanced_model" --run-level "advanced_model"
- agents:
- queue: "mithril-h100-pool"
- plugins:
- - kubernetes:
- podSpec:
- containers:
- - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- resources:
- limits:
- nvidia.com/gpu: 2
- volumeMounts:
- - name: devshm
- mountPath: /dev/shm
- - name: hf-cache
- mountPath: /root/.cache/huggingface
- env:
- - name: HF_HOME
- value: /root/.cache/huggingface
- - name: HF_TOKEN
- valueFrom:
- secretKeyRef:
- name: hf-token-secret
- key: token
- nodeSelector:
- node.kubernetes.io/instance-type: gpu-h100-sxm
- volumes:
- - name: devshm
- emptyDir:
- medium: Memory
- - name: hf-cache
- hostPath:
- path: /mnt/hf-cache
- type: DirectoryOrCreate
-
- - label: ":full_moon: Diffusion · Wan · Accuracy Test"
- key: nightly-wan22-i2v-accuracy
- timeout_in_minutes: 180
- if: *nightly_or_pr_label
- commands:
- - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model
- agents:
- queue: "mithril-h100-pool"
- plugins:
- - kubernetes:
- podSpec:
- containers:
- - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- resources:
- limits:
- nvidia.com/gpu: 2
- volumeMounts:
- - name: devshm
- mountPath: /dev/shm
- - name: hf-cache
- mountPath: /root/.cache/huggingface
- env:
- - name: HF_HOME
- value: /root/.cache/huggingface
- - name: HF_TOKEN
- valueFrom:
- secretKeyRef:
- name: hf-token-secret
- key: token
- nodeSelector:
- node.kubernetes.io/instance-type: gpu-h100-sxm
- volumes:
- - name: devshm
- emptyDir:
- medium: Memory
- - name: hf-cache
- hostPath:
- path: /mnt/hf-cache
- type: DirectoryOrCreate
-
- - group: ":card_index_dividers: Qwen-Image Series Model Test"
- key: nightly-qwen-image-edit-group
- steps:
- - label: ":full_moon: Diffusion · Qwen-Image · Function Test with H100"
- timeout_in_minutes: 120
- if: *nightly_or_pr_label
- commands:
- - pytest -s -v tests/e2e/online_serving/test_qwen_image*_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model"
- agents:
- queue: "mithril-h100-pool"
- plugins:
- - kubernetes:
- podSpec:
- containers:
- - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- resources:
- limits:
- nvidia.com/gpu: 2
- volumeMounts:
- - name: devshm
- mountPath: /dev/shm
- - name: hf-cache
- mountPath: /root/.cache/huggingface
- env:
- - name: HF_HOME
- value: /root/.cache/huggingface
- - name: HF_TOKEN
- valueFrom:
- secretKeyRef:
- name: hf-token-secret
- key: token
- nodeSelector:
- node.kubernetes.io/instance-type: gpu-h100-sxm
- volumes:
- - name: devshm
- emptyDir:
- medium: Memory
- - name: hf-cache
- hostPath:
- path: /mnt/hf-cache
- type: DirectoryOrCreate
-
- - label: ":full_moon: Diffusion · Qwen-Image · GEBench Accuracy Test"
- key: nightly-gebench-accuracy
- timeout_in_minutes: 60
- if: *nightly_or_pr_label
- commands:
- - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1
- - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json"
- agents:
- queue: "mithril-h100-pool"
- plugins:
- - kubernetes:
- podSpec:
- containers:
- - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- resources:
- limits:
- nvidia.com/gpu: 1
- volumeMounts:
- - name: devshm
- mountPath: /dev/shm
- - name: hf-cache
- mountPath: /root/.cache/huggingface
- env:
- - name: HF_HOME
- value: /root/.cache/huggingface
- - name: HF_TOKEN
- valueFrom:
- secretKeyRef:
- name: hf-token-secret
- key: token
- nodeSelector:
- node.kubernetes.io/instance-type: gpu-h100-sxm
- volumes:
- - name: devshm
- emptyDir:
- medium: Memory
- - name: hf-cache
- hostPath:
- path: /mnt/hf-cache
- type: DirectoryOrCreate
-
- - label: ":full_moon: Diffusion · Qwen-Image · GEdit-Bench Accuracy Test"
- key: nightly-gedit-bench-accuracy
- timeout_in_minutes: 60
- if: *nightly_or_pr_label
- commands:
- - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1
- - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv"
- - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json"
- agents:
- queue: "mithril-h100-pool"
- plugins:
- - kubernetes:
- podSpec:
- containers:
- - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- resources:
- limits:
- nvidia.com/gpu: 1
- volumeMounts:
- - name: devshm
- mountPath: /dev/shm
- - name: hf-cache
- mountPath: /root/.cache/huggingface
- env:
- - name: HF_HOME
- value: /root/.cache/huggingface
- - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE
- value: "120"
- - name: HF_TOKEN
- valueFrom:
- secretKeyRef:
- name: hf-token-secret
- key: token
- nodeSelector:
- node.kubernetes.io/instance-type: gpu-h100-sxm
- volumes:
- - name: devshm
- emptyDir:
- medium: Memory
- - name: hf-cache
- hostPath:
- path: /mnt/hf-cache
- type: DirectoryOrCreate
-
- - label: ":full_moon: Diffusion · Qwen-Image · Perf Test"
- key: nightly-qwen-image-performance
- timeout_in_minutes: 180
- if: *nightly_or_pr_label
- commands:
- - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results
- - export DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN
- - export CACHE_DIT_VERSION=1.3.0
- # [HACK]: run upload in the same command block as pytest.
- # Because `exit` aborts the entire commands list.
- - |
- set +e
- pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json
- EXIT1=$$?
- pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json
- EXIT2=$$?
- pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json
- EXIT3=$$?
- if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ]; then
- buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json"
- buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log"
- fi
- exit $$((EXIT1 | EXIT2 | EXIT3))
- agents:
- queue: "mithril-h100-pool"
- plugins:
- - kubernetes:
- podSpec:
- containers:
- - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- resources:
- limits:
- nvidia.com/gpu: 4
- volumeMounts:
- - name: devshm
- mountPath: /dev/shm
- - name: hf-cache
- mountPath: /root/.cache/huggingface
- env:
- - name: HF_HOME
- value: /root/.cache/huggingface
- - name: HF_TOKEN
- valueFrom:
- secretKeyRef:
- name: hf-token-secret
- key: token
- nodeSelector:
- node.kubernetes.io/instance-type: gpu-h100-sxm
- volumes:
- - name: devshm
- emptyDir:
- medium: Memory
- - name: hf-cache
- hostPath:
- path: /mnt/hf-cache
- type: DirectoryOrCreate
-
- - label: ":full_moon: Diffusion · Qwen-Image · Accuracy Test"
- key: nightly-qwen-image-accuracy
- timeout_in_minutes: 180
- if: *nightly_or_pr_label
- commands:
- - pytest -s -v tests/e2e/accuracy/test_qwen_image*.py --run-level advanced_model
- agents:
- queue: "mithril-h100-pool"
- plugins:
- - kubernetes:
- podSpec:
- containers:
- - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
- resources:
- limits:
- nvidia.com/gpu: 1
- volumeMounts:
- - name: devshm
- mountPath: /dev/shm
- - name: hf-cache
- mountPath: /root/.cache/huggingface
- env:
- - name: HF_HOME
- value: /root/.cache/huggingface
- - name: HF_TOKEN
- valueFrom:
- secretKeyRef:
- name: hf-token-secret
- key: token
- nodeSelector:
- node.kubernetes.io/instance-type: gpu-h100-sxm
- volumes:
- - name: devshm
- emptyDir:
- medium: Memory
- - name: hf-cache
- hostPath:
- path: /mnt/hf-cache
- type: DirectoryOrCreate
diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 31b3e17976..58e1e55af7 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -7,12 +7,11 @@ steps:
# Group: collapses under one heading in the Buildkite UI; child steps still run in parallel.
- group: ":card_index_dividers: Omni Model Test"
key: nightly-omni-test-group
+ depends_on: upload-nightly-pipeline
+ if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" || build.pull_request.labels includes "omni-test"
steps:
- - label: ":full_moon: Omni · Function Test with H100"
+ - label: ":full_moon: Omni · Function Test"
timeout_in_minutes: 90
- depends_on: upload-nightly-pipeline
- # Shared nightly vs PR label conditional; referenced below as *nightly_or_pr_label
- if: &nightly_or_pr_label 'build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"'
commands:
- pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and H100 and omni" --run-level "advanced_model"
agents:
@@ -49,13 +48,11 @@ steps:
path: /mnt/hf-cache
type: DirectoryOrCreate
- - label: ":full_moon: Omni · Function Test with L4"
+ - label: ":full_moon: Omni · Doc Test with L4"
timeout_in_minutes: 90
- depends_on: upload-nightly-pipeline
- if: *nightly_or_pr_label
commands:
- export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
- - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model"
+ - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model"
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
@@ -70,13 +67,203 @@ steps:
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- - label: ":full_moon: Omni · Doc Test with L4"
+ - label: ":full_moon: Omni · Doc Test with H100"
+ timeout_in_minutes: 90
+ commands:
+ - pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model"
+ agents:
+ queue: "mithril-h100-pool"
+ plugins:
+ - kubernetes:
+ podSpec:
+ containers:
+ - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ resources:
+ limits:
+ nvidia.com/gpu: 2
+ volumeMounts:
+ - name: devshm
+ mountPath: /dev/shm
+ - name: hf-cache
+ mountPath: /root/.cache/huggingface
+ env:
+ - name: HF_HOME
+ value: /root/.cache/huggingface
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+ nodeSelector:
+ node.kubernetes.io/instance-type: gpu-h100-sxm
+ volumes:
+ - name: devshm
+ emptyDir:
+ medium: Memory
+ - name: hf-cache
+ hostPath:
+ path: /mnt/hf-cache
+ type: DirectoryOrCreate
+
+ - label: ":full_moon: Omni · Perf Test"
+ key: nightly-omni-performance
+ timeout_in_minutes: 180
+ commands:
+ - export BENCHMARK_DIR=tests/dfx/perf/results
+ - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_omni.json
+ - buildkite-agent artifact upload "tests/dfx/perf/results/*.json"
+ agents:
+ queue: "mithril-h100-pool"
+ plugins:
+ - kubernetes:
+ podSpec:
+ containers:
+ - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ resources:
+ limits:
+ nvidia.com/gpu: 2
+ volumeMounts:
+ - name: devshm
+ mountPath: /dev/shm
+ - name: hf-cache
+ mountPath: /root/.cache/huggingface
+ env:
+ - name: HF_HOME
+ value: /root/.cache/huggingface
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+ nodeSelector:
+ node.kubernetes.io/instance-type: gpu-h100-sxm
+ volumes:
+ - name: devshm
+ emptyDir:
+ medium: Memory
+ - name: hf-cache
+ hostPath:
+ path: /mnt/hf-cache
+ type: DirectoryOrCreate
+
+
+ - group: ":card_index_dividers: TTS Model Test"
+ key: nightly-tts-test-group
+ depends_on: upload-nightly-pipeline
+ if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test" || build.pull_request.labels includes "tts-test"
+ steps:
+ - label: ":full_moon: TTS · Function Test"
timeout_in_minutes: 90
- depends_on: upload-nightly-pipeline
- if: *nightly_or_pr_label
commands:
- export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
- - pytest -s -v tests/examples/ -m "advanced_model and omni and L4" --run-level "advanced_model"
+      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model" # NOTE(review): "omni" marker inside the TTS group — copied from the old Omni·L4 function test; confirm this shouldn't select "tts"-marked tests instead
+ agents:
+        queue: "gpu_1_queue" # single-GPU L4 queue — NOTE(review): previous comment ("g6.12xlarge, 4 L4 GPU") described gpu_4_queue; confirm gpu_1_queue is intended here
+ plugins:
+ - docker#v5.2.0:
+ image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ always-pull: true
+ propagate-environment: true
+ shm-size: "8gb"
+ environment:
+ - "HF_HOME=/fsx/hf_cache"
+ - "HF_TOKEN"
+ volumes:
+ - "/fsx/hf_cache:/fsx/hf_cache"
+
+ - label: ":full_moon: TTS · Perf Test"
+ key: nightly-tts-performance
+ timeout_in_minutes: 180
+ commands:
+ - export BENCHMARK_DIR=tests/dfx/perf/results
+ - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
+ - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py --test-config-file tests/dfx/perf/tests/test_tts.json
+ - buildkite-agent artifact upload "tests/dfx/perf/results/*.json"
+ agents:
+ queue: "mithril-h100-pool"
+ plugins:
+ - kubernetes:
+ podSpec:
+ containers:
+ - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ resources:
+ limits:
+ nvidia.com/gpu: 1
+ volumeMounts:
+ - name: devshm
+ mountPath: /dev/shm
+ - name: hf-cache
+ mountPath: /root/.cache/huggingface
+ env:
+ - name: HF_HOME
+ value: /root/.cache/huggingface
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+ nodeSelector:
+ node.kubernetes.io/instance-type: gpu-h100-sxm
+ volumes:
+ - name: devshm
+ emptyDir:
+ medium: Memory
+ - name: hf-cache
+ hostPath:
+ path: /mnt/hf-cache
+ type: DirectoryOrCreate
+
+ # Diffusion X2I suite: x2i / x2a / x2t and related non-video paths; x2v is only in "Diffusion X2V Model Test" below.
+ - group: ":card_index_dividers: Diffusion X2I(&A&T) Model Test"
+ key: nightly-diffusion-x2iat-group
+ depends_on: upload-nightly-pipeline
+ if: >-
+ build.env("NIGHTLY") == "1" ||
+ build.pull_request.labels includes "nightly-test" ||
+ build.pull_request.labels includes "diffusion-x2iat-test"
+ steps:
+ - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with H100"
+ timeout_in_minutes: 120
+ commands:
+ - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and H100" --run-level "advanced_model"
+ agents:
+ queue: "mithril-h100-pool"
+ plugins:
+ - kubernetes:
+ podSpec:
+ containers:
+ - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ resources:
+ limits:
+ nvidia.com/gpu: 2
+ volumeMounts:
+ - name: devshm
+ mountPath: /dev/shm
+ - name: hf-cache
+ mountPath: /root/.cache/huggingface
+ env:
+ - name: HF_HOME
+ value: /root/.cache/huggingface
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+ nodeSelector:
+ node.kubernetes.io/instance-type: gpu-h100-sxm
+ volumes:
+ - name: devshm
+ emptyDir:
+ medium: Memory
+ - name: hf-cache
+ hostPath:
+ path: /mnt/hf-cache
+ type: DirectoryOrCreate
+
+ - label: ":full_moon: Diffusion X2I(&A&T) · Function Test with L4"
+ timeout_in_minutes: 60
+ commands:
+ - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -k "not test_wan22_expansion and not test_wan_2_1_vace_expansion and not hunyuan" -m "advanced_model and diffusion and L4" --run-level "advanced_model"
agents:
queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
plugins:
@@ -91,12 +278,11 @@ steps:
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"
- - label: ":full_moon: Omni · Doc Test with H100"
- timeout_in_minutes: 90
- depends_on: upload-nightly-pipeline
- if: *nightly_or_pr_label
+ - label: ":full_moon: Diffusion X2I(&A&T) · Doc Test"
+ timeout_in_minutes: 60
commands:
- - pytest -s -v tests/examples/ -m "advanced_model and omni and H100" --run-level "advanced_model"
+ - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+ - pytest -s -v tests/examples/*/test_text_to_image.py -m "advanced_model and example and H100" --run-level "advanced_model"
agents:
queue: "mithril-h100-pool"
plugins:
@@ -131,16 +317,109 @@ steps:
path: /mnt/hf-cache
type: DirectoryOrCreate
- - label: ":full_moon: Omni · Perf Test"
- key: nightly-omni-performance
+ - label: ":full_moon: Diffusion X2I(&A&T) · GEBench Accuracy Test"
+ timeout_in_minutes: 60
+ commands:
+ - pytest -s -v tests/e2e/accuracy/test_gebench_h100_smoke.py --run-level advanced_model --gebench-model Qwen/Qwen-Image-2512 --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gebench-port 8093 --accuracy-workers 1
+ - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gebench_qwen-image-2512/summary*.json"
+ agents:
+ queue: "mithril-h100-pool"
+ plugins:
+ - kubernetes:
+ podSpec:
+ containers:
+ - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ resources:
+ limits:
+ nvidia.com/gpu: 1
+ volumeMounts:
+ - name: devshm
+ mountPath: /dev/shm
+ - name: hf-cache
+ mountPath: /root/.cache/huggingface
+ env:
+ - name: HF_HOME
+ value: /root/.cache/huggingface
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+ nodeSelector:
+ node.kubernetes.io/instance-type: gpu-h100-sxm
+ volumes:
+ - name: devshm
+ emptyDir:
+ medium: Memory
+ - name: hf-cache
+ hostPath:
+ path: /mnt/hf-cache
+ type: DirectoryOrCreate
+
+ - label: ":full_moon: Diffusion X2I(&A&T) · GEdit-Bench Accuracy Test"
+ timeout_in_minutes: 60
+ commands:
+ - pytest -s -v tests/e2e/accuracy/test_gedit_bench_h100_smoke.py --run-level advanced_model --gedit-model Qwen/Qwen-Image-Edit --accuracy-judge-model QuantTrio/Qwen3-VL-30B-A3B-Instruct-AWQ --accuracy-gpu 0 --gedit-port 8093 --gedit-samples-per-group 20 --accuracy-workers 1
+ - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_vie_score_*.csv"
+ - buildkite-agent artifact upload "tests/e2e/accuracy/artifacts/gedit_scores_qwen-image-edit/qwen-image-edit_all_all_summary_*.json"
+ agents:
+ queue: "mithril-h100-pool"
+ plugins:
+ - kubernetes:
+ podSpec:
+ containers:
+ - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ resources:
+ limits:
+ nvidia.com/gpu: 1
+ volumeMounts:
+ - name: devshm
+ mountPath: /dev/shm
+ - name: hf-cache
+ mountPath: /root/.cache/huggingface
+ env:
+ - name: HF_HOME
+ value: /root/.cache/huggingface
+ - name: VLLM_HTTP_TIMEOUT_KEEP_ALIVE
+ value: "120"
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+ nodeSelector:
+ node.kubernetes.io/instance-type: gpu-h100-sxm
+ volumes:
+ - name: devshm
+ emptyDir:
+ medium: Memory
+ - name: hf-cache
+ hostPath:
+ path: /mnt/hf-cache
+ type: DirectoryOrCreate
+
+ - label: ":full_moon: Diffusion X2I(&A&T) · Perf Test"
+ key: nightly-diffusion-x2iat-performance
timeout_in_minutes: 180
- depends_on: upload-nightly-pipeline
- if: *nightly_or_pr_label
commands:
- - export BENCHMARK_DIR=tests/dfx/perf/results
- - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
- - pytest -s -v tests/dfx/perf/scripts/run_benchmark.py
- - buildkite-agent artifact upload "tests/dfx/perf/results/*.json"
+ - export DIFFUSION_BENCHMARK_DIR=tests/dfx/perf/results
+ - export DIFFUSION_ATTENTION_BACKEND=FLASH_ATTN
+ - export CACHE_DIT_VERSION=1.3.0
+ # [HACK]: run upload in the same command block as pytest.
+ # Because `exit` aborts the entire commands list.
+ - |
+ set +e
+            pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json # NOTE(review): deleted test-nightly-diffusion.yml called this script with --config-file; confirm run_diffusion_benchmark.py accepts --test-config-file (applies to the two invocations below too)
+ EXIT1=$$?
+ pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_edit_vllm_omni.json
+ EXIT2=$$?
+ pytest -s -v tests/dfx/perf/scripts/run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_edit_2509_vllm_omni.json
+ EXIT3=$$?
+ if [ $$EXIT1 -eq 0 ] || [ $$EXIT2 -eq 0 ] || [ $$EXIT3 -eq 0 ]; then
+ buildkite-agent artifact upload "tests/dfx/perf/results/diffusion_result_*.json"
+ buildkite-agent artifact upload "tests/dfx/perf/results/logs/*.log"
+ fi
+ exit $$((EXIT1 | EXIT2 | EXIT3))
agents:
queue: "mithril-h100-pool"
plugins:
@@ -150,7 +429,7 @@ steps:
- image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
- nvidia.com/gpu: 2
+ nvidia.com/gpu: 4
volumeMounts:
- name: devshm
mountPath: /dev/shm
@@ -175,23 +454,96 @@ steps:
path: /mnt/hf-cache
type: DirectoryOrCreate
- # Dynamically appends steps from test-nightly-diffusion.yml into this build (same mechanism as
- # pipeline.yml → test-ready.yml / test-merge.yml / test-nightly.yml). Foldable groups stay in the
- # uploaded YAML (Other / Wan / Qwen-Image).
- - label: ":card_index_dividers: Diffusion Model Test"
- key: nightly-diffusion-model-test
+ # Diffusion x2v only (Wan, HunyuanVideo, …). x2i/x2a/x2t live in the X2I group above, not here.
+ - group: ":card_index_dividers: Diffusion X2V Model Test"
+ key: nightly-diffusion-x2v-group
depends_on: upload-nightly-pipeline
- if: *nightly_or_pr_label
- commands:
- - buildkite-agent pipeline upload .buildkite/test-nightly-diffusion.yml
- agents:
- queue: "cpu_queue_premerge"
+ if: >-
+ build.env("NIGHTLY") == "1" ||
+ build.pull_request.labels includes "nightly-test" ||
+ build.pull_request.labels includes "diffusion-x2v-test"
+ steps:
+ - label: ":full_moon: Diffusion X2V · Function Test"
+ timeout_in_minutes: 90
+ commands:
+ - pytest -s -v tests/e2e/online_serving/test_wan22_expansion.py tests/e2e/online_serving/test_wan_2_1_vace_expansion.py tests/e2e/online_serving/test_hunyuan_video_15_expansion.py -m "advanced_model" --run-level "advanced_model"
+ agents:
+ queue: "mithril-h100-pool"
+ plugins:
+ - kubernetes:
+ podSpec:
+ containers:
+ - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ resources:
+ limits:
+ nvidia.com/gpu: 2
+ volumeMounts:
+ - name: devshm
+ mountPath: /dev/shm
+ - name: hf-cache
+ mountPath: /root/.cache/huggingface
+ env:
+ - name: HF_HOME
+ value: /root/.cache/huggingface
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+ nodeSelector:
+ node.kubernetes.io/instance-type: gpu-h100-sxm
+ volumes:
+ - name: devshm
+ emptyDir:
+ medium: Memory
+ - name: hf-cache
+ hostPath:
+ path: /mnt/hf-cache
+ type: DirectoryOrCreate
+
+ - label: ":full_moon: Diffusion X2V · Accuracy Test"
+ timeout_in_minutes: 180
+ commands:
+ - pytest -s -v tests/e2e/accuracy/wan22_i2v/test_wan22_i2v_video_similarity.py --run-level advanced_model
+ agents:
+ queue: "mithril-h100-pool"
+ plugins:
+ - kubernetes:
+ podSpec:
+ containers:
+ - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+ resources:
+ limits:
+ nvidia.com/gpu: 2
+ volumeMounts:
+ - name: devshm
+ mountPath: /dev/shm
+ - name: hf-cache
+ mountPath: /root/.cache/huggingface
+ env:
+ - name: HF_HOME
+ value: /root/.cache/huggingface
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: hf-token-secret
+ key: token
+ nodeSelector:
+ node.kubernetes.io/instance-type: gpu-h100-sxm
+ volumes:
+ - name: devshm
+ emptyDir:
+ medium: Memory
+ - name: hf-cache
+ hostPath:
+ path: /mnt/hf-cache
+ type: DirectoryOrCreate
- label: ":bar_chart: Testcase Statistics"
key: nightly-testcase-statistics
timeout_in_minutes: 120
depends_on: upload-nightly-pipeline
- if: *nightly_or_pr_label
+ if: build.env("NIGHTLY") == "1" || build.pull_request.labels includes "nightly-test"
commands:
- python tools/nightly/buildkite_testcase_statistics.py -o tests/dfx/perf/results/buildkite_testcase_statistics.html
- buildkite-agent artifact upload "tests/dfx/perf/results/*.html"
@@ -234,15 +586,17 @@ steps:
key: nightly-perf-distribution
depends_on:
- nightly-omni-performance
- - nightly-qwen-image-performance
+ - nightly-tts-performance
+ - nightly-diffusion-x2iat-performance
- nightly-testcase-statistics
if: build.env("NIGHTLY") == "1"
commands:
- pip install openpyxl
- export DEFAULT_INPUT_DIR=tests/dfx/perf/results
- export DEFAULT_OUTPUT_DIR=tests/dfx/perf/results
+ - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-tts-performance
- buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-omni-performance
- - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-qwen-image-performance
+ - buildkite-agent artifact download "tests/dfx/perf/results/*.json" . --step nightly-diffusion-x2iat-performance
- buildkite-agent artifact download "tests/dfx/perf/results/*.html" . --step nightly-testcase-statistics
- python tools/nightly/generate_nightly_perf_excel.py
- python tools/nightly/generate_nightly_perf_html.py
diff --git a/docs/contributing/ci/CI_5levels.md b/docs/contributing/ci/CI_5levels.md
index 9306035738..b0428ddd7d 100644
--- a/docs/contributing/ci/CI_5levels.md
+++ b/docs/contributing/ci/CI_5levels.md
@@ -86,7 +86,8 @@ Through five levels (L1-L5) and common (Common) specifications, the system clari
/tests/e2e/online_serving/test_{model_name}_expansion.py
/tests/e2e/offline_inference/test_{model_name}_expansion.py
Performance:
- /tests/dfx/perf/tests/test.json
+ /tests/dfx/perf/tests/test_qwen_omni.json (Omni), test_tts.json (TTS),
+ and /tests/dfx/perf/tests/test_{diffusion_model}_vllm_omni.json (Diffusion)
Doc Test:
tests/example/online_serving/test_{model_name}.py
tests/example/offline_inference/test_{model_name}.py
@@ -530,13 +531,13 @@ L4 level testing is a comprehensive quality audit before a version release. It e
### 3.2 Testing Content and Scope
- ***Full Functionality Testing***: Executes all test cases defined in `test_{model_name}_expansion.py`, covering all implemented features, positive flows, boundary conditions, and exception handling.
-- ***Performance Testing***: Uses the `tests/dfx/perf/tests/test.json` configuration file to drive performance testing tools for stress, load, and endurance tests, collecting metrics like throughput, response time, and resource utilization.
+- ***Performance Testing***: Uses `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, and diffusion configs in the form `tests/dfx/perf/tests/test_*_vllm_omni.json` (passed to `run_benchmark.py` via `--test-config-file`) to drive performance testing tools for stress, load, and endurance tests, collecting metrics like throughput, response time, and resource utilization.
- ***Documentation Testing***: Verifies whether the example code provided to users is runnable and its results match the description.
### 3.3 Test Directory and Execution Files
- ***Functional Testing***: Same directories as L3.
-- ***Performance Test Configuration***: `tests/dfx/perf/tests/test.json`
+- ***Performance Test Configuration***: `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, and diffusion configs `tests/dfx/perf/tests/test_*_vllm_omni.json` (e.g. `test_qwen_image_vllm_omni.json`)
- ***Documentation Example Tests***:
- - `tests/example/online_serving/test_{model_name}.py`
- `tests/example/offline_inference/test_{model_name}.py`
diff --git a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md
index 8093e1459f..f1f3073dc5 100644
--- a/docs/contributing/ci/test_examples/l4_performance_tests.inc.md
+++ b/docs/contributing/ci/test_examples/l4_performance_tests.inc.md
@@ -1,4 +1,4 @@
-When you want to add L4-level ***performance test*** cases, you can refer to the following format for case addition in tests/dfx/perf/tests/test.json:
+When you want to add L4-level ***performance test*** cases, you can refer to the following format for case addition in `tests/dfx/perf/tests/test_qwen_omni.json`, `tests/dfx/perf/tests/test_tts.json`, or diffusion configs such as `tests/dfx/perf/tests/test_*_vllm_omni.json` (selected via `pytest ... run_benchmark.py --test-config-file <path>`):
```JSON
{
diff --git a/docs/contributing/ci/test_guide.md b/docs/contributing/ci/test_guide.md
index 425f24332c..08b2e3b4ea 100644
--- a/docs/contributing/ci/test_guide.md
+++ b/docs/contributing/ci/test_guide.md
@@ -45,7 +45,6 @@ Our test scripts use the pytest framework. First, please use `git clone https://
=== "L3 level & L4 level"
```bash
- cd tests
pytest -s -v -m "advanced_model" --run-level=advanced_model
```
If you only want to run L3 test case, you can use:
@@ -60,9 +59,9 @@ Our test scripts use the pytest framework. First, please use `git clone https://
```bash
pytest -s -v -m "core_model and distributed_cuda and L4" --run-level=core_model
```
- Note: To run performance tests, use:
+ Note: To run performance tests (defaults to ``test_qwen_omni.json``; use ``--test-config-file tests/dfx/perf/tests/test_tts.json`` for TTS):
```bash
- pytest -s -v perf/scripts/run_benchmark.py
+ pytest -s -v tests/dfx/perf/scripts/run_benchmark.py
```
The latest L3 test commands for various test suites can be found in the [pipeline](https://github.com/vllm-project/vllm-omni/blob/main/.buildkite/test-merge.yml).
diff --git a/tests/dfx/conftest.py b/tests/dfx/conftest.py
index e54141b344..997f25e6e5 100644
--- a/tests/dfx/conftest.py
+++ b/tests/dfx/conftest.py
@@ -2,6 +2,8 @@
from pathlib import Path
from typing import Any
+import pytest
+
from tests.conftest import modify_stage_config
@@ -95,3 +97,13 @@ def create_benchmark_indices(
indices.append((test_name, idx))
return indices
+
+
+def pytest_addoption(parser: pytest.Parser) -> None:
+ """Register shared CLI options for DFX benchmark suites."""
+ parser.addoption(
+ "--test-config-file",
+ action="store",
+ default=None,
+ help=("Path to benchmark config JSON. Example: --test-config-file tests/dfx/perf/tests/test_tts.json"),
+ )
diff --git a/tests/dfx/perf/scripts/run_benchmark.py b/tests/dfx/perf/scripts/run_benchmark.py
index 67dedcd048..d5ef1b49e7 100644
--- a/tests/dfx/perf/scripts/run_benchmark.py
+++ b/tests/dfx/perf/scripts/run_benchmark.py
@@ -21,10 +21,30 @@
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"
-CONFIG_FILE_PATH = str(Path(__file__).parent.parent / "tests" / "test.json")
-BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH)
-STAGE_INIT_TIMEOUT = 600
+def _get_config_file_from_argv() -> str | None:
+ """Read ``--test-config-file`` from ``sys.argv`` at import time so parametrization can use it."""
+ import sys
+
+ for i, arg in enumerate(sys.argv):
+ if arg == "--test-config-file" and i + 1 < len(sys.argv):
+ return sys.argv[i + 1]
+ if arg.startswith("--test-config-file="):
+ return arg.split("=", 1)[1]
+ return None
+
+
+_PERF_TESTS_DIR = Path(__file__).resolve().parent.parent / "tests"
+_DEFAULT_CONFIG_FILE = str(_PERF_TESTS_DIR / "test_qwen_omni.json")
+
+CONFIG_FILE_PATH = _get_config_file_from_argv()
+if CONFIG_FILE_PATH is None:
+ print(
+ "No --test-config-file in argv, using default: tests/dfx/perf/tests/test_qwen_omni.json "
+ "(override with e.g. --test-config-file tests/dfx/perf/tests/test_tts.json)"
+ )
+ CONFIG_FILE_PATH = _DEFAULT_CONFIG_FILE
+BENCHMARK_CONFIGS = load_configs(CONFIG_FILE_PATH)
STAGE_CONFIGS_DIR = Path(__file__).parent.parent / "stage_configs"
test_params = create_unique_server_params(BENCHMARK_CONFIGS, STAGE_CONFIGS_DIR)
@@ -44,7 +64,7 @@ def omni_server(request):
print(f"Starting OmniServer with test: {test_name}, model: {model}")
- server_args = ["--stage-init-timeout", str(STAGE_INIT_TIMEOUT), "--init-timeout", "900"]
+ server_args = ["--stage-init-timeout", "300", "--init-timeout", "900"]
if stage_config_path:
server_args = ["--stage-configs-path", stage_config_path] + server_args
with OmniServer(model, server_args) as server:
@@ -97,8 +117,6 @@ def run_benchmark(
["vllm", "bench", "serve", "--omni"]
+ args
+ [
- "--num-warmups",
- "2",
"--save-result",
"--result-dir",
os.environ.get("BENCHMARK_DIR", "tests"),
@@ -141,7 +159,6 @@ def run_benchmark(
result["random_output_len"] = random_output_len
with open(result_path, "w", encoding="utf-8") as f:
json.dump(result, f, ensure_ascii=False, indent=2)
-
return result
@@ -207,10 +224,6 @@ def _resolve_baseline_value(
f"or request_rate={request_rate!r}; keys={list(baseline_raw.keys())!r}"
)
if isinstance(baseline_raw, (list, tuple)):
- if sweep_index is None:
- raise ValueError("list baseline requires sweep_index")
- if not (0 <= sweep_index < len(baseline_raw)):
- raise IndexError(f"baseline list len={len(baseline_raw)} has no index {sweep_index}")
return baseline_raw[sweep_index]
return baseline_raw
@@ -245,14 +258,14 @@ def assert_result(
) -> None:
assert result["completed"] == num_prompt, "Request failures exist"
baseline_data = params.get("baseline", {})
- thresholds = _baseline_thresholds_for_step(
- baseline_data,
- sweep_index=sweep_index,
- max_concurrency=max_concurrency,
- request_rate=request_rate,
- )
- for metric_name, baseline_value in thresholds.items():
+ for metric_name, baseline_raw in baseline_data.items():
current_value = result[metric_name]
+ baseline_value = _resolve_baseline_value(
+ baseline_raw,
+ sweep_index=sweep_index,
+ max_concurrency=max_concurrency,
+ request_rate=request_rate,
+ )
if "throughput" in metric_name:
if current_value <= baseline_value:
print(
diff --git a/tests/dfx/perf/scripts/run_diffusion_benchmark.py b/tests/dfx/perf/scripts/run_diffusion_benchmark.py
index 123f21405e..8eeeec8df2 100644
--- a/tests/dfx/perf/scripts/run_diffusion_benchmark.py
+++ b/tests/dfx/perf/scripts/run_diffusion_benchmark.py
@@ -5,8 +5,8 @@
- vllm-omni (default): starts DiffusionServer via vllm_omni.entrypoints.cli.main,
benchmarks with diffusion_benchmark_serving.py --backend vllm-omni
-A config JSON file is REQUIRED via --config-file:
- pytest run_diffusion_benchmark.py --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json
+A config JSON file is REQUIRED via --test-config-file:
+ pytest run_diffusion_benchmark.py --test-config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json
JSON config entries use a "server_type" field, and this runner executes
the vllm-omni path.
@@ -55,16 +55,16 @@
def _get_config_file_from_argv() -> str | None:
- """Read --config-file from sys.argv at import time so pytest parametrize can use it.
+ """Read --test-config-file from sys.argv at import time so pytest parametrize can use it.
pytest_addoption (below) registers the same flag so pytest does not reject it.
- Supports both ``--config-file path`` and ``--config-file=path`` forms.
+ Supports both ``--test-config-file path`` and ``--test-config-file=path`` forms.
Returns None if the flag is not present; callers must handle the missing case.
"""
for i, arg in enumerate(sys.argv):
- if arg == "--config-file" and i + 1 < len(sys.argv):
+ if arg == "--test-config-file" and i + 1 < len(sys.argv):
return sys.argv[i + 1]
- if arg.startswith("--config-file="):
+ if arg.startswith("--test-config-file="):
return arg.split("=", 1)[1]
return None
@@ -133,19 +133,6 @@ def _append_to_aggregated_file(record: dict[str, Any]) -> None:
json.dump(records, f, indent=2, ensure_ascii=False)
-# Register --config-file with pytest so it does not reject the argument.
-def pytest_addoption(parser: pytest.Parser) -> None:
- parser.addoption(
- "--config-file",
- action="store",
- default=None,
- help=(
- "Path to the benchmark config JSON file (required). "
- "Example: --config-file tests/dfx/perf/tests/test_qwen_image_vllm_omni.json"
- ),
- )
-
-
_server_lock = threading.Lock()
# ---------------------------------------------------------------------------
diff --git a/tests/dfx/perf/tests/test.json b/tests/dfx/perf/tests/test_qwen_omni.json
similarity index 92%
rename from tests/dfx/perf/tests/test.json
rename to tests/dfx/perf/tests/test_qwen_omni.json
index 159e27a064..4662f8c0c7 100644
--- a/tests/dfx/perf/tests/test.json
+++ b/tests/dfx/perf/tests/test_qwen_omni.json
@@ -329,37 +329,5 @@
}
}
]
- },
- {
- "test_name": "test_qwen3_tts",
- "server_params": {
- "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
- },
- "benchmark_params": [
- {
- "dataset_name": "random",
- "backend": "openai-audio-speech",
- "endpoint": "/v1/audio/speech",
- "num_prompts": [
- 10,
- 40
- ],
- "max_concurrency": [
- 1,
- 4
- ],
- "random_input_len": 100,
- "random_output_len": 100,
- "extra_body": {
- "voice": "Vivian",
- "language": "English"
- },
- "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration",
- "baseline": {
- "mean_audio_ttfp_ms": [6000, 6000],
- "mean_audio_rtf": [0.3, 0.3]
- }
- }
- ]
}
]
diff --git a/tests/dfx/perf/tests/test_tts.json b/tests/dfx/perf/tests/test_tts.json
new file mode 100644
index 0000000000..3583b45b4f
--- /dev/null
+++ b/tests/dfx/perf/tests/test_tts.json
@@ -0,0 +1,34 @@
+[
+ {
+ "test_name": "test_qwen3_tts",
+ "server_params": {
+ "model": "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
+ },
+ "benchmark_params": [
+ {
+ "dataset_name": "random",
+ "backend": "openai-audio-speech",
+ "endpoint": "/v1/audio/speech",
+ "num_prompts": [
+ 10,
+ 40
+ ],
+ "max_concurrency": [
+ 1,
+ 4
+ ],
+ "random_input_len": 100,
+ "random_output_len": 100,
+ "extra_body": {
+ "voice": "Vivian",
+ "language": "English"
+ },
+ "percentile-metrics": "ttft,e2el,audio_rtf,audio_ttfp,audio_duration",
+ "baseline": {
+ "mean_audio_ttfp_ms": [6000, 6000],
+ "mean_audio_rtf": [0.3, 0.3]
+ }
+ }
+ ]
+ }
+]