yinpeiqi · yinpeiqi · Apr 22, 2026 · Apr 20, 2026 · Apr 20, 2026 · Apr 20, 2026
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -64,12 +64,13 @@ while true; do
 done
 
 echo "--- Pulling container"
-## Temporary change to use AMD Docker Hub to store the vllm-ci image
+## Temporary change to use AMD Docker Hub to store the vllm-omni image
 # to bypass the rate limit issue with ECR Public Gallery.
+# Images now live in the dedicated rocm/vllm-omni repository instead of rocm/vllm-ci.
 # TODO: @tjtanaa point back to ECR Public Gallery
 # once the amd agents are configured to use ECR Public Gallery.
 # image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-rocm-omni"
-image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}-rocm-omni"
+image_name="rocm/vllm-omni:${BUILDKITE_COMMIT}"
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 
 # TODO: @tjtanaa uncomment this once the amd agents are configured to use ECR Public Gallery.

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
@@ -117,3 +117,17 @@ steps:
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -s -v tests/e2e/online_serving/test_image_gen_edit.py
+
+
+- label: "Omni Sleep Mode Test"
+  timeout_in_minutes: 40
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdproduction]
+  grade: Blocking
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+    - pytest -s -v tests/e2e/offline_inference/test_omni_sleep_mode.py -m "advanced_model and omni and MI325" --run-level "advanced_model"
diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml
@@ -360,6 +360,46 @@ steps:
                   path: /mnt/hf-cache
                   type: DirectoryOrCreate
 
+  - label: "Omni Sleep Mode Test with H100"
+    timeout_in_minutes: 30
+    depends_on: upload-merge-pipeline
+    commands:
+      - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+      - pytest -s -v tests/e2e/offline_inference/test_omni_sleep_mode.py -m "advanced_model and H100 and omni" --run-level "advanced_model"
+    agents:
+      queue: "mithril-h100-pool"
+    plugins:
+      - kubernetes:
+          podSpec:
+            containers:
+              - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                resources:
+                  limits:
+                    nvidia.com/gpu: 2
+                volumeMounts:
+                  - name: devshm
+                    mountPath: /dev/shm
+                  - name: hf-cache
+                    mountPath: /root/.cache/huggingface
+                env:
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+            nodeSelector:
+              node.kubernetes.io/instance-type: gpu-h100-sxm
+            volumes:
+              - name: devshm
+                emptyDir:
+                  medium: Memory
+              - name: hf-cache
+                hostPath:
+                  path: /mnt/hf-cache
+                  type: DirectoryOrCreate
+
   - label: "Voxtral-TTS E2E Test"
     timeout_in_minutes: 20
     depends_on: upload-merge-pipeline

diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
@@ -151,6 +151,44 @@ steps:
                       type: DirectoryOrCreate
 
 
+      - label: ":full_moon: Omni Multi-Replica Startup Test with 4x H100"
+        timeout_in_minutes: 45
+        commands:
+          - pytest -s -v tests/e2e/online_serving/test_qwen3_omni_multi_replicas.py -m "core_model" --run-level "core_model"
+        agents:
+          queue: "mithril-h100-pool"
+        plugins:
+          - kubernetes:
+              podSpec:
+                containers:
+                  - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                    resources:
+                      limits:
+                        nvidia.com/gpu: 4
+                    volumeMounts:
+                      - name: devshm
+                        mountPath: /dev/shm
+                      - name: hf-cache
+                        mountPath: /root/.cache/huggingface
+                    env:
+                      - name: HF_HOME
+                        value: /root/.cache/huggingface
+                      - name: HF_TOKEN
+                        valueFrom:
+                          secretKeyRef:
+                            name: hf-token-secret
+                            key: token
+                nodeSelector:
+                  node.kubernetes.io/instance-type: gpu-h100-sxm
+                volumes:
+                  - name: devshm
+                    emptyDir:
+                      medium: Memory
+                  - name: hf-cache
+                    hostPath:
+                      path: /mnt/hf-cache
+                      type: DirectoryOrCreate
+
   - group: ":card_index_dividers: TTS Model Test"
     key: nightly-tts-test-group
     depends_on: upload-nightly-pipeline

diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2
@@ -3,7 +3,7 @@
    Last synced: 2025-12-15
    Modifications: Removed unused CUDA/NVIDIA logic, keeping only AMD tests
 #}
-{% set docker_image_amd = "rocm/vllm-ci:$BUILDKITE_COMMIT-rocm-omni" %}
+{% set docker_image_amd = "rocm/vllm-omni:$BUILDKITE_COMMIT" %}
 {% set default_working_dir = "/app/vllm-omni" %}
 
   - group: "AMD Tests"

diff --git a/.gitignore b/.gitignore
@@ -174,6 +174,7 @@ CLAUDE.md
 
 # Codex
 AGENTS.md
+.codex
 .codex/
 
 # cursor

diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
@@ -138,9 +138,27 @@ Single-stage diffusion serving with torch profiler:
 vllm serve Wan-AI/Wan2.2-I2V-A14B-Diffusers \
   --omni \
   --port 8091 \
-  --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}'
+  --profiler-config '{
+    "profiler": "torch",
+    "torch_profiler_dir": "/tmp/vllm_profile_wan22_i2v",
+    "torch_profiler_with_stack": true,
+    "torch_profiler_with_flops": false,
+    "torch_profiler_use_gzip": true,
+    "torch_profiler_dump_cuda_time_total": true,
+    "torch_profiler_record_shapes": true,
+    "torch_profiler_with_memory": true,
+    "delay_iterations": 0,
+    "max_iterations": 0
+  }'
 ```
 
+Useful optional fields:
+
+- `torch_profiler_with_stack`: export `by_stack` operator views and the stack text files (`stacks_cpu_rank*.txt`, `stacks_cuda_rank*.txt`).
+- `torch_profiler_record_shapes`: export `by_shape` operator views.
+- `torch_profiler_with_memory`: dump `memory_snapshot_rank*.pickle` when the backend supports memory history.
+- `torch_profiler_use_gzip`: write the trace as `trace_rank*.json.gz`.
+
 Single-stage diffusion serving with Nsight Systems:
 
 ```bash
@@ -177,8 +195,11 @@ For mixed-stage pipelines, use explicit `stages` and pass the same stage list to
 
 Torch profiler output:
 
-- Chrome/Perfetto traces under `torch_profiler_dir`
-- Optional aggregated CUDA-time tables under the same directory
+- Chrome/Perfetto trace: `trace_rank*.json` or `trace_rank*.json.gz`
+- Excel workbook: `ops_rank*.xlsx` with a `summary` sheet and optional `by_shape` / `by_stack` sheets
+- Stack exports: `stacks_cpu_rank*.txt` and `stacks_cuda_rank*.txt` when stack capture is enabled
+- Memory snapshot: `memory_snapshot_rank*.pickle` when memory capture is enabled and supported by the backend
+- Optional aggregated CUDA-time tables under the same session directory
 
 CUDA profiler / Nsight Systems output:
-Original file line number
+Diff line change
@@ Expand Up / @@ -174,6 +174,7 @@ CLAUDE.md @@
     # Codex
     AGENTS.md
+    .codex
     .codex/
     # cursor
@@ Expand Down @@