From 4e4a0c68d8b898a1bcc64648fc39c2d02dd532ae Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Thu, 11 Dec 2025 09:36:18 +0000
Subject: [PATCH 01/32] add Dockerfile.rocm

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 docker/Dockerfile.rocm | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 docker/Dockerfile.rocm

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
new file mode 100644
index 00000000000..6e774135e26
--- /dev/null
+++ b/docker/Dockerfile.rocm
@@ -0,0 +1,41 @@
+ARG ROCM_BASE_IMAGE=rocm/vllm-dev
+ARG ROCM_BASE_TAG=nightly_main_20251005
+FROM ${ROCM_BASE_IMAGE}:${ROCM_BASE_TAG}
+
+ARG APP_DIR=/workspace/vllm-omni
+ARG VLLM_VERSION=v0.11.0
+ARG PYTORCH_ROCM_ARCH="gfx942;gfx950"
+
+WORKDIR ${APP_DIR}
+
+# Step 1: Setup - Install system dependencies
+RUN apt-get update && \
+    apt-get install -y ffmpeg && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Step 2: Reinstall vllm from source (cloned next to APP_DIR; .git is dropped afterwards to slim the layer)
+RUN cd ../ && python3 -m pip uninstall -y vllm && \
+    git clone https://github.com/vllm-project/vllm.git && \
+    cd vllm && \
+    git checkout ${VLLM_VERSION} && \
+    python3 -c "import setuptools_scm; print(setuptools_scm.get_version())" && \
+    PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} python3 setup.py develop && \
+    cd .. && \
+    rm -rf vllm/.git
+
+# Step 3: Copy vllm-omni code and install without uv
+COPY . ${APP_DIR}
+RUN python3 -m pip install --no-cache-dir ".[dev]"
+
+# Create python symlink
+RUN ln -sf /usr/bin/python3 /usr/bin/python
+
+# Step 4: Set environment variables for ROCm optimization
+ENV MIOPEN_FIND_MODE=FAST
+ENV VLLM_ROCM_USE_AITER=1
+ENV VLLM_ROCM_USE_AITER_MHA=1
+ENV VLLM_ROCM_USE_AITER_LINEAR=0
+ENV VLLM_ROCM_USE_AITER_RMSNORM=0
+
+ENTRYPOINT []
\ No newline at end of file

From 2b7ff410af666cbdf001df5f71a3e58ea1e92f1f Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Thu, 11 Dec 2025 10:05:08 +0000
Subject: [PATCH 02/32] add dockerfile build instruction

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 docs/getting_started/installation/gpu.md      |  9 +++--
 .../installation/gpu/rocm.inc.md              | 38 +++++++++++++++++--
 2 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md
index 5956ed102de..03758fd9354 100644
--- a/docs/getting_started/installation/gpu.md
+++ b/docs/getting_started/installation/gpu.md
@@ -49,11 +49,14 @@ vLLM-Omni is a Python library that supports the following GPU variants. The libr
 
 ## Set up using Docker
 
-### Build wheel from source
 
-=== "NVIDIA CUDA"
+### Build your own docker image
+
+=== "AMD ROCm"
+
+    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-docker"
 
-    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source-in-docker"
+### Build wheel from source
 
 === "AMD ROCm"
 
diff --git a/docs/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md
index 3b970267a28..c925dece380 100644
--- a/docs/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@@ -69,9 +69,6 @@ python -c "import setuptools_scm; print(setuptools_scm.get_version())"
 PYTORCH_ROCM_ARCH=gfx942 python3 setup.py develop
 ```
 
-!!! note
-    vLLM release wheels based on the branch with prefix `releases/`, not from the tag as vLLM may cherry pick bugfixes after cutting a branch.
-
 
 #### Installation of vLLM-Omni
 
@@ -110,6 +107,41 @@ export VLLM_ROCM_USE_AITER_RMSNORM=0
 
 # --8<-- [end:build-wheel-from-source-in-docker]
 
+# --8<-- [start:build-docker]
+
+#### Build docker image
+
+```bash
+DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-omni-rocm .
+```
+
+To cut down build time, specify which GPU architectures to build for:
+
+```bash
+DOCKER_BUILDKIT=1 docker build \
+  -f docker/Dockerfile.rocm \
+  --build-arg PYTORCH_ROCM_ARCH="gfx942;gfx950" \
+  -t vllm-omni-rocm .
+```
+
+#### Launch the docker image
+
+```bash
+docker run -it \
+--network=host \
+--group-add=video \
+--ipc=host \
+--cap-add=SYS_PTRACE \
+--security-opt seccomp=unconfined \
+--device /dev/kfd \
+--device /dev/dri \
+-v <path/to/model>:/app/model \
+vllm-omni-rocm \
+bash
+```
+
+# --8<-- [end:build-docker]
+
 # --8<-- [start:pre-built-images]
 
 # --8<-- [end:pre-built-images]

From b03d282a85e0cf6f1a1428333654c2edae349ca9 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Thu, 11 Dec 2025 14:50:00 +0000
Subject: [PATCH 03/32] add preliminary CI files

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/bootstrap-omni-amd.sh              | 677 ++++++++++++++++++
 .../scripts/hardware_ci/run-amd-test.sh       | 113 +++
 .buildkite/test-amd.yaml                      |  34 +
 3 files changed, 824 insertions(+)
 create mode 100755 .buildkite/bootstrap-omni-amd.sh
 create mode 100755 .buildkite/scripts/hardware_ci/run-amd-test.sh
 create mode 100644 .buildkite/test-amd.yaml

diff --git a/.buildkite/bootstrap-omni-amd.sh b/.buildkite/bootstrap-omni-amd.sh
new file mode 100755
index 00000000000..724050cd477
--- /dev/null
+++ b/.buildkite/bootstrap-omni-amd.sh
@@ -0,0 +1,677 @@
+#!/bin/bash
+# vLLM-Omni AMD CI Bootstrap
+# Intelligent CI orchestration following vLLM's ci-infra approach
+#
+# Features:
+# - Smart change detection (docs-only skip, critical files)
+# - Pure bash YAML parsing
+# - Test filtering by source_file_dependencies and mirror_hardwares
+# - GitHub PR label support (ready-run-all-tests, ci-no-fail-fast)
+# - Dynamic Buildkite pipeline generation
+
+set -euo pipefail
+
+#==============================================================================
+# SECTION 1: INITIALIZATION & ENVIRONMENT DETECTION
+#==============================================================================
+
+# Enable debugging if requested
+DEBUG="${VLLM_CI_DEBUG:-0}"
+[[ "$DEBUG" == "1" ]] && set -x
+
+echo "=== vLLM-Omni AMD CI Bootstrap ==="
+echo "Timestamp: $(date -u +"%Y-%m-%d %H:%M:%S UTC")"
+echo "Branch: ${BUILDKITE_BRANCH:-unknown}"
+echo "Commit: ${BUILDKITE_COMMIT:-unknown}"
+echo "Pull Request: ${BUILDKITE_PULL_REQUEST:-none}"
+echo ""
+
+# Validate environment
+if [ ! -d ".buildkite" ]; then
+    echo "Error: .buildkite directory not found"
+    echo "Please run this script from the repository root"
+    exit 1
+fi
+
+if [ ! -f ".buildkite/test-amd.yaml" ]; then
+    echo "Error: .buildkite/test-amd.yaml not found"
+    exit 1
+fi
+
+# Validate git repository
+if ! git rev-parse --git-dir > /dev/null 2>&1; then
+    echo "Error: Not a git repository"
+    exit 1
+fi
+
+# Determine base branch for comparison
+if [[ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]]; then
+    BASE_BRANCH="${BUILDKITE_PULL_REQUEST_BASE_BRANCH:-main}"
+else
+    BASE_BRANCH="main"
+fi
+
+echo "Base branch for comparison: ${BASE_BRANCH}"
+echo ""
+
+#==============================================================================
+# SECTION 2: GITHUB LABEL CHECKING
+#==============================================================================
+
+# Function: Check GitHub PR labels
+check_github_labels() {
+    local pr_number="$1"
+    local repo_full_name api_url response labels
+    # Sets globals RUN_ALL_TESTS / NO_FAIL_FAST to 1 when the PR carries the matching label; returns 1 if labels are unavailable
+    # Derive owner/repo from the origin URL (https or ssh form), dropping a trailing ".git" or "/"
+    repo_full_name=$(git remote get-url origin 2>/dev/null | sed -E 's#^.*github\.com[:/]##; s#(\.git)?/?$##' || true)
+
+    if [[ ! "$repo_full_name" =~ ^[^/]+/[^/]+$ ]]; then
+        echo "Warning: Could not determine GitHub repository"
+        return 1
+    fi
+
+    echo "--- Checking GitHub PR labels"
+    echo "Repository: ${repo_full_name}"
+    echo "PR Number: ${pr_number}"
+
+    # Try to fetch labels via GitHub API (no auth needed for public repos)
+    if command -v curl >/dev/null 2>&1; then
+        api_url="https://api.github.com/repos/${repo_full_name}/pulls/${pr_number}"
+        response=$(curl -s -f "${api_url}" 2>/dev/null || echo "")
+
+        if [[ -n "$response" ]]; then
+            # Extract "name" values; the API pretty-prints JSON ("name": "x"), so allow whitespace around the colon
+            labels=$(echo "$response" | grep -oE '"name"[[:space:]]*:[[:space:]]*"[^"]*"' | cut -d'"' -f4 | tr '\n' ',' || true)
+
+            # Check for specific labels
+            if [[ "$labels" == *"ready-run-all-tests"* ]]; then
+                RUN_ALL_TESTS=1
+                echo "✓ Found label: ready-run-all-tests"
+            fi
+
+            if [[ "$labels" == *"ci-no-fail-fast"* ]]; then
+                NO_FAIL_FAST=1
+                echo "✓ Found label: ci-no-fail-fast"
+            fi
+
+            [[ "$RUN_ALL_TESTS" == "0" ]] && echo "  No run-all-tests label"
+            [[ "$NO_FAIL_FAST" == "0" ]] && echo "  No fail-fast override label"
+            return 0
+        fi
+    fi
+
+    echo "Warning: Could not fetch GitHub labels (API unavailable or request failed)"
+    return 1
+}
+
+# Initialize flags
+RUN_ALL_TESTS=0
+NO_FAIL_FAST=0
+
+# Check labels if this is a PR
+if [[ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]] && [[ "${BUILDKITE_PULL_REQUEST}" != "" ]]; then
+    check_github_labels "${BUILDKITE_PULL_REQUEST}" || echo "Continuing without label information"
+    echo ""
+fi
+
+#==============================================================================
+# SECTION 3: CHANGE DETECTION & ANALYSIS
+#==============================================================================
+
+# Function: Return 0 if every path in the newline-separated list ($1) matches a docs pattern
+is_docs_only_change() {
+    local changed_files="$1"
+
+    # Docs-related patterns (bash regexes, tested with =~ against each path)
+    local docs_patterns=(
+        "^docs/"
+        "^README"
+        "\\.md$"
+        "\\.rst$"
+        "^LICENSE"
+        "^CONTRIBUTING"
+    )
+
+    local has_non_docs=0
+
+    while IFS= read -r file; do
+        [[ -z "$file" ]] && continue
+
+        local is_docs=0
+        for pattern in "${docs_patterns[@]}"; do
+            if [[ "$file" =~ $pattern ]]; then
+                is_docs=1
+                break
+            fi
+        done
+
+        if [[ "$is_docs" == "0" ]]; then
+            # Found non-docs file: no need to inspect the remaining paths
+            has_non_docs=1
+            break
+        fi
+    done <<< "$changed_files"
+
+    # Return 0 (success) if all files are docs, 1 otherwise; an empty list also returns 0
+    [[ "$has_non_docs" == "0" ]]
+}
+
+# Function: Return 0 (and print the path) at the first path in $1 matching a critical pattern, else 1
+has_critical_file_changes() {
+    local changed_files="$1"
+
+    # Critical patterns (bash regexes, tested with =~); the caller runs ALL tests on a match
+    local critical_patterns=(
+        "^docker/Dockerfile"
+        "^requirements.*\\.txt$"
+        "^setup\\.py$"
+        "^pyproject\\.toml$"
+        "^\\.buildkite/bootstrap-omni-amd\\.sh$"
+        "^\\.buildkite/test-amd\\.yaml$"
+        "^\\.buildkite/scripts/hardware_ci/run-amd-test\\.sh$"
+    )
+
+    while IFS= read -r file; do
+        [[ -z "$file" ]] && continue
+
+        for pattern in "${critical_patterns[@]}"; do
+            if [[ "$file" =~ $pattern ]]; then
+                echo "  Critical file changed: $file"
+                return 0
+            fi
+        done
+    done <<< "$changed_files"
+
+    return 1
+}
+
+echo "--- Analyzing changed files"
+
+# Get list of changed files
+CHANGED_FILES=""
+if git rev-parse "origin/${BASE_BRANCH}" >/dev/null 2>&1; then
+    # Fetch latest base branch
+    echo "Fetching origin/${BASE_BRANCH}..."
+    git fetch origin "${BASE_BRANCH}" >/dev/null 2>&1 || true
+
+    # Get changed files between base and current commit
+    CHANGED_FILES=$(git diff --name-only "origin/${BASE_BRANCH}...${BUILDKITE_COMMIT}" 2>/dev/null || \
+                    git diff --name-only "origin/${BASE_BRANCH}" "${BUILDKITE_COMMIT}" 2>/dev/null || \
+                    echo "")
+else
+    echo "Warning: Could not find base branch ${BASE_BRANCH}"
+    echo "Will run all tests as a safety measure"
+    RUN_ALL_TESTS=1
+fi
+
+# Count changed files
+CHANGED_FILE_COUNT=$(echo "$CHANGED_FILES" | grep -c . || echo "0")
+echo "Changed files: ${CHANGED_FILE_COUNT}"
+
+# Debug: Show changed files if in debug mode
+if [[ "$DEBUG" == "1" ]] && [[ -n "$CHANGED_FILES" ]]; then
+    echo "Changed files list:"
+    echo "$CHANGED_FILES" | head -20
+    [[ "$CHANGED_FILE_COUNT" -gt 20 ]] && echo "... (${CHANGED_FILE_COUNT} total files)"
+    echo ""
+fi
+
+# Check for docs-only changes (early exit optimization)
+if [[ "$CHANGED_FILE_COUNT" -gt 0 ]] && is_docs_only_change "$CHANGED_FILES"; then
+    echo ""
+    echo "=== Documentation-only changes detected ==="
+    echo "Skipping CI tests to save resources"
+    echo ""
+
+    # Generate minimal pipeline
+    cat > .buildkite/pipeline.yaml << 'EOF'
+steps:
+  - label: ":memo: Docs-only change"
+    command: echo "Only documentation changed, skipping tests"
+    agents:
+      queue: cpu_queue_premerge
+EOF
+
+    echo "--- Generated minimal pipeline:"
+    cat .buildkite/pipeline.yaml
+    echo ""
+
+    echo "--- Uploading pipeline to Buildkite"
+    buildkite-agent pipeline upload .buildkite/pipeline.yaml
+
+    echo "=== Bootstrap Complete (docs-only) ==="
+    exit 0
+fi
+
+# Check for critical file changes
+if has_critical_file_changes "$CHANGED_FILES"; then
+    echo "  → Critical files detected: Will run ALL tests"
+    RUN_ALL_TESTS=1
+fi
+
+echo ""
+
+#==============================================================================
+# SECTION 4: YAML PARSING (Pure Bash)
+#==============================================================================
+
+echo "--- Parsing test-amd.yaml"
+
+# Parse test-amd.yaml into structured variables
+# Variables will be named: STEP_<N>_<FIELD>
+# Arrays/lists use || as delimiter
+
+TOTAL_STEPS=0
+declare -A STEP_DATA
+
+parse_test_yaml() {
+    local yaml_file="$1"
+    # Line-oriented parser for the YAML subset used by the test file; fills STEP_<N>_<FIELD> globals and TOTAL_STEPS
+    local step_num=0
+    local in_step=0
+    local current_section=""
+    local list_indent=0
+
+    while IFS= read -r raw_line; do
+        # Keep the raw line intact: leading whitespace is needed to measure indentation
+        local line="$raw_line"
+
+        # Skip comment-only lines and lines that are empty or contain only spaces
+        [[ "$line" =~ ^[[:space:]]*# ]] && continue
+        [[ -z "${line// /}" ]] && continue
+
+        # Indentation level = number of leading whitespace characters
+        local indent=$(echo "$line" | sed -E 's/^([[:space:]]*).*/\1/' | wc -c)
+        indent=$((indent - 1))  # wc -c also counts the newline appended by echo
+
+        # Detect new step (a list item starting with "- label:"); one surrounding quote is stripped per side
+        if [[ "$line" =~ ^[[:space:]]*-[[:space:]]+label:[[:space:]]*(.+)$ ]]; then
+            step_num=$((step_num + 1))
+            in_step=1
+            local label="${BASH_REMATCH[1]}"
+            label=$(echo "$label" | sed 's/^["'"'"']//' | sed 's/["'"'"']$//')
+
+            eval "STEP_${step_num}_LABEL=\"\$label\""
+            eval "STEP_${step_num}_KEY=\"step-${step_num}\""
+
+            echo "  Step ${step_num}: ${label}"
+            current_section=""
+            continue
+        fi
+
+        [[ "$in_step" == "0" ]] && continue
+
+        # Parse indented "key: value" lines (keys limited to lowercase letters and underscores)
+        if [[ "$line" =~ ^[[:space:]]+([a-z_]+):[[:space:]]*(.*)$ ]]; then
+            local key="${BASH_REMATCH[1]}"
+            local value="${BASH_REMATCH[2]}"
+
+            # Clean value: drop one leading quote/bracket and any run of trailing quotes/brackets
+            value=$(echo "$value" | sed 's/^["'"'"'\[]//' | sed 's/["'"'"'\]]*$//')
+
+            case "$key" in
+                key)
+                    eval "STEP_${step_num}_KEY=\"\$value\""
+                    ;;
+                mirror_hardwares)
+                    # Inline array, e.g. [amdexperimental, amdproduction]; stored space-separated
+                    value=$(echo "$value" | tr ',' ' ' | xargs)
+                    eval "STEP_${step_num}_MIRROR_HARDWARES=\"\$value\""
+                    ;;
+                agent_pool)
+                    eval "STEP_${step_num}_AGENT_POOL=\"\$value\""
+                    ;;
+                timeout_in_minutes)
+                    eval "STEP_${step_num}_TIMEOUT=\"\$value\""
+                    ;;
+                fast_check)
+                    eval "STEP_${step_num}_FAST_CHECK=\"\$value\""
+                    ;;
+                working_dir)
+                    eval "STEP_${step_num}_WORKING_DIR=\"\$value\""
+                    ;;
+                queue)
+                    # Part of agents section; matched by key name alone, regardless of nesting
+                    eval "STEP_${step_num}_QUEUE=\"\$value\""
+                    ;;
+                commands)
+                    current_section="commands"
+                    eval "STEP_${step_num}_COMMANDS=\"\""
+                    list_indent=$indent
+                    ;;
+                source_file_dependencies)
+                    current_section="dependencies"
+                    eval "STEP_${step_num}_DEPENDENCIES=\"\""
+                    list_indent=$indent
+                    ;;
+                agents)
+                    current_section="agents"
+                    ;;
+                *)
+                    # Unrecognized key: close the open list section unless this key is indented deeper than it
+                    if [[ "$indent" -le "$list_indent" ]] && [[ -n "$current_section" ]]; then
+                        current_section=""
+                    fi
+                    ;;
+            esac
+        # Parse list items (- item); kept only while inside a commands/dependencies section, joined with "||"
+        elif [[ "$line" =~ ^[[:space:]]*-[[:space:]]+(.+)$ ]]; then
+            local item="${BASH_REMATCH[1]}"
+            item=$(echo "$item" | sed 's/^["'"'"']//' | sed 's/["'"'"']$//')
+
+            if [[ "$current_section" == "commands" ]]; then
+                local current_cmds
+                eval "current_cmds=\"\$STEP_${step_num}_COMMANDS\""
+                if [[ -n "$current_cmds" ]]; then
+                    eval "STEP_${step_num}_COMMANDS=\"\${current_cmds}||\$item\""
+                else
+                    eval "STEP_${step_num}_COMMANDS=\"\$item\""
+                fi
+            elif [[ "$current_section" == "dependencies" ]]; then
+                local current_deps
+                eval "current_deps=\"\$STEP_${step_num}_DEPENDENCIES\""
+                if [[ -n "$current_deps" ]]; then
+                    eval "STEP_${step_num}_DEPENDENCIES=\"\${current_deps}||\$item\""
+                else
+                    eval "STEP_${step_num}_DEPENDENCIES=\"\$item\""
+                fi
+            fi
+        fi
+    done < "$yaml_file"
+
+    TOTAL_STEPS=$step_num
+}
+
+parse_test_yaml ".buildkite/test-amd.yaml"
+
+echo "Parsed ${TOTAL_STEPS} steps from test-amd.yaml"
+
+# Validate parsing
+if [[ "$TOTAL_STEPS" == "0" ]]; then
+    echo "Error: No steps found in test-amd.yaml"
+    exit 1
+fi
+
+echo ""
+
+#==============================================================================
+# SECTION 5: TEST FILTERING & SELECTION
+#==============================================================================
+
+# Function: Check if step should run based on file dependencies
+should_run_step() {
+    local step_num="$1"
+
+    # If RUN_ALL_TESTS is set, always run
+    if [[ "$RUN_ALL_TESTS" == "1" ]]; then
+        return 0
+    fi
+
+    # Get step dependencies ("||"-joined path prefixes stored in STEP_<N>_DEPENDENCIES)
+    local deps_var="STEP_${step_num}_DEPENDENCIES"
+    local dependencies="${!deps_var:-}"
+
+    # If no dependencies specified, always run (catch-all test)
+    if [[ -z "$dependencies" ]]; then
+        return 0
+    fi
+
+    # If no files changed, don't run
+    if [[ -z "$CHANGED_FILES" ]] || [[ "$CHANGED_FILE_COUNT" == "0" ]]; then
+        return 1
+    fi
+
+    # Split on the literal "||" delimiter. IFS='||' would split on every single '|' and
+    # yield an empty field between entries; an empty prefix matches every changed file.
+    local deps_array=()
+    mapfile -t deps_array <<< "${dependencies//||/$'\n'}"
+    local dep_pattern changed_file
+    for dep_pattern in "${deps_array[@]}"; do
+        # Remove trailing slashes for consistency; skip blank entries defensively
+        dep_pattern="${dep_pattern%/}"
+        [[ -z "$dep_pattern" ]] && continue
+
+        while IFS= read -r changed_file; do
+            [[ -z "$changed_file" ]] && continue
+
+            # Prefix match (an exact path is just the degenerate prefix case)
+            if [[ "$changed_file" == "${dep_pattern}"* ]]; then
+                if [[ "$DEBUG" == "1" ]]; then
+                    echo "    Match: ${changed_file} ↔ ${dep_pattern}"
+                fi
+                return 0
+            fi
+        done <<< "$CHANGED_FILES"
+    done
+
+    return 1
+}
+
+# Function: Check if step matches hardware filter
+matches_hardware() {
+    local step_num="$1"
+    local target_hardware="${2:-amdexperimental}"
+
+    # Get mirror_hardwares for this step (space-separated names in STEP_<N>_MIRROR_HARDWARES)
+    local hw_var="STEP_${step_num}_MIRROR_HARDWARES"
+    local hardwares="${!hw_var:-}"
+
+    # If no hardware specified, it's a non-AMD step (CPU/build steps)
+    # These should always be included
+    if [[ -z "$hardwares" ]]; then
+        return 0
+    fi
+
+    # Whole-word match: pad both sides with spaces so one name cannot match inside a longer one
+    if [[ " $hardwares " == *" $target_hardware "* ]]; then
+        return 0
+    fi
+
+    return 1
+}
+
+echo "--- Filtering tests"
+echo "Filter criteria:"
+echo "  Target hardware: amdexperimental"
+echo "  RUN_ALL_TESTS: ${RUN_ALL_TESTS}"
+echo "  Changed files: ${CHANGED_FILE_COUNT}"
+echo ""
+
+# Build list of steps to include
+INCLUDED_STEPS=()
+
+for ((i=1; i<=TOTAL_STEPS; i++)); do
+    local label_var="STEP_${i}_LABEL"
+    local label="${!label_var:-Step $i}"
+
+    # Check hardware match
+    if ! matches_hardware "$i" "amdexperimental"; then
+        echo "  Step ${i} (${label}): ✗ Wrong hardware - SKIP"
+        continue
+    fi
+
+    # Check file dependencies
+    if should_run_step "$i"; then
+        echo "  Step ${i} (${label}): ✓ INCLUDE"
+        INCLUDED_STEPS+=("$i")
+    else
+        echo "  Step ${i} (${label}): ✗ No matching changes - SKIP"
+    fi
+done
+
+echo ""
+echo "Selected ${#INCLUDED_STEPS[@]} of ${TOTAL_STEPS} steps"
+
+# Ensure at least build step (step 1) is included
+if [[ "${#INCLUDED_STEPS[@]}" == "0" ]]; then
+    echo "Warning: No tests selected, including build step as fallback"
+    INCLUDED_STEPS=(1)
+fi
+
+echo ""
+
+#==============================================================================
+# SECTION 6: PIPELINE GENERATION
+#==============================================================================
+
+echo "--- Generating Buildkite pipeline"
+
+# Start pipeline file
+cat > .buildkite/pipeline.yaml << 'PIPELINE_HEADER'
+# Auto-generated by bootstrap-omni-amd.sh
+# DO NOT EDIT MANUALLY - Edit .buildkite/test-amd.yaml instead
+#
+# Generated:
+PIPELINE_HEADER
+
+echo "# $(date -u)" >> .buildkite/pipeline.yaml
+echo "" >> .buildkite/pipeline.yaml
+echo "steps:" >> .buildkite/pipeline.yaml
+
+# Generate steps
+for step_num in "${INCLUDED_STEPS[@]}"; do
+    # Get step data
+    local label_var="STEP_${step_num}_LABEL"
+    local label="${!label_var}"
+
+    local key_var="STEP_${step_num}_KEY"
+    local key="${!key_var:-step-${step_num}}"
+
+    local queue_var="STEP_${step_num}_QUEUE"
+    local queue="${!queue_var:-}"
+
+    local agent_pool_var="STEP_${step_num}_AGENT_POOL"
+    local agent_pool="${!agent_pool_var:-}"
+
+    local timeout_var="STEP_${step_num}_TIMEOUT"
+    local timeout="${!timeout_var:-10}"
+
+    local commands_var="STEP_${step_num}_COMMANDS"
+    local commands="${!commands_var:-}"
+
+    local working_dir_var="STEP_${step_num}_WORKING_DIR"
+    local working_dir="${!working_dir_var:-}"
+
+    # Determine queue
+    local final_queue=""
+    if [[ -n "$queue" ]]; then
+        final_queue="$queue"
+    elif [[ -n "$agent_pool" ]]; then
+        # Map agent_pool to queue name
+        case "$agent_pool" in
+            mi325_1) final_queue="amd_mi325_1" ;;
+            mi325_2) final_queue="amd_mi325_2" ;;
+            mi325_4) final_queue="amd_mi325_4" ;;
+            mi325_8) final_queue="amd_mi325_8" ;;
+            *) final_queue="amd_${agent_pool}" ;;
+        esac
+    else
+        # Default queue for steps without specification
+        final_queue="cpu_queue_premerge"
+    fi
+
+    # Generate step YAML
+    cat >> .buildkite/pipeline.yaml << STEP_START
+
+  - label: "${label}"
+    key: "${key}"
+    agents:
+      queue: "${final_queue}"
+      cluster: "CI"
+STEP_START
+
+    # Add commands
+    if [[ -n "$commands" ]]; then
+        # Check if this is an AMD GPU test (needs run-amd-test.sh wrapper)
+        if [[ "$final_queue" == amd_* ]]; then
+            # Build command string for AMD GPU execution
+            local cmd_string=""
+            local IFS='||'
+            local cmd_array=($commands)
+
+            # Add ROCm check prefix
+            cmd_string="(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1"
+
+            # Add working directory if specified
+            if [[ -n "$working_dir" ]]; then
+                cmd_string="${cmd_string} && cd ${working_dir}"
+            fi
+
+            # Join commands with &&
+            for cmd in "${cmd_array[@]}"; do
+                cmd_string="${cmd_string} && ${cmd}"
+            done
+
+            # Wrap in run-amd-test.sh
+            echo "    command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh \"${cmd_string}\"" >> .buildkite/pipeline.yaml
+        else
+            # CPU or build step - direct commands
+            echo "    commands:" >> .buildkite/pipeline.yaml
+            local IFS='||'
+            local cmd_array=($commands)
+            for cmd in "${cmd_array[@]}"; do
+                echo "      - \"${cmd}\"" >> .buildkite/pipeline.yaml
+            done
+        fi
+    else
+        # No commands specified
+        echo "    command: echo \"No commands specified for this step\"" >> .buildkite/pipeline.yaml
+    fi
+
+    # Add timeout
+    echo "    timeout_in_minutes: ${timeout}" >> .buildkite/pipeline.yaml
+
+    # Add retry for AMD GPU tests
+    if [[ "$final_queue" == amd_* ]]; then
+        cat >> .buildkite/pipeline.yaml << 'RETRY_BLOCK'
+    retry:
+      automatic:
+        - exit_status: "*"
+          limit: 1
+RETRY_BLOCK
+    fi
+
+    # Add environment variables for AMD GPU tests
+    if [[ "$final_queue" == amd_* ]]; then
+        cat >> .buildkite/pipeline.yaml << 'ENV_BLOCK'
+    env:
+      HF_HOME: "/root/.cache/huggingface"
+ENV_BLOCK
+    fi
+
+    # Add depends_on for non-first steps (depend on build step)
+    if [[ "$step_num" != "${INCLUDED_STEPS[0]}" ]]; then
+        local first_key_var="STEP_${INCLUDED_STEPS[0]}_KEY"
+        local first_key="${!first_key_var:-step-${INCLUDED_STEPS[0]}}"
+        echo "    depends_on: \"${first_key}\"" >> .buildkite/pipeline.yaml
+    fi
+done
+
+# Validate generated pipeline
+if ! grep -q "^steps:" .buildkite/pipeline.yaml; then
+    echo "Error: Generated pipeline is invalid (missing 'steps:' section)"
+    exit 1
+fi
+
+echo "Pipeline generated successfully"
+echo ""
+
+# Display generated pipeline
+echo "--- Generated Pipeline:"
+cat .buildkite/pipeline.yaml
+echo ""
+
+# Upload pipeline to Buildkite
+echo "--- Uploading pipeline to Buildkite"
+buildkite-agent pipeline upload .buildkite/pipeline.yaml
+
+echo ""
+echo "=== Bootstrap Complete ==="
+echo "Configuration:"
+echo "  Total steps defined: ${TOTAL_STEPS}"
+echo "  Steps selected: ${#INCLUDED_STEPS[@]}"
+echo "  RUN_ALL_TESTS: ${RUN_ALL_TESTS}"
+echo "  NO_FAIL_FAST: ${NO_FAIL_FAST}"
+echo "  Changed files: ${CHANGED_FILE_COUNT}"
+echo ""
+echo "Pipeline uploaded successfully!"
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
new file mode 100755
index 00000000000..faf51234619
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+# This script runs tests inside the ROCm docker container for vLLM-Omni.
+# Adapted from vLLM's run-amd-test.sh for vllm-omni's simpler use case.
+
+set -o pipefail
+
+# Export Python path
+export PYTHONPATH=".."
+
+# Print ROCm version
+echo "--- Confirming Clean Initial State"
+while true; do
+    sleep 3
+    if grep -q clean /opt/amdgpu/etc/gpu_state; then
+        echo "GPUs state is \"clean\""
+        break
+    fi
+done
+
+echo "--- ROCm info"
+rocminfo
+
+# Cleanup older docker images
+cleanup_docker() {
+    # Prune Docker images/volumes when the filesystem holding Docker's root dir is above the usage threshold
+    local docker_root disk_usage threshold=70
+    docker_root=$(docker info -f '{{.DockerRootDir}}')
+    if [ -z "$docker_root" ]; then
+        echo "Failed to determine Docker root directory."
+        exit 1
+    fi
+    echo "Docker root directory: $docker_root"
+    # -P forces one output line per filesystem; plain df may wrap long device names and shift the columns
+    disk_usage=$(df -P "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+
+    if [ "$disk_usage" -gt "$threshold" ]; then
+        echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+        docker image prune -f
+        docker volume prune -f && docker system prune --force --filter "until=72h" --all
+        echo "Docker images and volumes cleanup completed."
+    else
+        echo "Disk usage is below $threshold%. No cleanup needed."
+    fi
+}
+
+cleanup_docker
+
+echo "--- Resetting GPUs"
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+    sleep 3
+    if grep -q clean /opt/amdgpu/etc/gpu_state; then
+        echo "GPUs state is \"clean\""
+        break
+    fi
+done
+
+echo "--- Pulling/Building container"
+image_name="vllm/vllm-omni-rocm-ci:${BUILDKITE_COMMIT}"
+container_name="rocm_vllm_omni_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+# Try to pull image first, if it doesn't exist, build it
+if ! docker pull "${image_name}" 2>/dev/null; then
+    echo "Image not found, building from Dockerfile.rocm..."
+    cd "$(dirname "$0")/../../.."  # Go to repo root
+    docker build \
+        -f docker/Dockerfile.rocm \
+        -t "${image_name}" \
+        --build-arg BUILDKITE_COMMIT="${BUILDKITE_COMMIT}" \
+        .
+fi
+
+remove_docker_container() {
+    docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
+}
+trap remove_docker_container EXIT
+
+echo "--- Running container"
+
+# HuggingFace cache setup
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+# Get commands from arguments
+commands=$@
+echo "Commands: $commands"
+
+# Get render group for GPU access
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+    echo "Error: 'render' group not found. This is required for GPU access." >&2
+    exit 1
+fi
+
+# Run tests in container
+echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
+docker run \
+    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+    --network=host \
+    --shm-size=16gb \
+    --group-add "$render_gid" \
+    --rm \
+    -e HF_TOKEN \
+    -e AWS_ACCESS_KEY_ID \
+    -e AWS_SECRET_ACCESS_KEY \
+    -v "${HF_CACHE}:${HF_MOUNT}" \
+    -e "HF_HOME=${HF_MOUNT}" \
+    -e "PYTHONPATH=.." \
+    --name "${container_name}" \
+    "${image_name}" \
+    /bin/bash -c "${commands}"
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
new file mode 100644
index 00000000000..bbe196d8a35
--- /dev/null
+++ b/.buildkite/test-amd.yaml
@@ -0,0 +1,34 @@
+# AMD Test Pipeline for vLLM-Omni
+#
+# This file follows vLLM's test-amd.yaml structure. It is parsed by .buildkite/bootstrap-omni-amd.sh
+# and may also be processed by Jinja templates in the buildkite-ci repository (if configured).
+#
+# Documentation:
+# - mirror_hardwares: list of AMD hardware to run tests on [amdexperimental, amdproduction, amdtentative]
+# - agent_pool: GPU pool to use (mi325_1 = single MI325X GPU)
+# - fast_check: run on every commit
+# - timeout_in_minutes: test timeout
+# - source_file_dependencies: trigger test only when these files change
+# - commands: list of commands to execute
+# - working_dir: directory where commands execute (default: /vllm-omni-workspace/tests)
+
+steps:
+##### fast check tests  #####
+
+- label: "Simple Unit Test"
+  commands:
+    - ".buildkite/scripts/simple_test.sh"
+  agents:
+    queue: "cpu_queue_premerge"
+
+# - label: ":rocm: Z-Image Diffusion Model Test"
+#   mirror_hardwares: [amdexperimental]
+#   agent_pool: mi325_1
+#   timeout_in_minutes: 20
+#   fast_check: true
+#   source_file_dependencies:
+#     - vllm_omni/
+#     - tests/test_diffusion_model.py
+#   commands:
+#     - pytest -v -s test_diffusion_model.py
+#   working_dir: "/vllm-omni-workspace/tests"

From d5c75c348dce849362572a68796e17644f0117a9 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Thu, 11 Dec 2025 15:01:08 +0000
Subject: [PATCH 04/32] fix local error

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/bootstrap-omni-amd.sh | 48 ++++++++++++++++----------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/.buildkite/bootstrap-omni-amd.sh b/.buildkite/bootstrap-omni-amd.sh
index 724050cd477..a6211f0b595 100755
--- a/.buildkite/bootstrap-omni-amd.sh
+++ b/.buildkite/bootstrap-omni-amd.sh
@@ -481,8 +481,8 @@ echo ""
 INCLUDED_STEPS=()
 
 for ((i=1; i<=TOTAL_STEPS; i++)); do
-    local label_var="STEP_${i}_LABEL"
-    local label="${!label_var:-Step $i}"
+    label_var="STEP_${i}_LABEL"
+    label="${!label_var:-Step $i}"
 
     # Check hardware match
     if ! matches_hardware "$i" "amdexperimental"; then
@@ -531,29 +531,29 @@ echo "steps:" >> .buildkite/pipeline.yaml
 # Generate steps
 for step_num in "${INCLUDED_STEPS[@]}"; do
     # Get step data
-    local label_var="STEP_${step_num}_LABEL"
-    local label="${!label_var}"
+    label_var="STEP_${step_num}_LABEL"
+    label="${!label_var}"
 
-    local key_var="STEP_${step_num}_KEY"
-    local key="${!key_var:-step-${step_num}}"
+    key_var="STEP_${step_num}_KEY"
+    key="${!key_var:-step-${step_num}}"
 
-    local queue_var="STEP_${step_num}_QUEUE"
-    local queue="${!queue_var:-}"
+    queue_var="STEP_${step_num}_QUEUE"
+    queue="${!queue_var:-}"
 
-    local agent_pool_var="STEP_${step_num}_AGENT_POOL"
-    local agent_pool="${!agent_pool_var:-}"
+    agent_pool_var="STEP_${step_num}_AGENT_POOL"
+    agent_pool="${!agent_pool_var:-}"
 
-    local timeout_var="STEP_${step_num}_TIMEOUT"
-    local timeout="${!timeout_var:-10}"
+    timeout_var="STEP_${step_num}_TIMEOUT"
+    timeout="${!timeout_var:-10}"
 
-    local commands_var="STEP_${step_num}_COMMANDS"
-    local commands="${!commands_var:-}"
+    commands_var="STEP_${step_num}_COMMANDS"
+    commands="${!commands_var:-}"
 
-    local working_dir_var="STEP_${step_num}_WORKING_DIR"
-    local working_dir="${!working_dir_var:-}"
+    working_dir_var="STEP_${step_num}_WORKING_DIR"
+    working_dir="${!working_dir_var:-}"
 
     # Determine queue
-    local final_queue=""
+    final_queue=""
     if [[ -n "$queue" ]]; then
         final_queue="$queue"
     elif [[ -n "$agent_pool" ]]; then
@@ -585,9 +585,9 @@ STEP_START
         # Check if this is an AMD GPU test (needs run-amd-test.sh wrapper)
         if [[ "$final_queue" == amd_* ]]; then
             # Build command string for AMD GPU execution
-            local cmd_string=""
-            local IFS='||'
-            local cmd_array=($commands)
+            cmd_string=""
+            IFS='||'
+            cmd_array=($commands)
 
             # Add ROCm check prefix
             cmd_string="(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1"
@@ -607,8 +607,8 @@ STEP_START
         else
             # CPU or build step - direct commands
             echo "    commands:" >> .buildkite/pipeline.yaml
-            local IFS='||'
-            local cmd_array=($commands)
+            IFS='||'
+            cmd_array=($commands)
             for cmd in "${cmd_array[@]}"; do
                 echo "      - \"${cmd}\"" >> .buildkite/pipeline.yaml
             done
@@ -641,8 +641,8 @@ ENV_BLOCK
 
     # Add depends_on for non-first steps (depend on build step)
     if [[ "$step_num" != "${INCLUDED_STEPS[0]}" ]]; then
-        local first_key_var="STEP_${INCLUDED_STEPS[0]}_KEY"
-        local first_key="${!first_key_var:-step-${INCLUDED_STEPS[0]}}"
+        first_key_var="STEP_${INCLUDED_STEPS[0]}_KEY"
+        first_key="${!first_key_var:-step-${INCLUDED_STEPS[0]}}"
         echo "    depends_on: \"${first_key}\"" >> .buildkite/pipeline.yaml
     fi
 done

From e2c24b77495661d836fc40ae3d99a34e7a3bbfec Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Thu, 11 Dec 2025 15:16:33 +0000
Subject: [PATCH 05/32] simplify amd test to just test build docker

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/bootstrap-omni-amd.sh              | 677 ------------------
 .../scripts/hardware_ci/run-amd-test.sh       | 113 ---
 .buildkite/test-amd.yaml                      |  43 +-
 3 files changed, 11 insertions(+), 822 deletions(-)
 delete mode 100755 .buildkite/bootstrap-omni-amd.sh
 delete mode 100755 .buildkite/scripts/hardware_ci/run-amd-test.sh

diff --git a/.buildkite/bootstrap-omni-amd.sh b/.buildkite/bootstrap-omni-amd.sh
deleted file mode 100755
index a6211f0b595..00000000000
--- a/.buildkite/bootstrap-omni-amd.sh
+++ /dev/null
@@ -1,677 +0,0 @@
-#!/bin/bash
-# vLLM-Omni AMD CI Bootstrap
-# Intelligent CI orchestration following vLLM's ci-infra approach
-#
-# Features:
-# - Smart change detection (docs-only skip, critical files)
-# - Pure bash YAML parsing
-# - Test filtering by source_file_dependencies and mirror_hardwares
-# - GitHub PR label support (ready-run-all-tests, ci-no-fail-fast)
-# - Dynamic Buildkite pipeline generation
-
-set -euo pipefail
-
-#==============================================================================
-# SECTION 1: INITIALIZATION & ENVIRONMENT DETECTION
-#==============================================================================
-
-# Enable debugging if requested
-DEBUG="${VLLM_CI_DEBUG:-0}"
-[[ "$DEBUG" == "1" ]] && set -x
-
-echo "=== vLLM-Omni AMD CI Bootstrap ==="
-echo "Timestamp: $(date -u +"%Y-%m-%d %H:%M:%S UTC")"
-echo "Branch: ${BUILDKITE_BRANCH:-unknown}"
-echo "Commit: ${BUILDKITE_COMMIT:-unknown}"
-echo "Pull Request: ${BUILDKITE_PULL_REQUEST:-none}"
-echo ""
-
-# Validate environment
-if [ ! -d ".buildkite" ]; then
-    echo "Error: .buildkite directory not found"
-    echo "Please run this script from the repository root"
-    exit 1
-fi
-
-if [ ! -f ".buildkite/test-amd.yaml" ]; then
-    echo "Error: .buildkite/test-amd.yaml not found"
-    exit 1
-fi
-
-# Validate git repository
-if ! git rev-parse --git-dir > /dev/null 2>&1; then
-    echo "Error: Not a git repository"
-    exit 1
-fi
-
-# Determine base branch for comparison
-if [[ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]]; then
-    BASE_BRANCH="${BUILDKITE_PULL_REQUEST_BASE_BRANCH:-main}"
-else
-    BASE_BRANCH="main"
-fi
-
-echo "Base branch for comparison: ${BASE_BRANCH}"
-echo ""
-
-#==============================================================================
-# SECTION 2: GITHUB LABEL CHECKING
-#==============================================================================
-
-# Function: Check GitHub PR labels
-check_github_labels() {
-    local pr_number="$1"
-
-    # Extract owner/repo from git URL
-    local repo_full_name=$(git remote get-url origin 2>/dev/null | sed -E 's/.*github\.com[:/]([^/]+\/[^/]+)(\.git)?$/\1/' || echo "")
-
-    if [[ -z "$repo_full_name" ]]; then
-        echo "Warning: Could not determine GitHub repository"
-        return 1
-    fi
-
-    echo "--- Checking GitHub PR labels"
-    echo "Repository: ${repo_full_name}"
-    echo "PR Number: ${pr_number}"
-
-    # Try to fetch labels via GitHub API (no auth needed for public repos)
-    if command -v curl >/dev/null 2>&1; then
-        local api_url="https://api.github.com/repos/${repo_full_name}/pulls/${pr_number}"
-        local response=$(curl -s -f "${api_url}" 2>/dev/null || echo "")
-
-        if [[ -n "$response" ]]; then
-            # Extract label names
-            local labels=$(echo "$response" | grep -o '"name":"[^"]*"' | cut -d'"' -f4 | tr '\n' ',' || echo "")
-
-            # Check for specific labels
-            if [[ "$labels" == *"ready-run-all-tests"* ]]; then
-                RUN_ALL_TESTS=1
-                echo "✓ Found label: ready-run-all-tests"
-            fi
-
-            if [[ "$labels" == *"ci-no-fail-fast"* ]]; then
-                NO_FAIL_FAST=1
-                echo "✓ Found label: ci-no-fail-fast"
-            fi
-
-            [[ "$RUN_ALL_TESTS" == "0" ]] && echo "  No run-all-tests label"
-            [[ "$NO_FAIL_FAST" == "0" ]] && echo "  No fail-fast override label"
-            return 0
-        fi
-    fi
-
-    echo "Warning: Could not fetch GitHub labels (API unavailable or request failed)"
-    return 1
-}
-
-# Initialize flags
-RUN_ALL_TESTS=0
-NO_FAIL_FAST=0
-
-# Check labels if this is a PR
-if [[ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]] && [[ "${BUILDKITE_PULL_REQUEST}" != "" ]]; then
-    check_github_labels "${BUILDKITE_PULL_REQUEST}" || echo "Continuing without label information"
-    echo ""
-fi
-
-#==============================================================================
-# SECTION 3: CHANGE DETECTION & ANALYSIS
-#==============================================================================
-
-# Function: Detect if only docs changed
-is_docs_only_change() {
-    local changed_files="$1"
-
-    # Docs-related patterns
-    local docs_patterns=(
-        "^docs/"
-        "^README"
-        "\\.md$"
-        "\\.rst$"
-        "^LICENSE"
-        "^CONTRIBUTING"
-    )
-
-    local has_non_docs=0
-
-    while IFS= read -r file; do
-        [[ -z "$file" ]] && continue
-
-        local is_docs=0
-        for pattern in "${docs_patterns[@]}"; do
-            if [[ "$file" =~ $pattern ]]; then
-                is_docs=1
-                break
-            fi
-        done
-
-        if [[ "$is_docs" == "0" ]]; then
-            # Found non-docs file
-            has_non_docs=1
-            break
-        fi
-    done <<< "$changed_files"
-
-    # Return 0 (success) if all files are docs, 1 otherwise
-    [[ "$has_non_docs" == "0" ]]
-}
-
-# Function: Detect critical file changes
-has_critical_file_changes() {
-    local changed_files="$1"
-
-    # Critical patterns that trigger full test run
-    local critical_patterns=(
-        "^docker/Dockerfile"
-        "^requirements.*\\.txt$"
-        "^setup\\.py$"
-        "^pyproject\\.toml$"
-        "^\\.buildkite/bootstrap-omni-amd\\.sh$"
-        "^\\.buildkite/test-amd\\.yaml$"
-        "^\\.buildkite/scripts/hardware_ci/run-amd-test\\.sh$"
-    )
-
-    while IFS= read -r file; do
-        [[ -z "$file" ]] && continue
-
-        for pattern in "${critical_patterns[@]}"; do
-            if [[ "$file" =~ $pattern ]]; then
-                echo "  Critical file changed: $file"
-                return 0
-            fi
-        done
-    done <<< "$changed_files"
-
-    return 1
-}
-
-echo "--- Analyzing changed files"
-
-# Get list of changed files
-CHANGED_FILES=""
-if git rev-parse "origin/${BASE_BRANCH}" >/dev/null 2>&1; then
-    # Fetch latest base branch
-    echo "Fetching origin/${BASE_BRANCH}..."
-    git fetch origin "${BASE_BRANCH}" >/dev/null 2>&1 || true
-
-    # Get changed files between base and current commit
-    CHANGED_FILES=$(git diff --name-only "origin/${BASE_BRANCH}...${BUILDKITE_COMMIT}" 2>/dev/null || \
-                    git diff --name-only "origin/${BASE_BRANCH}" "${BUILDKITE_COMMIT}" 2>/dev/null || \
-                    echo "")
-else
-    echo "Warning: Could not find base branch ${BASE_BRANCH}"
-    echo "Will run all tests as a safety measure"
-    RUN_ALL_TESTS=1
-fi
-
-# Count changed files
-CHANGED_FILE_COUNT=$(echo "$CHANGED_FILES" | grep -c . || echo "0")
-echo "Changed files: ${CHANGED_FILE_COUNT}"
-
-# Debug: Show changed files if in debug mode
-if [[ "$DEBUG" == "1" ]] && [[ -n "$CHANGED_FILES" ]]; then
-    echo "Changed files list:"
-    echo "$CHANGED_FILES" | head -20
-    [[ "$CHANGED_FILE_COUNT" -gt 20 ]] && echo "... (${CHANGED_FILE_COUNT} total files)"
-    echo ""
-fi
-
-# Check for docs-only changes (early exit optimization)
-if [[ "$CHANGED_FILE_COUNT" -gt 0 ]] && is_docs_only_change "$CHANGED_FILES"; then
-    echo ""
-    echo "=== Documentation-only changes detected ==="
-    echo "Skipping CI tests to save resources"
-    echo ""
-
-    # Generate minimal pipeline
-    cat > .buildkite/pipeline.yaml << 'EOF'
-steps:
-  - label: ":memo: Docs-only change"
-    command: echo "Only documentation changed, skipping tests"
-    agents:
-      queue: cpu_queue_premerge
-EOF
-
-    echo "--- Generated minimal pipeline:"
-    cat .buildkite/pipeline.yaml
-    echo ""
-
-    echo "--- Uploading pipeline to Buildkite"
-    buildkite-agent pipeline upload .buildkite/pipeline.yaml
-
-    echo "=== Bootstrap Complete (docs-only) ==="
-    exit 0
-fi
-
-# Check for critical file changes
-if has_critical_file_changes "$CHANGED_FILES"; then
-    echo "  → Critical files detected: Will run ALL tests"
-    RUN_ALL_TESTS=1
-fi
-
-echo ""
-
-#==============================================================================
-# SECTION 4: YAML PARSING (Pure Bash)
-#==============================================================================
-
-echo "--- Parsing test-amd.yaml"
-
-# Parse test-amd.yaml into structured variables
-# Variables will be named: STEP_<N>_<FIELD>
-# Arrays/lists use || as delimiter
-
-TOTAL_STEPS=0
-declare -A STEP_DATA
-
-parse_test_yaml() {
-    local yaml_file="$1"
-
-    local step_num=0
-    local in_step=0
-    local current_section=""
-    local list_indent=0
-
-    while IFS= read -r raw_line; do
-        # Handle line without removing all whitespace (need to detect indentation)
-        local line="$raw_line"
-
-        # Skip comments and empty lines
-        [[ "$line" =~ ^[[:space:]]*# ]] && continue
-        [[ -z "${line// /}" ]] && continue
-
-        # Detect indentation level
-        local indent=$(echo "$line" | sed -E 's/^([[:space:]]*).*/\1/' | wc -c)
-        indent=$((indent - 1))  # wc -c counts \n
-
-        # Detect new step (starts with "- label:")
-        if [[ "$line" =~ ^[[:space:]]*-[[:space:]]+label:[[:space:]]*(.+)$ ]]; then
-            step_num=$((step_num + 1))
-            in_step=1
-            local label="${BASH_REMATCH[1]}"
-            label=$(echo "$label" | sed 's/^["'"'"']//' | sed 's/["'"'"']$//')
-
-            eval "STEP_${step_num}_LABEL=\"\$label\""
-            eval "STEP_${step_num}_KEY=\"step-${step_num}\""
-
-            echo "  Step ${step_num}: ${label}"
-            current_section=""
-            continue
-        fi
-
-        [[ "$in_step" == "0" ]] && continue
-
-        # Parse key-value pairs at step level
-        if [[ "$line" =~ ^[[:space:]]+([a-z_]+):[[:space:]]*(.*)$ ]]; then
-            local key="${BASH_REMATCH[1]}"
-            local value="${BASH_REMATCH[2]}"
-
-            # Clean value (remove quotes, brackets)
-            value=$(echo "$value" | sed 's/^["'"'"'\[]//' | sed 's/["'"'"'\]]*$//')
-
-            case "$key" in
-                key)
-                    eval "STEP_${step_num}_KEY=\"\$value\""
-                    ;;
-                mirror_hardwares)
-                    # Parse array: [amdexperimental, amdproduction]
-                    value=$(echo "$value" | tr ',' ' ' | xargs)
-                    eval "STEP_${step_num}_MIRROR_HARDWARES=\"\$value\""
-                    ;;
-                agent_pool)
-                    eval "STEP_${step_num}_AGENT_POOL=\"\$value\""
-                    ;;
-                timeout_in_minutes)
-                    eval "STEP_${step_num}_TIMEOUT=\"\$value\""
-                    ;;
-                fast_check)
-                    eval "STEP_${step_num}_FAST_CHECK=\"\$value\""
-                    ;;
-                working_dir)
-                    eval "STEP_${step_num}_WORKING_DIR=\"\$value\""
-                    ;;
-                queue)
-                    # Part of agents section
-                    eval "STEP_${step_num}_QUEUE=\"\$value\""
-                    ;;
-                commands)
-                    current_section="commands"
-                    eval "STEP_${step_num}_COMMANDS=\"\""
-                    list_indent=$indent
-                    ;;
-                source_file_dependencies)
-                    current_section="dependencies"
-                    eval "STEP_${step_num}_DEPENDENCIES=\"\""
-                    list_indent=$indent
-                    ;;
-                agents)
-                    current_section="agents"
-                    ;;
-                *)
-                    # Check if we left a list section
-                    if [[ "$indent" -le "$list_indent" ]] && [[ -n "$current_section" ]]; then
-                        current_section=""
-                    fi
-                    ;;
-            esac
-        # Parse list items (- item)
-        elif [[ "$line" =~ ^[[:space:]]*-[[:space:]]+(.+)$ ]]; then
-            local item="${BASH_REMATCH[1]}"
-            item=$(echo "$item" | sed 's/^["'"'"']//' | sed 's/["'"'"']$//')
-
-            if [[ "$current_section" == "commands" ]]; then
-                local current_cmds
-                eval "current_cmds=\"\$STEP_${step_num}_COMMANDS\""
-                if [[ -n "$current_cmds" ]]; then
-                    eval "STEP_${step_num}_COMMANDS=\"\${current_cmds}||\$item\""
-                else
-                    eval "STEP_${step_num}_COMMANDS=\"\$item\""
-                fi
-            elif [[ "$current_section" == "dependencies" ]]; then
-                local current_deps
-                eval "current_deps=\"\$STEP_${step_num}_DEPENDENCIES\""
-                if [[ -n "$current_deps" ]]; then
-                    eval "STEP_${step_num}_DEPENDENCIES=\"\${current_deps}||\$item\""
-                else
-                    eval "STEP_${step_num}_DEPENDENCIES=\"\$item\""
-                fi
-            fi
-        fi
-    done < "$yaml_file"
-
-    TOTAL_STEPS=$step_num
-}
-
-parse_test_yaml ".buildkite/test-amd.yaml"
-
-echo "Parsed ${TOTAL_STEPS} steps from test-amd.yaml"
-
-# Validate parsing
-if [[ "$TOTAL_STEPS" == "0" ]]; then
-    echo "Error: No steps found in test-amd.yaml"
-    exit 1
-fi
-
-echo ""
-
-#==============================================================================
-# SECTION 5: TEST FILTERING & SELECTION
-#==============================================================================
-
-# Function: Check if step should run based on file dependencies
-should_run_step() {
-    local step_num="$1"
-
-    # If RUN_ALL_TESTS is set, always run
-    if [[ "$RUN_ALL_TESTS" == "1" ]]; then
-        return 0
-    fi
-
-    # Get step dependencies
-    local deps_var="STEP_${step_num}_DEPENDENCIES"
-    local dependencies="${!deps_var:-}"
-
-    # If no dependencies specified, always run (catch-all test)
-    if [[ -z "$dependencies" ]]; then
-        return 0
-    fi
-
-    # If no files changed, don't run
-    if [[ -z "$CHANGED_FILES" ]] || [[ "$CHANGED_FILE_COUNT" == "0" ]]; then
-        return 1
-    fi
-
-    # Check if any changed file matches dependencies
-    # Dependencies use prefix matching (e.g., "vllm_omni/" matches "vllm_omni/diffusion/model.py")
-    local IFS='||'
-    local deps_array=($dependencies)
-
-    for dep_pattern in "${deps_array[@]}"; do
-        # Remove trailing slashes for consistency
-        dep_pattern="${dep_pattern%/}"
-
-        while IFS= read -r changed_file; do
-            [[ -z "$changed_file" ]] && continue
-
-            # Check if changed file matches dependency pattern
-            # Support both prefix matching and exact file matching
-            if [[ "$changed_file" == "${dep_pattern}"* ]] || [[ "$changed_file" == "$dep_pattern" ]]; then
-                if [[ "$DEBUG" == "1" ]]; then
-                    echo "    Match: ${changed_file} ↔ ${dep_pattern}"
-                fi
-                return 0
-            fi
-        done <<< "$CHANGED_FILES"
-    done
-
-    return 1
-}
-
-# Function: Check if step matches hardware filter
-matches_hardware() {
-    local step_num="$1"
-    local target_hardware="${2:-amdexperimental}"
-
-    # Get mirror_hardwares for this step
-    local hw_var="STEP_${step_num}_MIRROR_HARDWARES"
-    local hardwares="${!hw_var:-}"
-
-    # If no hardware specified, it's a non-AMD step (CPU/build steps)
-    # These should always be included
-    if [[ -z "$hardwares" ]]; then
-        return 0
-    fi
-
-    # Check if target hardware is in the list
-    if [[ "$hardwares" == *"$target_hardware"* ]]; then
-        return 0
-    fi
-
-    return 1
-}
-
-echo "--- Filtering tests"
-echo "Filter criteria:"
-echo "  Target hardware: amdexperimental"
-echo "  RUN_ALL_TESTS: ${RUN_ALL_TESTS}"
-echo "  Changed files: ${CHANGED_FILE_COUNT}"
-echo ""
-
-# Build list of steps to include
-INCLUDED_STEPS=()
-
-for ((i=1; i<=TOTAL_STEPS; i++)); do
-    label_var="STEP_${i}_LABEL"
-    label="${!label_var:-Step $i}"
-
-    # Check hardware match
-    if ! matches_hardware "$i" "amdexperimental"; then
-        echo "  Step ${i} (${label}): ✗ Wrong hardware - SKIP"
-        continue
-    fi
-
-    # Check file dependencies
-    if should_run_step "$i"; then
-        echo "  Step ${i} (${label}): ✓ INCLUDE"
-        INCLUDED_STEPS+=("$i")
-    else
-        echo "  Step ${i} (${label}): ✗ No matching changes - SKIP"
-    fi
-done
-
-echo ""
-echo "Selected ${#INCLUDED_STEPS[@]} of ${TOTAL_STEPS} steps"
-
-# Ensure at least build step (step 1) is included
-if [[ "${#INCLUDED_STEPS[@]}" == "0" ]]; then
-    echo "Warning: No tests selected, including build step as fallback"
-    INCLUDED_STEPS=(1)
-fi
-
-echo ""
-
-#==============================================================================
-# SECTION 6: PIPELINE GENERATION
-#==============================================================================
-
-echo "--- Generating Buildkite pipeline"
-
-# Start pipeline file
-cat > .buildkite/pipeline.yaml << 'PIPELINE_HEADER'
-# Auto-generated by bootstrap-omni-amd.sh
-# DO NOT EDIT MANUALLY - Edit .buildkite/test-amd.yaml instead
-#
-# Generated:
-PIPELINE_HEADER
-
-echo "# $(date -u)" >> .buildkite/pipeline.yaml
-echo "" >> .buildkite/pipeline.yaml
-echo "steps:" >> .buildkite/pipeline.yaml
-
-# Generate steps
-for step_num in "${INCLUDED_STEPS[@]}"; do
-    # Get step data
-    label_var="STEP_${step_num}_LABEL"
-    label="${!label_var}"
-
-    key_var="STEP_${step_num}_KEY"
-    key="${!key_var:-step-${step_num}}"
-
-    queue_var="STEP_${step_num}_QUEUE"
-    queue="${!queue_var:-}"
-
-    agent_pool_var="STEP_${step_num}_AGENT_POOL"
-    agent_pool="${!agent_pool_var:-}"
-
-    timeout_var="STEP_${step_num}_TIMEOUT"
-    timeout="${!timeout_var:-10}"
-
-    commands_var="STEP_${step_num}_COMMANDS"
-    commands="${!commands_var:-}"
-
-    working_dir_var="STEP_${step_num}_WORKING_DIR"
-    working_dir="${!working_dir_var:-}"
-
-    # Determine queue
-    final_queue=""
-    if [[ -n "$queue" ]]; then
-        final_queue="$queue"
-    elif [[ -n "$agent_pool" ]]; then
-        # Map agent_pool to queue name
-        case "$agent_pool" in
-            mi325_1) final_queue="amd_mi325_1" ;;
-            mi325_2) final_queue="amd_mi325_2" ;;
-            mi325_4) final_queue="amd_mi325_4" ;;
-            mi325_8) final_queue="amd_mi325_8" ;;
-            *) final_queue="amd_${agent_pool}" ;;
-        esac
-    else
-        # Default queue for steps without specification
-        final_queue="cpu_queue_premerge"
-    fi
-
-    # Generate step YAML
-    cat >> .buildkite/pipeline.yaml << STEP_START
-
-  - label: "${label}"
-    key: "${key}"
-    agents:
-      queue: "${final_queue}"
-      cluster: "CI"
-STEP_START
-
-    # Add commands
-    if [[ -n "$commands" ]]; then
-        # Check if this is an AMD GPU test (needs run-amd-test.sh wrapper)
-        if [[ "$final_queue" == amd_* ]]; then
-            # Build command string for AMD GPU execution
-            cmd_string=""
-            IFS='||'
-            cmd_array=($commands)
-
-            # Add ROCm check prefix
-            cmd_string="(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1"
-
-            # Add working directory if specified
-            if [[ -n "$working_dir" ]]; then
-                cmd_string="${cmd_string} && cd ${working_dir}"
-            fi
-
-            # Join commands with &&
-            for cmd in "${cmd_array[@]}"; do
-                cmd_string="${cmd_string} && ${cmd}"
-            done
-
-            # Wrap in run-amd-test.sh
-            echo "    command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh \"${cmd_string}\"" >> .buildkite/pipeline.yaml
-        else
-            # CPU or build step - direct commands
-            echo "    commands:" >> .buildkite/pipeline.yaml
-            IFS='||'
-            cmd_array=($commands)
-            for cmd in "${cmd_array[@]}"; do
-                echo "      - \"${cmd}\"" >> .buildkite/pipeline.yaml
-            done
-        fi
-    else
-        # No commands specified
-        echo "    command: echo \"No commands specified for this step\"" >> .buildkite/pipeline.yaml
-    fi
-
-    # Add timeout
-    echo "    timeout_in_minutes: ${timeout}" >> .buildkite/pipeline.yaml
-
-    # Add retry for AMD GPU tests
-    if [[ "$final_queue" == amd_* ]]; then
-        cat >> .buildkite/pipeline.yaml << 'RETRY_BLOCK'
-    retry:
-      automatic:
-        - exit_status: "*"
-          limit: 1
-RETRY_BLOCK
-    fi
-
-    # Add environment variables for AMD GPU tests
-    if [[ "$final_queue" == amd_* ]]; then
-        cat >> .buildkite/pipeline.yaml << 'ENV_BLOCK'
-    env:
-      HF_HOME: "/root/.cache/huggingface"
-ENV_BLOCK
-    fi
-
-    # Add depends_on for non-first steps (depend on build step)
-    if [[ "$step_num" != "${INCLUDED_STEPS[0]}" ]]; then
-        first_key_var="STEP_${INCLUDED_STEPS[0]}_KEY"
-        first_key="${!first_key_var:-step-${INCLUDED_STEPS[0]}}"
-        echo "    depends_on: \"${first_key}\"" >> .buildkite/pipeline.yaml
-    fi
-done
-
-# Validate generated pipeline
-if ! grep -q "^steps:" .buildkite/pipeline.yaml; then
-    echo "Error: Generated pipeline is invalid (missing 'steps:' section)"
-    exit 1
-fi
-
-echo "Pipeline generated successfully"
-echo ""
-
-# Display generated pipeline
-echo "--- Generated Pipeline:"
-cat .buildkite/pipeline.yaml
-echo ""
-
-# Upload pipeline to Buildkite
-echo "--- Uploading pipeline to Buildkite"
-buildkite-agent pipeline upload .buildkite/pipeline.yaml
-
-echo ""
-echo "=== Bootstrap Complete ==="
-echo "Configuration:"
-echo "  Total steps defined: ${TOTAL_STEPS}"
-echo "  Steps selected: ${#INCLUDED_STEPS[@]}"
-echo "  RUN_ALL_TESTS: ${RUN_ALL_TESTS}"
-echo "  NO_FAIL_FAST: ${NO_FAIL_FAST}"
-echo "  Changed files: ${CHANGED_FILE_COUNT}"
-echo ""
-echo "Pipeline uploaded successfully!"
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
deleted file mode 100755
index faf51234619..00000000000
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/bin/bash
-
-# This script runs tests inside the ROCm docker container for vLLM-Omni.
-# Adapted from vLLM's run-amd-test.sh for vllm-omni's simpler use case.
-
-set -o pipefail
-
-# Export Python path
-export PYTHONPATH=".."
-
-# Print ROCm version
-echo "--- Confirming Clean Initial State"
-while true; do
-    sleep 3
-    if grep -q clean /opt/amdgpu/etc/gpu_state; then
-        echo "GPUs state is \"clean\""
-        break
-    fi
-done
-
-echo "--- ROCm info"
-rocminfo
-
-# Cleanup older docker images
-cleanup_docker() {
-    docker_root=$(docker info -f '{{.DockerRootDir}}')
-    if [ -z "$docker_root" ]; then
-        echo "Failed to determine Docker root directory."
-        exit 1
-    fi
-    echo "Docker root directory: $docker_root"
-
-    disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
-    threshold=70
-
-    if [ "$disk_usage" -gt "$threshold" ]; then
-        echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
-        docker image prune -f
-        docker volume prune -f && docker system prune --force --filter "until=72h" --all
-        echo "Docker images and volumes cleanup completed."
-    else
-        echo "Disk usage is below $threshold%. No cleanup needed."
-    fi
-}
-
-cleanup_docker
-
-echo "--- Resetting GPUs"
-echo "reset" > /opt/amdgpu/etc/gpu_state
-
-while true; do
-    sleep 3
-    if grep -q clean /opt/amdgpu/etc/gpu_state; then
-        echo "GPUs state is \"clean\""
-        break
-    fi
-done
-
-echo "--- Pulling/Building container"
-image_name="vllm/vllm-omni-rocm-ci:${BUILDKITE_COMMIT}"
-container_name="rocm_vllm_omni_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-
-# Try to pull image first, if it doesn't exist, build it
-if ! docker pull "${image_name}" 2>/dev/null; then
-    echo "Image not found, building from Dockerfile.rocm..."
-    cd "$(dirname "$0")/../../.."  # Go to repo root
-    docker build \
-        -f docker/Dockerfile.rocm \
-        -t "${image_name}" \
-        --build-arg BUILDKITE_COMMIT="${BUILDKITE_COMMIT}" \
-        .
-fi
-
-remove_docker_container() {
-    docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
-}
-trap remove_docker_container EXIT
-
-echo "--- Running container"
-
-# HuggingFace cache setup
-HF_CACHE="$(realpath ~)/huggingface"
-mkdir -p "${HF_CACHE}"
-HF_MOUNT="/root/.cache/huggingface"
-
-# Get commands from arguments
-commands=$@
-echo "Commands: $commands"
-
-# Get render group for GPU access
-render_gid=$(getent group render | cut -d: -f3)
-if [[ -z "$render_gid" ]]; then
-    echo "Error: 'render' group not found. This is required for GPU access." >&2
-    exit 1
-fi
-
-# Run tests in container
-echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
-docker run \
-    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
-    --network=host \
-    --shm-size=16gb \
-    --group-add "$render_gid" \
-    --rm \
-    -e HF_TOKEN \
-    -e AWS_ACCESS_KEY_ID \
-    -e AWS_SECRET_ACCESS_KEY \
-    -v "${HF_CACHE}:${HF_MOUNT}" \
-    -e "HF_HOME=${HF_MOUNT}" \
-    -e "PYTHONPATH=.." \
-    --name "${container_name}" \
-    "${image_name}" \
-    /bin/bash -c "${commands}"
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index bbe196d8a35..625d59748a8 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1,34 +1,13 @@
-# AMD Test Pipeline for vLLM-Omni
-#
-# This file follows vLLM's test-amd.yaml structure and will be processed
-# by Jinja templates in the buildkite-ci repository (if configured).
-#
-# Documentation:
-# - mirror_hardwares: list of AMD hardware to run tests on [amdexperimental, amdproduction, amdtentative]
-# - agent_pool: GPU pool to use (mi325_1 = single MI325X GPU)
-# - fast_check: run on every commit
-# - timeout_in_minutes: test timeout
-# - source_file_dependencies: trigger test only when these files change
-# - commands: list of commands to execute
-# - working_dir: directory where commands execute (default: /vllm-omni-workspace/tests)
-
 steps:
-##### fast check tests  #####
-
-- label: "Simple Unit Test"
-  commands:
-    - ".buildkite/scripts/simple_test.sh"
-  agents:
-    queue: "cpu_queue_premerge"
+  - label: ":docker: Build image"
+    key: image-build
+    commands:
+      - "docker build -f docker/Dockerfile.rocm -t vllm-omni-rocm ."
+    agents:
+      queue: "cpu_queue_premerge_us_east_1"
 
-# - label: ":rocm: Z-Image Diffusion Model Test"
-#   mirror_hardwares: [amdexperimental]
-#   agent_pool: mi325_1
-#   timeout_in_minutes: 20
-#   fast_check: true
-#   source_file_dependencies:
-#     - vllm_omni/
-#     - tests/test_diffusion_model.py
-#   commands:
-#     - pytest -v -s test_diffusion_model.py
-#   working_dir: "/vllm-omni-workspace/tests"
+  - label: "Simple Unit Test"
+    commands:
+      - ".buildkite/scripts/simple_test.sh"
+    agents:
+      queue: "cpu_queue_premerge"

From 46da88092adb73305b6f0c4f34da4528f19bf32c Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Thu, 11 Dec 2025 15:33:02 +0000
Subject: [PATCH 06/32] use amd-cpu to build image like in vLLM

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 625d59748a8..cfa4c4cd012 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -4,7 +4,7 @@ steps:
     commands:
       - "docker build -f docker/Dockerfile.rocm -t vllm-omni-rocm ."
     agents:
-      queue: "cpu_queue_premerge_us_east_1"
+      queue: "amd-cpu"
 
   - label: "Simple Unit Test"
     commands:

From bb03847a19bcca0b2a3059b2cc266dc3a864e68d Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Fri, 12 Dec 2025 11:54:47 +0000
Subject: [PATCH 07/32] apply review feedback

Co-authored-by: Hongxia Yang <hongxia.yang@amd.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 docker/Dockerfile.rocm | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 6e774135e26..0c8fb1ab419 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -1,12 +1,11 @@
-ARG ROCM_BASE_IMAGE=rocm/vllm-dev
-ARG ROCM_BASE_TAG=nightly_main_20251005
-FROM ${ROCM_BASE_IMAGE}:${ROCM_BASE_TAG}
+ARG BASE_IMAGE=rocm/vllm-dev:nightly_main_20251005
+FROM ${BASE_IMAGE}
 
-ARG APP_DIR=/workspace/vllm-omni
+ARG COMMON_WORKDIR=/app
 ARG VLLM_VERSION=v0.11.0
 ARG PYTORCH_ROCM_ARCH="gfx942;gfx950"
 
-WORKDIR ${APP_DIR}
+WORKDIR ${COMMON_WORKDIR}
 
 # Step 1: Setup - Install system dependencies
 RUN apt-get update && \
@@ -15,27 +14,23 @@ RUN apt-get update && \
     rm -rf /var/lib/apt/lists/*
 
 # Step 2: Reinstall vllm from source
-RUN cd ../ && python3 -m pip uninstall -y vllm && \
+RUN python3 -m pip uninstall -y vllm && rm -rf vllm &&\
     git clone https://github.com/vllm-project/vllm.git && \
     cd vllm && \
     git checkout ${VLLM_VERSION} && \
-    python3 -c "import setuptools_scm; print(setuptools_scm.get_version())" && \
+    python3 -m pip install -r requirements/rocm.txt && \
+    python3 setup.py clean --all && \
     PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} python3 setup.py develop && \
-    cd / && \
+    cd ../ && \
     rm -rf vllm/.git
 
+RUN mkdir -p ${COMMON_WORKDIR}/vllm-omni
+
 # Step 3: Copy vllm-omni code and install without uv
-COPY . ${APP_DIR}
-RUN python3 -m pip install --no-cache-dir ".[dev]"
+COPY . ${COMMON_WORKDIR}/vllm-omni
+RUN cd ${COMMON_WORKDIR}/vllm-omni && python3 -m pip install --no-cache-dir ".[dev]"
 
 # Create python symlink
 RUN ln -sf /usr/bin/python3 /usr/bin/python
 
-# Step 4: Set environment variables for ROCm optimization
-ENV MIOPEN_FIND_MODE=FAST
-ENV VLLM_ROCM_USE_AITER=1
-ENV VLLM_ROCM_USE_AITER_MHA=1
-ENV VLLM_ROCM_USE_AITER_LINEAR=0
-ENV VLLM_ROCM_USE_AITER_RMSNORM=0
-
 ENTRYPOINT []
\ No newline at end of file

From 055090d7bfc13c4f56fea591431854d5c20a84a1 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Fri, 12 Dec 2025 11:56:01 +0000
Subject: [PATCH 08/32] fix precommit

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 docker/Dockerfile.rocm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 0c8fb1ab419..872fb0c049a 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -33,4 +33,4 @@ RUN cd ${COMMON_WORKDIR}/vllm-omni && python3 -m pip install --no-cache-dir ".[d
 # Create python symlink
 RUN ln -sf /usr/bin/python3 /usr/bin/python
 
-ENTRYPOINT []
\ No newline at end of file
+ENTRYPOINT []

From e8374b5ee998289e96455456c15e6a6fd3623412 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Fri, 12 Dec 2025 12:14:34 +0000
Subject: [PATCH 09/32] test pushing CI docker

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index cfa4c4cd012..cd527621557 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -2,7 +2,10 @@ steps:
   - label: ":docker: Build image"
     key: image-build
     commands:
-      - "docker build -f docker/Dockerfile.rocm -t vllm-omni-rocm ."
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker build -f docker/Dockerfile.rocm -t vllm-omni-rocm-ci ."
+      - "docker tag vllm-omni-rocm-ci public.ecr.aws/q9t5s3a7/vllm-rocm-ci-test-repo:$BUILDKITE_COMMIT"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-rocm-ci-test-repo:$BUILDKITE_COMMIT"
     agents:
       queue: "amd-cpu"
 

From 02e68e22d4009379b9128c401a578bccf63d6e00 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Fri, 12 Dec 2025 12:21:56 +0000
Subject: [PATCH 10/32] try using cpu_queue_premerge_us_east_1 to build image

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index cd527621557..67741c5ece1 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -7,7 +7,7 @@ steps:
       - "docker tag vllm-omni-rocm-ci public.ecr.aws/q9t5s3a7/vllm-rocm-ci-test-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-rocm-ci-test-repo:$BUILDKITE_COMMIT"
     agents:
-      queue: "amd-cpu"
+      queue: "cpu_queue_premerge_us_east_1"
 
   - label: "Simple Unit Test"
     commands:

From 57661bd3d98453d43fb72737c882b1d642768d19 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Mon, 15 Dec 2025 08:57:01 +0000
Subject: [PATCH 11/32] add preliminary script to run amd ci

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/bootstrap-amd-omni.sh              | 238 +++++++++++++
 .../scripts/hardware_ci/run-amd-test.sh       | 240 +++++++++++++
 .buildkite/test-amd.yaml                      |  24 +-
 .buildkite/test-template-amd-omni.j2          | 335 ++++++++++++++++++
 4 files changed, 823 insertions(+), 14 deletions(-)
 create mode 100755 .buildkite/bootstrap-amd-omni.sh
 create mode 100755 .buildkite/scripts/hardware_ci/run-amd-test.sh
 create mode 100644 .buildkite/test-template-amd-omni.j2

diff --git a/.buildkite/bootstrap-amd-omni.sh b/.buildkite/bootstrap-amd-omni.sh
new file mode 100755
index 00000000000..a38b7622011
--- /dev/null
+++ b/.buildkite/bootstrap-amd-omni.sh
@@ -0,0 +1,238 @@
+#!/bin/bash
+# vllm-omni customized version
+# Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/bootstrap-amd.sh
+# Last synced: 2025-12-15
+# Modifications: Use local template file instead of downloading from ci-infra
+
+set -euo pipefail
+
+if [[ -z "${RUN_ALL:-}" ]]; then
+    RUN_ALL=0
+fi
+
+if [[ -z "${NIGHTLY:-}" ]]; then
+    NIGHTLY=0
+fi
+
+if [[ -z "${VLLM_CI_BRANCH:-}" ]]; then
+    VLLM_CI_BRANCH="main"
+fi
+
+if [[ -z "${AMD_MIRROR_HW:-}" ]]; then
+    AMD_MIRROR_HW="amdproduction"
+fi
+
+if [[ -z "${DOCS_ONLY_DISABLE:-}" ]]; then
+    DOCS_ONLY_DISABLE=0
+fi
+
+fail_fast() {  # Prints "true" when fail-fast should apply to this build, otherwise "false".
+    local DISABLE_LABEL="ci-no-fail-fast" PR_LABELS
+    # Default to "false" so an unset/empty BUILDKITE_PULL_REQUEST does not abort the script under `set -u`.
+    if [ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]; then
+        PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
+        if [[ $PR_LABELS == *"$DISABLE_LABEL"* ]]; then
+            echo false  # the PR carries the opt-out label
+        else
+            echo true
+        fi
+    else
+        echo false  # not a PR or BUILDKITE_PULL_REQUEST not set
+    fi
+}
+
+check_run_all_label() {  # Prints "true" when the PR carries the run-all label, otherwise "false".
+    local RUN_ALL_LABEL="ready-run-all-tests" PR_LABELS
+    # Default to "false" so an unset/empty BUILDKITE_PULL_REQUEST does not abort the script under `set -u`.
+    if [ "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]; then
+        PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm-omni/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
+        if [[ $PR_LABELS == *"$RUN_ALL_LABEL"* ]]; then
+            echo true
+        else
+            echo false
+        fi
+    else
+        echo false  # not a PR or BUILDKITE_PULL_REQUEST not set
+    fi
+}
+
+if [[ -z "${COV_ENABLED:-}" ]]; then
+    COV_ENABLED=0
+fi
+
+upload_pipeline() {  # Renders the AMD pipeline from the local template and uploads it; ends the script with `exit 0`.
+    echo "Uploading pipeline..."
+    # Annotate the build if .buildkite is missing, then install minijinja-cli 2.3.1 and source the agent's cargo env
+    ls .buildkite || buildkite-agent annotate --style error 'Please merge upstream main branch for buildkite CI'
+    curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh
+    source /var/lib/buildkite-agent/.cargo/env
+
+    if [[ $BUILDKITE_PIPELINE_SLUG == "fastcheck" ]]; then
+        AMD_MIRROR_HW="amdtentative"
+    fi
+
+    # Use the repo-local vllm-omni template, copied to the file name the render step below reads
+    cp .buildkite/test-template-amd-omni.j2 .buildkite/test-template.j2
+
+
+    # (WIP) If a pipeline generator script exists, use it instead of the jinja template and stop here
+    if [ -e ".buildkite/pipeline_generator/pipeline_generator.py" ]; then
+        python -m pip install click pydantic
+        python .buildkite/pipeline_generator/pipeline_generator.py --run_all=$RUN_ALL --list_file_diff="$LIST_FILE_DIFF" --nightly="$NIGHTLY" --mirror_hw="$AMD_MIRROR_HW"
+        buildkite-agent pipeline upload .buildkite/pipeline.yaml
+        exit 0
+    fi
+    echo "List file diff: $LIST_FILE_DIFF"
+    echo "Run all: $RUN_ALL"
+    echo "Nightly: $NIGHTLY"
+    echo "AMD Mirror HW: $AMD_MIRROR_HW"
+
+    FAIL_FAST=$(fail_fast)
+
+    cd .buildkite
+    (
+        set -x
+        # Render test-amd.yaml through the template into pipeline.yaml, dropping all blank lines
+        minijinja-cli test-template.j2 test-amd.yaml \
+            -D branch="$BUILDKITE_BRANCH" \
+            -D list_file_diff="$LIST_FILE_DIFF" \
+            -D run_all="$RUN_ALL" \
+            -D nightly="$NIGHTLY" \
+            -D mirror_hw="$AMD_MIRROR_HW" \
+            -D fail_fast="$FAIL_FAST" \
+            -D vllm_use_precompiled="$VLLM_USE_PRECOMPILED" \
+            -D vllm_merge_base_commit="$(git merge-base origin/main HEAD)" \
+            -D cov_enabled="$COV_ENABLED" \
+            -D vllm_ci_branch="$VLLM_CI_BRANCH" \
+            | sed '/^[[:space:]]*$/d' \
+            > pipeline.yaml
+    )
+    cat pipeline.yaml
+    buildkite-agent artifact upload pipeline.yaml
+    buildkite-agent pipeline upload pipeline.yaml
+    exit 0
+}
+
+get_diff() {  # Prints the paths changed relative to the merge-base with origin/main, joined by spaces.
+    git add . >/dev/null  # stage the tree directly; `$(git add .)` would execute git's output as a command
+    echo $(git diff --name-only --diff-filter=ACMDR $(git merge-base origin/main HEAD))
+}
+
+get_diff_main() {  # Prints the paths changed relative to HEAD~1, joined by spaces.
+    git add . >/dev/null  # stage the tree directly; `$(git add .)` would execute git's output as a command
+    echo $(git diff --name-only --diff-filter=ACMDR HEAD~1)
+}
+
+file_diff=$(get_diff)
+if [[ $BUILDKITE_BRANCH == "main" ]]; then
+    file_diff=$(get_diff_main)
+fi
+
+# ----------------------------------------------------------------------
+# Early exit start: skip pipeline if conditions are met
+# ----------------------------------------------------------------------
+
+# skip pipeline if all changed files are under docs/
+if [[ "${DOCS_ONLY_DISABLE}" != "1" ]]; then
+  if [[ -n "${file_diff:-}" ]]; then
+    docs_only=1
+    # Robust iteration over newline-separated file_diff
+    while IFS= read -r f; do
+      [[ -z "$f" ]] && continue
+      # **Policy:** only skip if *every* path starts with docs/
+      if [[ "$f" != docs/* ]]; then
+        docs_only=0
+        break
+      fi
+    done < <(printf '%s\n' "$file_diff" | tr ' ' '\n' | tr -d '\r')
+
+    if [[ "$docs_only" -eq 1 ]]; then
+      buildkite-agent annotate ":memo: CI skipped — docs/** only changes detected
+
+\`\`\`
+${file_diff}
+\`\`\`" --style "info" || true
+      echo "[docs-only] All changes are under docs/. Exiting before pipeline upload."
+      exit 0
+    fi
+  fi
+fi
+
+# ----------------------------------------------------------------------
+# Early exit end
+# ----------------------------------------------------------------------
+
+patterns=(
+    "docker/Dockerfile"
+    "CMakeLists.txt"
+    "requirements/common.txt"
+    "requirements/cuda.txt"
+    "requirements/build.txt"
+    "requirements/test.txt"
+    "setup.py"
+    "csrc/"
+    "cmake/"
+)
+
+ignore_patterns=(
+    "docker/Dockerfile."
+    "csrc/cpu"
+    "csrc/rocm"
+    "cmake/hipify.py"
+    "cmake/cpu_extension.cmake"
+)
+
+for file in $file_diff; do
+    # First check if file matches any pattern
+    matches_pattern=0
+    for pattern in "${patterns[@]}"; do
+        if [[ $file == $pattern* ]] || [[ $file == $pattern ]]; then
+            matches_pattern=1
+            break
+        fi
+    done
+
+    # If file matches pattern, check it's not in ignore patterns
+    if [[ $matches_pattern -eq 1 ]]; then
+        matches_ignore=0
+        for ignore in "${ignore_patterns[@]}"; do
+            if [[ $file == $ignore* ]] || [[ $file == $ignore ]]; then
+                matches_ignore=1
+                break
+            fi
+        done
+
+        if [[ $matches_ignore -eq 0 ]]; then
+            RUN_ALL=1
+            echo "Found changes: $file. Run all tests"
+            break
+        fi
+    fi
+done
+
+# Check for ready-run-all-tests label
+LABEL_RUN_ALL=$(check_run_all_label)
+if [[ $LABEL_RUN_ALL == true ]]; then
+    RUN_ALL=1
+    NIGHTLY=1
+    echo "Found 'ready-run-all-tests' label. Running all tests including optional tests."
+fi
+
+# Decide whether to use precompiled wheels
+# Relies on existing patterns array as a basis.
+if [[ -n "${VLLM_USE_PRECOMPILED:-}" ]]; then
+    echo "VLLM_USE_PRECOMPILED is already set to: $VLLM_USE_PRECOMPILED"
+elif [[ $RUN_ALL -eq 1 ]]; then
+    export VLLM_USE_PRECOMPILED=0
+    echo "Detected critical changes, building wheels from source"
+else
+    export VLLM_USE_PRECOMPILED=1
+    echo "No critical changes, using precompiled wheels"
+fi
+
+
+LIST_FILE_DIFF=$(get_diff | tr ' ' '|')
+if [[ $BUILDKITE_BRANCH == "main" ]]; then
+    LIST_FILE_DIFF=$(get_diff_main | tr ' ' '|')
+fi
+upload_pipeline
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
new file mode 100755
index 00000000000..64e97011768
--- /dev/null
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -0,0 +1,240 @@
+#!/bin/bash
+# vllm-omni customized version
+# Based on: vllm/.buildkite/scripts/hardware_ci/run-amd-test.sh
+# Last synced: 2025-12-15
+# Modifications: docker image name for vllm-omni
+
+# This script runs test inside the corresponding ROCm docker container.
+set -o pipefail
+
+# Export Python path
+export PYTHONPATH=".."
+
+# Print ROCm version
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+echo "--- ROCm info"
+rocminfo
+
+# Prune Docker images and volumes when the filesystem holding Docker's root directory is more than 70% full.
+cleanup_docker() {
+  # Ask Docker for its root directory; exit the script with status 1 if it comes back empty
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # Take the 5th field (use percentage) of the last `df` line for that filesystem and strip the % sign
+  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  # Cleanup threshold, in percent of disk used
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Remove dangling images (those that are not tagged and not used by any container)
+    docker image prune -f
+    # Remove unused volumes; if that succeeds, system-prune all unused objects matching the until=72h filter
+    docker volume prune -f && docker system prune --force --filter "until=72h" --all
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
+
+# Call the cleanup docker function
+cleanup_docker
+
+echo "--- Resetting GPUs"
+
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
+echo "--- Pulling container"
+image_name="public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${BUILDKITE_COMMIT}-rocm-omni"
+container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+docker pull "${image_name}"
+
+remove_docker_container() {  # Cleanup handler; the final `|| true` keeps it from ever returning a failure.
+   docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true  # the image is removed only if removing the container fails
+}
+trap remove_docker_container EXIT  # NOTE(review): shard containers are named "${container_name}_<GPU>" and are not removed here; confirm --rm is enough
+
+echo "--- Running container"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+commands=$@
+echo "Commands:$commands"
+
+commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
+
+if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
+  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
+fi
+
+commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
+
+if [[ $commands == *"pytest -v -s lora"* ]]; then
+  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+fi
+
+#ignore certain kernels tests
+if [[ $commands == *" kernels/core"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/core/test_fused_quant_layernorm.py \
+  --ignore=kernels/core/test_permute_cols.py"
+fi
+
+if [[ $commands == *" kernels/attention"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/attention/test_attention_selector.py \
+  --ignore=kernels/attention/test_encoder_decoder_attn.py \
+  --ignore=kernels/attention/test_flash_attn.py \
+  --ignore=kernels/attention/test_flashinfer.py \
+  --ignore=kernels/attention/test_prefix_prefill.py \
+  --ignore=kernels/attention/test_cascade_flash_attn.py \
+  --ignore=kernels/attention/test_mha_attn.py \
+  --ignore=kernels/attention/test_lightning_attn.py \
+  --ignore=kernels/attention/test_attention.py"
+fi
+
+if [[ $commands == *" kernels/quantization"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/quantization/test_int8_quant.py \
+  --ignore=kernels/quantization/test_machete_mm.py \
+  --ignore=kernels/quantization/test_block_fp8.py \
+  --ignore=kernels/quantization/test_block_int8.py \
+  --ignore=kernels/quantization/test_marlin_gemm.py \
+  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
+  --ignore=kernels/quantization/test_int8_kernel.py"
+fi
+
+if [[ $commands == *" kernels/mamba"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/mamba/test_mamba_mixer2.py \
+  --ignore=kernels/mamba/test_causal_conv1d.py \
+  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
+fi
+
+if [[ $commands == *" kernels/moe"* ]]; then
+  commands="${commands} \
+  --ignore=kernels/moe/test_moe.py \
+  --ignore=kernels/moe/test_cutlass_moe.py \
+  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+fi
+
+#ignore certain Entrypoints/openai tests
+if [[ $commands == *" entrypoints/openai "* ]]; then
+  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
+  --ignore=entrypoints/openai/test_audio.py \
+  --ignore=entrypoints/openai/test_shutdown.py \
+  --ignore=entrypoints/openai/test_completion.py \
+  --ignore=entrypoints/openai/test_sleep.py \
+  --ignore=entrypoints/openai/test_models.py \
+  --ignore=entrypoints/openai/test_lora_adapters.py \
+  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
+  --ignore=entrypoints/openai/test_root_path.py \
+  --ignore=entrypoints/openai/test_tokenization.py \
+  --ignore=entrypoints/openai/test_prompt_validation.py "}
+fi
+
+#ignore certain Entrypoints/llm tests
+if [[ $commands == *" entrypoints/llm "* ]]; then
+  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
+  --ignore=entrypoints/llm/test_chat.py \
+  --ignore=entrypoints/llm/test_accuracy.py \
+  --ignore=entrypoints/llm/test_init.py \
+  --ignore=entrypoints/llm/test_prompt_validation.py "}
+fi
+
+# --ignore=entrypoints/openai/test_encoder_decoder.py \
+# --ignore=entrypoints/openai/test_embedding.py \
+# --ignore=entrypoints/openai/test_oot_registration.py
+# --ignore=entrypoints/openai/test_accuracy.py \
+# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
+
+
+PARALLEL_JOB_COUNT=8
+MYPYTHONPATH=".."
+
+# Test that we're launching on the machine that has
+# proper access to GPUs
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+  echo "Error: 'render' group not found. This is required for GPU access." >&2
+  exit 1
+fi
+
+# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
+if [[ $commands == *"--shard-id="* ]]; then
+  # assign job count as the number of shards used
+  commands=$(echo "$commands" | sed -E "s/--num-shards[[:blank:]]*=[[:blank:]]*[0-9]*/--num-shards=${PARALLEL_JOB_COUNT} /g" | sed 's/ \\ / /g')
+  for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
+    # assign shard-id for each shard
+    commands_gpu=$(echo "$commands" | sed -E "s/--shard-id[[:blank:]]*=[[:blank:]]*[0-9]*/--shard-id=${GPU} /g" | sed 's/ \\ / /g')
+    echo "Shard ${GPU} commands:$commands_gpu"
+    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
+    docker run \
+        --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+        --network=host \
+        --shm-size=16gb \
+        --group-add "$render_gid" \
+        --rm \
+        -e HIP_VISIBLE_DEVICES="${GPU}" \
+        -e HF_TOKEN \
+        -e AWS_ACCESS_KEY_ID \
+        -e AWS_SECRET_ACCESS_KEY \
+        -v "${HF_CACHE}:${HF_MOUNT}" \
+        -e "HF_HOME=${HF_MOUNT}" \
+        -e "PYTHONPATH=${MYPYTHONPATH}" \
+        --name "${container_name}_${GPU}" \
+        "${image_name}" \
+        /bin/bash -c "${commands_gpu}" \
+        |& while read -r line; do echo ">>Shard $GPU: $line"; done &
+    PIDS+=($!)
+  done
+  #wait for all processes to finish and collect exit codes
+  for pid in "${PIDS[@]}"; do
+    wait "${pid}"
+    STATUS+=($?)
+  done
+  for st in "${STATUS[@]}"; do
+    if [[ ${st} -ne 0 ]]; then
+      echo "One of the processes failed with $st"
+      exit "${st}"
+    fi
+  done
+else
+  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
+  docker run \
+          --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+          --network=host \
+          --shm-size=16gb \
+          --group-add "$render_gid" \
+          --rm \
+          -e HF_TOKEN \
+          -e AWS_ACCESS_KEY_ID \
+          -e AWS_SECRET_ACCESS_KEY \
+          -v "${HF_CACHE}:${HF_MOUNT}" \
+          -e "HF_HOME=${HF_MOUNT}" \
+          -e "PYTHONPATH=${MYPYTHONPATH}" \
+          --name "${container_name}" \
+          "${image_name}" \
+          /bin/bash -c "${commands}"
+fi
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 67741c5ece1..d781c2b4bd6 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -1,16 +1,12 @@
 steps:
-  - label: ":docker: Build image"
-    key: image-build
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker build -f docker/Dockerfile.rocm -t vllm-omni-rocm-ci ."
-      - "docker tag vllm-omni-rocm-ci public.ecr.aws/q9t5s3a7/vllm-rocm-ci-test-repo:$BUILDKITE_COMMIT"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-rocm-ci-test-repo:$BUILDKITE_COMMIT"
-    agents:
-      queue: "cpu_queue_premerge_us_east_1"
 
-  - label: "Simple Unit Test"
-    commands:
-      - ".buildkite/scripts/simple_test.sh"
-    agents:
-      queue: "cpu_queue_premerge"
+- label: "Diffusion Model Test"
+  timeout_in_minutes: 15
+  agent_pool: mi325_2
+  depends_on: image-build
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  grade: Blocking
+  source_file_dependencies:
+  - ./
+  commands:
+    - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
\ No newline at end of file
diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2
new file mode 100644
index 00000000000..8c44b8bc1e6
--- /dev/null
+++ b/.buildkite/test-template-amd-omni.j2
@@ -0,0 +1,335 @@
+{# vllm-omni customized version
+   Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/test-template-amd.j2
+   Last synced: 2025-12-15
+   Modifications: docker image name, default working dir, build queue, build commands for vllm-omni
+#}
+{% set cov_enabled = (cov_enabled == "1") %}
+{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
+{% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-torch-nightly" %}
+{% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-cu118" %}
+{% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-cpu" %}
+{% if branch == "main" %}
+{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %}
+{% set docker_image_latest = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:latest" %}
+{% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-torch-nightly" %}
+{% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %}
+{% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %}
+{% endif %}
+{% set docker_image_amd = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-rocm-omni" %}
+{% set default_working_dir = "/app/vllm-omni/tests" %}
+{% set hf_home = "/root/.cache/huggingface" %}
+{% set hf_home_efs = "/mnt/efs/hf_cache" %}
+{% set hf_home_fsx = "/fsx/hf_cache" %}
+{% set list_file_diff = list_file_diff | split("|") %}
+
+{# Intelligent test targeting: Detect when only test files changed and collect them #}
+{%- set tests_acc = namespace(only_tests=true, any=false, changed=[]) %}
+{%- for file in list_file_diff %}
+{%- if file[:6] == 'tests/' and '/test_' in file and file[-3:] == '.py' %}
+{%- set tests_acc.any = true %}
+{%- set tests_acc.changed = tests_acc.changed + [file[6:]] %}
+{%- else %}
+{%- set tests_acc.only_tests = false %}
+{%- endif %}
+{%- endfor %}
+{%- set tests_only = (tests_acc.only_tests and tests_acc.any) %}
+{%- set changed_tests = tests_acc.changed %}
+
+{% macro add_pytest_coverage(cmd, coverage_file) %}{# Emit cmd with coverage flags injected after "pytest "; commands without "pytest " pass through unchanged. #}
+{% if "pytest " in cmd %}{# The trailing "|| true" makes the wrapped command succeed even if pytest fails. NOTE(review): --cov targets "vllm"; confirm whether "vllm_omni" is intended. #}
+COVERAGE_FILE={{ coverage_file }} {{ cmd | replace("pytest ", "pytest --cov=vllm --cov-report=xml --cov-append --durations=0 ") }} || true
+{% else %}
+{{ cmd }}
+{% endif %}
+{% endmacro %}
+
+{% macro add_docker_pytest_coverage(step, cov_enabled) %}
+{# Compute coverage file id #}
+{% set step_length = step.label | length %}
+{% set step_first = step.label | first | default("x") %}
+{% set coverage_file = ".coverage." + step_length ~ "_" ~ step_first %}
+
+{# Intelligent test targeting: Build matched test targets for this step when only tests changed #}
+{%- set match_ns = namespace(targets=[]) %}
+{%- if tests_only and step.source_file_dependencies %}
+{%- for dep in step.source_file_dependencies %}
+{%- if dep[:6] == 'tests/' %}
+{%- set dep_rel = dep[6:] %}
+{# Handle deps that already end with '/' (e.g., tests/benchmarks/) #}
+{%- if dep_rel[-1:] == '/' %}
+{%- set dep_dir_prefix = dep_rel %}
+{%- set dep_file_name = dep_rel[:-1] ~ '.py' %}
+{%- else %}
+{%- set dep_dir_prefix = dep_rel ~ '/' %}
+{%- set dep_file_name = dep_rel ~ '.py' %}
+{%- endif %}
+{%- for t in changed_tests %}
+{# Check if t starts with dep_dir_prefix (for directories) or equals dep_file_name (for files) #}
+{%- set prefix_len = dep_dir_prefix | length %}
+{%- set t_prefix = t[:prefix_len] %}
+{%- set cond1 = (t | length >= prefix_len and t_prefix == dep_dir_prefix) %}
+{%- set cond2 = (t == dep_file_name) %}
+{%- if cond1 or cond2 %}
+{%- set match_ns.targets = match_ns.targets + [t] %}
+{%- endif %}
+{%- endfor %}
+{%- endif %}
+{%- endfor %}
+{%- endif %}
+{%- set matched_targets = match_ns.targets %}
+
+{# If we have matched targets, run only those specific tests #}
+{% if matched_targets | length > 0 %}
+pytest -v -s {{ matched_targets | join(' ') }}
+{% else %}
+{# Default behavior: preserve original commands with optional coverage injection #}
+{% if cov_enabled %}
+{% set ns = namespace(has_pytest=false) %}
+{% if step.command %}
+{% if "pytest " in step.command %}{% set ns.has_pytest = true %}{% endif %}
+{{ add_pytest_coverage(step.command, coverage_file) }}
+{% else %}
+{% for cmd in step.commands %}
+{% if "pytest " in cmd %}{% set ns.has_pytest = true %}{% endif %}
+{{ add_pytest_coverage(cmd, coverage_file) }}{{ " && " if not loop.last else "" }}{% endfor %}
+{% endif %}{% if ns.has_pytest %} && curl -sSL https://raw.githubusercontent.com/vllm-project/ci-infra/{{ vllm_ci_branch | default('main') }}/buildkite/scripts/upload_codecov.sh | bash -s -- \"{{ step.label }}\"{% endif %}
+{% else %}
+{{ step.command or (step.commands | join(' && ')) | safe }}
+{% endif %}
+{% endif %}
+{% endmacro %}
+
+{% macro render_cuda_config(step, image, default_working_dir, hf_home_fsx, hf_home, branch) %}
+agents:
+  {% if step.label == "Documentation Build" %}
+  queue: small_cpu_queue_premerge
+  {% elif step.no_gpu %}
+  queue: cpu_queue_premerge_us_east_1
+  {% elif step.gpu == "a100" %}
+  queue: a100_queue
+  {% elif step.gpu == "h100" %}
+  queue: mithril-h100-pool
+  {% elif step.gpu == "h200" %}
+  queue: skylab-h200
+  {% elif step.gpu == "b200" %}
+  queue: B200
+  {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
+  queue: gpu_4_queue
+  {% else %}
+  queue: gpu_1_queue
+  {% endif %}
+
+{% if step.num_nodes >= 2 %}
+commands:
+  - ./.buildkite/scripts/run-multi-node-test.sh {{ (step.working_dir or default_working_dir) | safe }} {{ step.num_nodes }} {{ step.num_gpus }} {{ image }} {% for command in step.commands %}"{{ (command | join(' && ')) | safe }}" {% endfor %}
+{% endif %}
+
+{% if step.parallelism %}
+parallelism: {{ step.parallelism }}
+{% endif %}
+
+retry:
+  automatic:
+    - exit_status: -1
+      limit: 1
+    - exit_status: -10
+      limit: 1
+
+{% if step.num_nodes < 2 %}
+plugins:
+  {% if step.gpu != "a100" and step.gpu != "h100" and step.gpu != "h200" and step.gpu != "b200" %}
+  - docker#v5.2.0:
+      image: {{ image }}
+      always-pull: true
+      propagate-environment: true
+      {% if not step.no_gpu %}
+      gpus: all
+      {% endif %}
+      {% if step.label == "Benchmarks" or step.mount_buildkite_agent or cov_enabled %}
+      mount-buildkite-agent: true
+      {% endif %}
+      command: ["bash", "{% if fail_fast == "true" %}-xce{% else %}-xc{% endif %}", "(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ add_docker_pytest_coverage(step, cov_enabled) }}"]
+      environment:
+        - VLLM_USAGE_SOURCE=ci-test
+        - NCCL_CUMEM_HOST_ENABLE=0
+        - HF_HOME={{ hf_home_fsx }}
+        - HF_TOKEN
+        - CODECOV_TOKEN
+        {% if fail_fast == "true" %}
+        - PYTEST_ADDOPTS=-x
+        {% endif %}
+        {% if branch == "main" %}
+        - BUILDKITE_ANALYTICS_TOKEN
+        {% endif %}
+        {% if step.label == "Speculative decoding tests" %}
+        - VLLM_ATTENTION_BACKEND=XFORMERS
+        {% endif %}
+      volumes:
+        - /dev/shm:/dev/shm
+        - {{ hf_home_fsx }}:{{ hf_home_fsx }}
+  {% elif step.gpu == "h200" %}
+   - docker#v5.2.0:
+      image: {{ image }}
+      always-pull: true
+      propagate-environment: true
+      gpus: all
+      command: ["bash", "{% if fail_fast == "true" %}-xce{% else %}-xc{% endif %}", "(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ add_docker_pytest_coverage(step, cov_enabled) }}"]
+      environment:
+        - VLLM_USAGE_SOURCE=ci-test
+        - NCCL_CUMEM_HOST_ENABLE=0
+        - HF_HOME=/benchmark-hf-cache
+        - HF_TOKEN
+        - CODECOV_TOKEN
+        {% if fail_fast == "true" %}
+        - PYTEST_ADDOPTS=-x
+        {% endif %}
+        {% if branch == "main" %}
+        - BUILDKITE_ANALYTICS_TOKEN
+        {% endif %}
+      volumes:
+        - /dev/shm:/dev/shm
+        - /data/benchmark-hf-cache:/benchmark-hf-cache
+        - /data/benchmark-vllm-cache:/root/.cache/vllm
+  {% elif step.gpu == "b200" %}
+   - docker#v5.2.0:
+      image: {{ image }}
+      always-pull: true
+      propagate-environment: true
+      # gpus will be configured by BUILDKITE_PLUGIN_DOCKER_GPUS in per host environment variable.
+      # gpus: all
+      command: ["bash", "{% if fail_fast == "true" %}-xce{% else %}-xc{% endif %}", "(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ add_docker_pytest_coverage(step, cov_enabled) }}"]
+      environment:
+        - VLLM_USAGE_SOURCE=ci-test
+        - NCCL_CUMEM_HOST_ENABLE=0
+        - HF_HOME=/benchmark-hf-cache
+        - HF_TOKEN
+        - CODECOV_TOKEN
+        {% if fail_fast == "true" %}
+        - PYTEST_ADDOPTS=-x
+        {% endif %}
+        {% if branch == "main" %}
+        - BUILDKITE_ANALYTICS_TOKEN
+        {% endif %}
+      volumes:
+        - /dev/shm:/dev/shm
+        - /data/benchmark-hf-cache:/benchmark-hf-cache
+        - /data/benchmark-vllm-cache:/root/.cache/vllm
+  {% elif step.gpu == "h100" %}
+  - kubernetes:
+      podSpec:
+        containers:
+          - image: {{ image }}
+            command:
+              - bash -c "{{ '(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe) }}"
+            resources:
+              limits:
+                nvidia.com/gpu: {{ step.num_gpus or 1 }}
+            volumeMounts:
+              - name: devshm
+                mountPath: /dev/shm
+              - name: hf-cache
+                mountPath: {{ hf_home }}
+            env:
+              - name: VLLM_USAGE_SOURCE
+                value: ci-test
+              - name: NCCL_CUMEM_HOST_ENABLE
+                value: "0"
+              - name: HF_HOME
+                value: {{ hf_home }}
+        nodeSelector:
+          nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3
+        volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /mnt/hf-cache
+              type: Directory
+  {% else %}
+  - kubernetes:
+      podSpec:
+        priorityClassName: ci
+        containers:
+          - image: {{ image }}
+            command:
+              - bash -c "{{ '(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe) }}"
+            resources:
+              limits:
+                nvidia.com/gpu: {{ step.num_gpus or 1 }}
+            volumeMounts:
+              - name: devshm
+                mountPath: /dev/shm
+              - name: hf-cache
+                mountPath: {{ hf_home }}
+            env:
+              - name: VLLM_USAGE_SOURCE
+                value: ci-test
+              - name: NCCL_CUMEM_HOST_ENABLE
+                value: "0"
+              - name: HF_HOME
+                value: {{ hf_home }}
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
+        nodeSelector:
+          nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+        volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: {{ hf_home }}
+              type: Directory
+  {% endif %}
+{% endif %}
+{% endmacro %}
+
+  - group: "AMD Tests"
+    depends_on: ~
+    steps:
+      - label: "AMD: :docker: build image"
+        depends_on: ~
+        soft_fail: false
+        commands:
+          - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+          - "docker build -f docker/Dockerfile.rocm -t {{ docker_image_amd }} --progress plain ."
+          - "docker push {{ docker_image_amd }}"
+        key: "amd-build"
+        env:
+          DOCKER_BUILDKIT: "1"
+        retry:
+          automatic:
+            - exit_status: -1  # Agent was lost
+              limit: 1
+            - exit_status: -10  # Agent was lost
+              limit: 1
+            - exit_status: 1  # Machine occasionally fails
+              limit: 1
+        agents:
+          queue: cpu_queue_premerge_us_east_1
+
+    {% for step in steps %}
+    {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %}
+      - label: "{{ step.agent_pool }}: {{ step.label }}"
+        depends_on: amd-build
+        agents:
+          {% if step.agent_pool %}
+          queue: amd_{{ step.agent_pool }}
+          {% else %}
+          queue: amd_mi325_1
+          {% endif %}
+        command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" && ")) | safe }}"
+        env:
+          DOCKER_BUILDKIT: "1"
+        priority: 100
+        {% if step.grade and step.grade == "Blocking" %}
+        soft_fail: false
+        {% else %}
+        soft_fail: true
+        {% endif%}
+    {% endif %}
+    {% endfor %}

From 89987f3ab5735c03123fecbc18505f7508054c8e Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Tue, 16 Dec 2025 13:29:44 +0000
Subject: [PATCH 12/32] change the working directory of vllm omni docker
 image in CI; add is_rocm to handle unit tests that are failing

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml                      | 4 ++--
 .buildkite/test-template-amd-omni.j2          | 2 +-
 tests/e2e/__init__.py                         | 0
 tests/e2e/offline_inference/test_t2i_model.py | 6 ++++++
 vllm_omni/utils/platform_utils.py             | 6 ++++++
 5 files changed, 15 insertions(+), 3 deletions(-)
 create mode 100644 tests/e2e/__init__.py

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index d781c2b4bd6..9d891d1ff9e 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -4,9 +4,9 @@ steps:
   timeout_in_minutes: 15
   agent_pool: mi325_2
   depends_on: image-build
-  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  mirror_hardwares: [amdexperimental, amdtentative]
   grade: Blocking
   source_file_dependencies:
   - ./
   commands:
-    - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
\ No newline at end of file
+    - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2
index 8c44b8bc1e6..47a207580a4 100644
--- a/.buildkite/test-template-amd-omni.j2
+++ b/.buildkite/test-template-amd-omni.j2
@@ -16,7 +16,7 @@
 {% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %}
 {% endif %}
 {% set docker_image_amd = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-rocm-omni" %}
-{% set default_working_dir = "/app/vllm-omni/tests" %}
+{% set default_working_dir = "/app/vllm-omni" %}
 {% set hf_home = "/root/.cache/huggingface" %}
 {% set hf_home_efs = "/mnt/efs/hf_cache" %}
 {% set hf_home_fsx = "/fsx/hf_cache" %}
diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/e2e/offline_inference/test_t2i_model.py b/tests/e2e/offline_inference/test_t2i_model.py
index 77097836622..1cac2dfddf9 100644
--- a/tests/e2e/offline_inference/test_t2i_model.py
+++ b/tests/e2e/offline_inference/test_t2i_model.py
@@ -11,11 +11,17 @@
     sys.path.insert(0, str(REPO_ROOT))
 
 from vllm_omni import Omni
+from vllm_omni.utils.platform_utils import is_rocm
 
 os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "1"
 
 models = ["Tongyi-MAI/Z-Image-Turbo", "riverclouds/qwen_image_random"]
 
+if is_rocm():
+    # vLLM v0.11.0 has an issue running
+    # riverclouds/qwen_image_random on ROCm
+    models = ["Tongyi-MAI/Z-Image-Turbo"]
+
 
 @pytest.mark.parametrize("model_name", models)
 def test_diffusion_model(model_name: str):
diff --git a/vllm_omni/utils/platform_utils.py b/vllm_omni/utils/platform_utils.py
index 385b1a8f36c..679471415f0 100644
--- a/vllm_omni/utils/platform_utils.py
+++ b/vllm_omni/utils/platform_utils.py
@@ -6,6 +6,8 @@
 
 def detect_device_type() -> str:
     device_type = getattr(current_platform, "device_type", None)
+    if current_platform.is_rocm():
+        return "rocm"
     if isinstance(device_type, str) and device_type:
         return device_type.lower()
     if torch.cuda.is_available():
@@ -19,6 +21,10 @@ def is_npu() -> bool:
     return detect_device_type() == "npu"
 
 
+def is_rocm() -> bool:
+    return detect_device_type() == "rocm"
+
+
 def get_device_control_env_var() -> str:
     """Return the environment variable name for device visibility control."""
     if hasattr(current_platform, "device_control_env_var"):

From 19c3056404eda321244d1ee04730126b14141359 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Tue, 16 Dec 2025 14:41:21 +0000
Subject: [PATCH 13/32] fix test path; add qwen25 omni

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml                      |  16 ++-
 .../stage_configs/rocm/qwen2_5_omni_ci.yaml   | 105 ++++++++++++++++++
 .../offline_inference/test_qwen2_5_omni.py    |   9 +-
 tests/e2e/offline_inference/utils.py          |   1 +
 4 files changed, 127 insertions(+), 4 deletions(-)
 create mode 100644 tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 9d891d1ff9e..c0a4bd16e60 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -3,10 +3,22 @@ steps:
 - label: "Diffusion Model Test"
   timeout_in_minutes: 15
   agent_pool: mi325_2
-  depends_on: image-build
-  mirror_hardwares: [amdexperimental, amdtentative]
+  depends_on: amd-build
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   grade: Blocking
   source_file_dependencies:
   - ./
   commands:
     - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
+
+- label: "Omni Model Test"
+  timeout_in_minutes: 15
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  source_file_dependencies:
+  - ./
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
diff --git a/tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml
new file mode 100644
index 00000000000..96e9d7fa725
--- /dev/null
+++ b/tests/e2e/offline_inference/stage_configs/rocm/qwen2_5_omni_ci.yaml
@@ -0,0 +1,105 @@
+# stage config for running qwen2.5-omni with architecture of OmniLLM.
+
+# NOTE(review): appears carried over from the CUDA config (verified on 2x 24GB GPU, L4/RTX3090/RTX4090); confirm on ROCm (MI325).
+# This config is optimized for CI e2e tests.
+stage_args:
+  - stage_id: 0
+    runtime:
+      process: true            # Run this stage in a separate process
+      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      max_model_len: 896
+      max_num_batched_tokens: 896
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.8
+      skip_mm_profiling: true
+      enforce_eager: true  # Now we only support eager mode
+      trust_remote_code: true
+      engine_output_type: latent
+      enable_prefix_caching: false
+    is_comprehension: true
+    final_output: true
+    final_output_type: text
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+  - stage_id: 1
+    runtime:
+      process: true
+      devices: "1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: talker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      max_model_len: 896
+      max_num_batched_tokens: 896
+      max_num_seqs: 1
+      gpu_memory_utilization: 0.8
+      skip_mm_profiling: true
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: latent
+    engine_input_source: [0]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
+    default_sampling_params:
+      temperature: 0.9
+      top_p: 0.8
+      top_k: 40
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+      stop_token_ids: [8294]
+  - stage_id: 2
+    runtime:
+      process: true
+      devices: "0"            # Example: use a different GPU than the previous stage; use "0" if single GPU
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      gpu_memory_utilization: 0.15
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: audio
+    engine_input_source: [1]
+    final_output: true
+    final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 128
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+
+# Top-level runtime config (concise): default windows and stage edges
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1             # Simplified: trigger downstream only after full upstream completion
+    max_inflight: 1             # Simplified: process serially within each stage
+  edges:
+    - from: 0                   # thinker → talker: trigger only after receiving full input (-1)
+      to: 1
+      window_size: -1
+    - from: 1                   # talker → code2wav: trigger only after receiving full input (-1)
+      to: 2
+      window_size: -1
diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py b/tests/e2e/offline_inference/test_qwen2_5_omni.py
index 72478cfd40f..45e8449284f 100644
--- a/tests/e2e/offline_inference/test_qwen2_5_omni.py
+++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py
@@ -12,24 +12,29 @@
 from vllm.assets.video import VideoAsset
 from vllm.multimodal.image import convert_image_mode
 
+from vllm_omni.utils.platform_utils import detect_device_type
+
 from .conftest import OmniRunner
-from .utils import create_new_process_for_each_test
 
 models = ["Qwen/Qwen2.5-Omni-3B"]
 
 # CI stage config optimized for 24GB GPU (L4/RTX3090)
 stage_configs = [str(Path(__file__).parent / "stage_configs" / "qwen2_5_omni_ci.yaml")]
 
+if detect_device_type() != "cuda":
+    # ROCm stage config optimized for MI325 GPU
+    stage_configs = [str(Path(__file__).parent / "stage_configs" / detect_device_type() / "qwen2_5_omni_ci.yaml")]
+
 # Create parameter combinations for model and stage config
 test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
 
 
 @pytest.mark.core_model
 @pytest.mark.parametrize("test_config", test_params)
-@create_new_process_for_each_test()
 def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
     """Test processing audio, image, and video together, generating audio output."""
     model, stage_config_path = test_config
+    print(f"Running test for model: {model} and stage config: {stage_config_path}")
     with omni_runner(model, seed=42, stage_configs_path=stage_config_path) as runner:
         # Prepare multimodal inputs
         question = "What is recited in the audio? What is in this image? Describe the video briefly."
diff --git a/tests/e2e/offline_inference/utils.py b/tests/e2e/offline_inference/utils.py
index 931e7b506cb..82c46ff55dd 100644
--- a/tests/e2e/offline_inference/utils.py
+++ b/tests/e2e/offline_inference/utils.py
@@ -20,6 +20,7 @@
 VLLM_PATH = Path(__file__).parent.parent.parent
 """Path to root of the vLLM repository."""
 
+print(f"VLLM_PATH: {VLLM_PATH}")
 
 _P = ParamSpec("_P")
 

From 8ae3569f597ca1ca7e159e4c6800e01af531e684 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Tue, 16 Dec 2025 15:36:59 +0000
Subject: [PATCH 14/32] add necessary env flag for mi325 vllm 0.11.0

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index c0a4bd16e60..59a6e1f3a27 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -9,6 +9,11 @@ steps:
   source_file_dependencies:
   - ./
   commands:
+    - export MIOPEN_FIND_MODE=FAST
+    - export VLLM_ROCM_USE_AITER=1
+    - export VLLM_ROCM_USE_AITER_MHA=1
+    - export VLLM_ROCM_USE_AITER_LINEAR=0
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
 
 - label: "Omni Model Test"
@@ -21,4 +26,9 @@ steps:
   commands:
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export MIOPEN_FIND_MODE=FAST
+    - export VLLM_ROCM_USE_AITER=1
+    - export VLLM_ROCM_USE_AITER_MHA=1
+    - export VLLM_ROCM_USE_AITER_LINEAR=0
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py

From 442dc4483aa4d950e4811f9797c806a53e2cb257 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Tue, 16 Dec 2025 16:23:06 +0000
Subject: [PATCH 15/32] fix get device; add qwen3-omni unit tests

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml                      |  19 +++-
 docker/Dockerfile.rocm                        |   1 +
 .../stage_configs/rocm/qwen3_omni_ci.yaml     |  98 +++++++++++++++++
 .../offline_inference/test_qwen2_5_omni.py    |   6 +-
 .../e2e/offline_inference/test_qwen3_omni.py  |   6 ++
 vllm_omni/entrypoints/utils.py                |   6 +-
 .../stage_configs/rocm/qwen2_5_omni.yaml      | 102 ++++++++++++++++++
 .../stage_configs/rocm/qwen3_omni_moe.yaml    |  97 +++++++++++++++++
 vllm_omni/utils/__init__.py                   |   2 +
 vllm_omni/utils/platform_utils.py             |   4 +-
 10 files changed, 332 insertions(+), 9 deletions(-)
 create mode 100644 tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
 create mode 100644 vllm_omni/model_executor/stage_configs/rocm/qwen2_5_omni.yaml
 create mode 100644 vllm_omni/model_executor/stage_configs/rocm/qwen3_omni_moe.yaml

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 59a6e1f3a27..ab66b4d61a5 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -16,7 +16,7 @@ steps:
     - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
 
-- label: "Omni Model Test"
+- label: "Omni Model Test Qwen2-5-Omni"
   timeout_in_minutes: 15
   agent_pool: mi325_2
   depends_on: amd-build
@@ -32,3 +32,20 @@ steps:
     - export VLLM_ROCM_USE_AITER_LINEAR=0
     - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
+
+- label: "Omni Model Test Qwen3-Omni"
+  timeout_in_minutes: 15
+  agent_pool: mi325_2
+  depends_on: amd-build
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  source_file_dependencies:
+  - ./
+  commands:
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export MIOPEN_FIND_MODE=FAST
+    - export VLLM_ROCM_USE_AITER=1
+    - export VLLM_ROCM_USE_AITER_MHA=1
+    - export VLLM_ROCM_USE_AITER_LINEAR=0
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 872fb0c049a..849539e3a90 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -31,6 +31,7 @@ COPY . ${COMMON_WORKDIR}/vllm-omni
 RUN cd ${COMMON_WORKDIR}/vllm-omni && python3 -m pip install --no-cache-dir ".[dev]"
 
 # Create python symlink
+ENV GPU_ARCHS="gfx942;gfx950"
 RUN ln -sf /usr/bin/python3 /usr/bin/python
 
 ENTRYPOINT []
diff --git a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
new file mode 100644
index 00000000000..e9f87be387d
--- /dev/null
+++ b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
@@ -0,0 +1,98 @@
+# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
+# Stage 0: Thinker (multimodal understanding + text generation)
+# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
+# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
+
+# NOTE(review): appears carried over from the CUDA config (verified on 2x H100-80G GPUs); confirm on ROCm (MI325).
+stage_args:
+  - stage_id: 0
+    runtime:
+      devices: "0,1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.6
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: latent  # Output hidden states for talker
+      distributed_executor_backend: "mp"
+      enable_prefix_caching: false
+      hf_config_name: thinker_config
+      tensor_parallel_size: 2
+      load_format: dummy
+    final_output: true
+    final_output_type: text
+    is_comprehension: true
+    default_sampling_params:
+      temperature: 0.4
+      top_p: 0.9
+      top_k: 1
+      max_tokens: 100
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+
+  - stage_id: 1
+    runtime:
+       devices: "1"
+       max_batch_size: 1
+    engine_args:
+       model_stage: talker
+       model_arch: Qwen3OmniMoeForConditionalGeneration
+       worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+       gpu_memory_utilization: 0.5
+       enforce_eager: true
+       trust_remote_code: true
+       engine_output_type: latent  # Output codec codes for code2wav
+      #  tensor_parallel_size: 2
+       enable_prefix_caching: false
+       distributed_executor_backend: "mp"
+       hf_config_name: talker_config
+       load_format: dummy
+    engine_input_source: [0]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
+    # final_output: true
+    # final_output_type: text
+    default_sampling_params:
+      temperature: 0.9
+      top_k: 50
+      max_tokens: 100
+      seed: 42
+      detokenize: False
+      repetition_penalty: 1.05
+      stop_token_ids: [2150]
+
+  - stage_id: 2
+    runtime:
+      devices: "0"
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: audio  # Final output: audio waveform
+      gpu_memory_utilization: 0.1
+      distributed_executor_backend: "mp"
+      max_num_batched_tokens: 1000000
+      hf_config_name: thinker_config
+      load_format: dummy
+    engine_input_source: [1]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
+    final_output: true
+    final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 200
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py b/tests/e2e/offline_inference/test_qwen2_5_omni.py
index 45e8449284f..f0ac1723160 100644
--- a/tests/e2e/offline_inference/test_qwen2_5_omni.py
+++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py
@@ -12,7 +12,7 @@
 from vllm.assets.video import VideoAsset
 from vllm.multimodal.image import convert_image_mode
 
-from vllm_omni.utils.platform_utils import detect_device_type
+from vllm_omni.utils.platform_utils import is_rocm
 
 from .conftest import OmniRunner
 
@@ -21,9 +21,9 @@
 # CI stage config optimized for 24GB GPU (L4/RTX3090)
 stage_configs = [str(Path(__file__).parent / "stage_configs" / "qwen2_5_omni_ci.yaml")]
 
-if detect_device_type() != "cuda":
+if is_rocm():
     # ROCm stage config optimized for MI325 GPU
-    stage_configs = [str(Path(__file__).parent / "stage_configs" / detect_device_type() / "qwen2_5_omni_ci.yaml")]
+    stage_configs = [str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")]
 
 # Create parameter combinations for model and stage config
 test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
diff --git a/tests/e2e/offline_inference/test_qwen3_omni.py b/tests/e2e/offline_inference/test_qwen3_omni.py
index b43fa836174..ec1fc4f15ac 100644
--- a/tests/e2e/offline_inference/test_qwen3_omni.py
+++ b/tests/e2e/offline_inference/test_qwen3_omni.py
@@ -10,6 +10,8 @@
 import pytest
 from vllm.assets.video import VideoAsset
 
+from vllm_omni.utils.platform_utils import is_rocm
+
 from .conftest import OmniRunner
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -19,6 +21,10 @@
 # CI stage config for 2xH100-80G GPUs
 stage_configs = [str(Path(__file__).parent / "stage_configs" / "qwen3_omni_ci.yaml")]
 
+if is_rocm():
+    # On ROCm, swap in the Qwen3-Omni CI stage config kept under stage_configs/rocm/
+    stage_configs = [str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen3_omni_ci.yaml")]
+
 # Create parameter combinations for model and stage config
 test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
 
diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py
index e092325f8f1..5256abc3a3e 100644
--- a/vllm_omni/entrypoints/utils.py
+++ b/vllm_omni/entrypoints/utils.py
@@ -7,7 +7,7 @@
 from omegaconf import OmegaConf
 from vllm.transformers_utils.config import get_config
 
-from vllm_omni.utils import detect_device_type
+from vllm_omni.utils import detect_device_type, is_rocm
 
 # Get the project root directory (2 levels up from this file)
 PROJECT_ROOT = Path(__file__).parent.parent.parent
@@ -83,8 +83,10 @@ def resolve_model_config_path(model: str) -> str:
     device_type = detect_device_type()
 
     # Try device-specific config first
-    if device_type != "cuda":
+    if device_type != "cuda" or is_rocm():
         device_config_file = f"vllm_omni/model_executor/stage_configs/{device_type}/{model_type}.yaml"
+        if is_rocm():
+            device_config_file = f"vllm_omni/model_executor/stage_configs/rocm/{model_type}.yaml"
         device_config_path = PROJECT_ROOT / device_config_file
         if os.path.exists(device_config_path):
             return str(device_config_path)
diff --git a/vllm_omni/model_executor/stage_configs/rocm/qwen2_5_omni.yaml b/vllm_omni/model_executor/stage_configs/rocm/qwen2_5_omni.yaml
new file mode 100644
index 00000000000..c646aa76a9d
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/rocm/qwen2_5_omni.yaml
@@ -0,0 +1,102 @@
+# stage config for running qwen2.5-omni with architecture of OmniLLM.
+
+# NOTE(review): appears carried over from the CUDA config (verified on 2x H100-80G GPU); confirm on ROCm (MI325).
+stage_args:
+  - stage_id: 0
+    runtime:
+      process: true            # Run this stage in a separate process
+      devices: "0"            # Visible devices for this stage (CUDA_VISIBLE_DEVICES/torch.cuda.set_device)
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.8
+      enforce_eager: true  # Now we only support eager mode
+      trust_remote_code: true
+      engine_output_type: latent
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+    is_comprehension: true
+    final_output: true
+    final_output_type: text
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+
+  - stage_id: 1
+    runtime:
+      process: true
+      devices: "1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: talker
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.8
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      engine_output_type: latent
+    engine_input_source: [0]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen2_5_omni.thinker2talker
+    default_sampling_params:
+      temperature: 0.9
+      top_p: 0.8
+      top_k: 40
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+      stop_token_ids: [8294]
+
+  - stage_id: 2
+    runtime:
+      process: true
+      devices: "2"            # Example: use a different GPU than the previous stage; use "0" if single GPU
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen2_5OmniForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      gpu_memory_utilization: 0.15
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      engine_output_type: audio
+    engine_input_source: [1]
+    final_output: true
+    final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
+
+# Top-level runtime config (concise): default windows and stage edges
+runtime:
+  enabled: true
+  defaults:
+    window_size: -1             # Simplified: trigger downstream only after full upstream completion
+    max_inflight: 1             # Simplified: process serially within each stage
+
+  edges:
+    - from: 0                   # thinker → talker: trigger only after receiving full input (-1)
+      to: 1
+      window_size: -1
+    - from: 1                   # talker → code2wav: trigger only after receiving full input (-1)
+      to: 2
+      window_size: -1
diff --git a/vllm_omni/model_executor/stage_configs/rocm/qwen3_omni_moe.yaml b/vllm_omni/model_executor/stage_configs/rocm/qwen3_omni_moe.yaml
new file mode 100644
index 00000000000..73f65ecb557
--- /dev/null
+++ b/vllm_omni/model_executor/stage_configs/rocm/qwen3_omni_moe.yaml
@@ -0,0 +1,97 @@
+# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
+# Stage 0: Thinker (multimodal understanding + text generation)
+# Stage 1: Talker (text embeddings → 8-layer RVQ codec codes)
+# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
+
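+# The following config has been verified on 2x H100-80G GPUs. NOTE(review): that is NVIDIA hardware, but this is the ROCm config - confirm these values on AMD GPUs.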
+stage_args:
+  - stage_id: 0
+    runtime:
+      devices: "0,1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: thinker
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.6
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: latent  # Output hidden states for talker
+      distributed_executor_backend: "mp"
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      hf_config_name: thinker_config
+      tensor_parallel_size: 2
+    final_output: true
+    final_output_type: text
+    is_comprehension: true
+    default_sampling_params:
+      temperature: 0.4
+      top_p: 0.9
+      top_k: 1
+      max_tokens: 2048
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.05
+
+  - stage_id: 1
+    runtime:
+      devices: "1"
+      max_batch_size: 1
+    engine_args:
+      model_stage: talker
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+      gpu_memory_utilization: 0.3
+      enforce_eager: true
+      trust_remote_code: true
+      engine_output_type: latent  # Output codec codes for code2wav
+      # tensor_parallel_size: 2
+      enable_prefix_caching: false
+      max_num_batched_tokens: 32768
+      distributed_executor_backend: "mp"
+      hf_config_name: talker_config
+    engine_input_source: [0]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
+    # final_output: true
+    # final_output_type: text
+    default_sampling_params:
+      temperature: 0.9
+      top_k: 50
+      max_tokens: 4096
+      seed: 42
+      detokenize: False
+      repetition_penalty: 1.05
+      stop_token_ids: [2150]
+
+  - stage_id: 2
+    runtime:
+      devices: "0"
+      max_batch_size: 1
+    engine_args:
+      model_stage: code2wav
+      model_arch: Qwen3OmniMoeForConditionalGeneration
+      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+      enforce_eager: true
+      trust_remote_code: true
+      enable_prefix_caching: false
+      engine_output_type: audio  # Final output: audio waveform
+      gpu_memory_utilization: 0.1
+      distributed_executor_backend: "mp"
+      max_num_batched_tokens: 1000000
+      hf_config_name: thinker_config
+    engine_input_source: [1]
+    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
+    final_output: true
+    final_output_type: audio
+    default_sampling_params:
+      temperature: 0.0
+      top_p: 1.0
+      top_k: -1
+      max_tokens: 65536
+      seed: 42
+      detokenize: True
+      repetition_penalty: 1.1
diff --git a/vllm_omni/utils/__init__.py b/vllm_omni/utils/__init__.py
index 50dbb478d90..34b2545db59 100644
--- a/vllm_omni/utils/__init__.py
+++ b/vllm_omni/utils/__init__.py
@@ -2,10 +2,12 @@
     detect_device_type,
     get_device_control_env_var,
     is_npu,
+    is_rocm,
 )
 
 __all__ = [
     "detect_device_type",
     "get_device_control_env_var",
     "is_npu",
+    "is_rocm",
 ]
diff --git a/vllm_omni/utils/platform_utils.py b/vllm_omni/utils/platform_utils.py
index 679471415f0..5f8259ab83d 100644
--- a/vllm_omni/utils/platform_utils.py
+++ b/vllm_omni/utils/platform_utils.py
@@ -6,8 +6,6 @@
 
 def detect_device_type() -> str:
     device_type = getattr(current_platform, "device_type", None)
-    if current_platform.is_rocm():
-        return "rocm"
     if isinstance(device_type, str) and device_type:
         return device_type.lower()
     if torch.cuda.is_available():
@@ -22,7 +20,7 @@ def is_npu() -> bool:
 
 
 def is_rocm() -> bool:
-    return detect_device_type() == "rocm"
+    return current_platform.is_rocm()
 
 
 def get_device_control_env_var() -> str:

From 3532ec7da180206be99925b347937664592662df Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Tue, 16 Dec 2025 16:31:22 +0000
Subject: [PATCH 16/32] fix the file pointed to by the qwen3 omni test

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 tests/e2e/offline_inference/test_qwen3_omni.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/e2e/offline_inference/test_qwen3_omni.py b/tests/e2e/offline_inference/test_qwen3_omni.py
index ec1fc4f15ac..9fa07839f0e 100644
--- a/tests/e2e/offline_inference/test_qwen3_omni.py
+++ b/tests/e2e/offline_inference/test_qwen3_omni.py
@@ -23,7 +23,7 @@
 
 if is_rocm():
     # ROCm stage config optimized for MI325 GPU
-    stage_configs = [str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen2_5_omni_ci.yaml")]
+    stage_configs = [str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen3_omni_ci.yaml")]
 
 # Create parameter combinations for model and stage config
 test_params = [(model, stage_config) for model in models for stage_config in stage_configs]

From 056fe9afb1ae1fe00220edd4ce3ba0ea20dcfd0a Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 17 Dec 2025 04:17:27 +0000
Subject: [PATCH 17/32] trying to fix aiter mi325x arch auto detection issue

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 21 +++++++++++++++++++++
 docker/Dockerfile.rocm   |  2 +-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index ab66b4d61a5..fc45004f91e 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -9,6 +9,7 @@ steps:
   source_file_dependencies:
   - ./
   commands:
+    - export GPU_ARCHS=gfx942
     - export MIOPEN_FIND_MODE=FAST
     - export VLLM_ROCM_USE_AITER=1
     - export VLLM_ROCM_USE_AITER_MHA=1
@@ -16,6 +17,24 @@ steps:
     - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_t2i_model.py
 
+- label: "Diffusion Cache Backend Test"
+  timeout_in_minutes: 15
+  agent_pool: mi325_1
+  depends_on: amd-build
+  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
+  source_file_dependencies:
+  - ./
+  commands:
+    - export GPU_ARCHS=gfx942
+    - export VLLM_LOGGING_LEVEL=DEBUG
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - export MIOPEN_FIND_MODE=FAST
+    - export VLLM_ROCM_USE_AITER=1
+    - export VLLM_ROCM_USE_AITER_MHA=1
+    - export VLLM_ROCM_USE_AITER_LINEAR=0
+    - export VLLM_ROCM_USE_AITER_RMSNORM=0
+    - pytest -s -v tests/e2e/offline_inference/test_cache_dit.py tests/e2e/offline_inference/test_teacache.py
+
 - label: "Omni Model Test Qwen2-5-Omni"
   timeout_in_minutes: 15
   agent_pool: mi325_2
@@ -24,6 +43,7 @@ steps:
   source_file_dependencies:
   - ./
   commands:
+    - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - export MIOPEN_FIND_MODE=FAST
@@ -41,6 +61,7 @@ steps:
   source_file_dependencies:
   - ./
   commands:
+    - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - export MIOPEN_FIND_MODE=FAST
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 849539e3a90..c8093330901 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -31,7 +31,7 @@ COPY . ${COMMON_WORKDIR}/vllm-omni
 RUN cd ${COMMON_WORKDIR}/vllm-omni && python3 -m pip install --no-cache-dir ".[dev]"
 
 # Create python symlink
-RUN export GPU_ARCHS="gfx942;gfx950"
+ENV GPU_ARCHS="gfx942;gfx950"
 RUN ln -sf /usr/bin/python3 /usr/bin/python
 
 ENTRYPOINT []

From 7c689e3668143d3e4d7d077b0345fd2788298a07 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 17 Dec 2025 04:46:59 +0000
Subject: [PATCH 18/32] fix the rocm qwen3 omni unit test

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .../offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
index e9f87be387d..fb955f72478 100644
--- a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
+++ b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
@@ -14,7 +14,7 @@ stage_args:
       model_arch: Qwen3OmniMoeForConditionalGeneration
       worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.6
+      gpu_memory_utilization: 0.4
       enforce_eager: true
       trust_remote_code: true
       engine_output_type: latent  # Output hidden states for talker
@@ -44,7 +44,7 @@ stage_args:
        model_arch: Qwen3OmniMoeForConditionalGeneration
        worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
        scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-       gpu_memory_utilization: 0.5
+       gpu_memory_utilization: 0.4
        enforce_eager: true
        trust_remote_code: true
        engine_output_type: latent  # Output codec codes for code2wav

From ce45b1fac1fe8ff4244ad970633a1345a915a82b Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 17 Dec 2025 14:17:44 +0000
Subject: [PATCH 19/32] remove qwen3 unit tests first; reuse AITER_ROCM_ARCH
 from base image

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml | 18 ------------------
 docker/Dockerfile.rocm   |  7 ++++++-
 2 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index fc45004f91e..fb84b57c74e 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -52,21 +52,3 @@ steps:
     - export VLLM_ROCM_USE_AITER_LINEAR=0
     - export VLLM_ROCM_USE_AITER_RMSNORM=0
     - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
-
-- label: "Omni Model Test Qwen3-Omni"
-  timeout_in_minutes: 15
-  agent_pool: mi325_2
-  depends_on: amd-build
-  mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  source_file_dependencies:
-  - ./
-  commands:
-    - export GPU_ARCHS=gfx942
-    - export VLLM_LOGGING_LEVEL=DEBUG
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_FIND_MODE=FAST
-    - export VLLM_ROCM_USE_AITER=1
-    - export VLLM_ROCM_USE_AITER_MHA=1
-    - export VLLM_ROCM_USE_AITER_LINEAR=0
-    - export VLLM_ROCM_USE_AITER_RMSNORM=0
-    - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index c8093330901..1885306d3fd 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -31,7 +31,12 @@ COPY . ${COMMON_WORKDIR}/vllm-omni
 RUN cd ${COMMON_WORKDIR}/vllm-omni && python3 -m pip install --no-cache-dir ".[dev]"
 
 # Create python symlink
-ENV GPU_ARCHS="gfx942;gfx950"
+# `GPU_ARCHS` is an environment variable used to set the GPU archs for AITER.
+# Setting it explicitly prevents AITER's automatic GPU arch detection from failing on MI325X:
+# the AITER version used in this Dockerfile does not handle the GPU archs of
+# MI325X (the CI machine) correctly, so we set the GPU archs manually here.
+# We reuse AITER_ROCM_ARCH from the base image to avoid duplicating the arch list.
+ENV GPU_ARCHS=${AITER_ROCM_ARCH}
 RUN ln -sf /usr/bin/python3 /usr/bin/python
 
 ENTRYPOINT []

From 5e9c4d3718e728dcac46025dfe4a61a808e3252b Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 17 Dec 2025 14:24:08 +0000
Subject: [PATCH 20/32] remove print

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 tests/e2e/offline_inference/test_qwen2_5_omni.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py b/tests/e2e/offline_inference/test_qwen2_5_omni.py
index 2b7dd4403ab..f5f9894b386 100644
--- a/tests/e2e/offline_inference/test_qwen2_5_omni.py
+++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py
@@ -37,7 +37,6 @@
 def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
     """Test processing audio, image, and video together, generating audio output."""
     model, stage_config_path = test_config
-    print(f"Running test for model: {model} and stage config: {stage_config_path}")
     with omni_runner(model, seed=42, stage_configs_path=stage_config_path) as runner:
         # Prepare multimodal inputs
         question = "What is recited in the audio? What is in this image? Describe the video briefly."

From d865d1800cbd95e66b9c16e32be27dcdf414be67 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 17 Dec 2025 14:26:19 +0000
Subject: [PATCH 21/32] simplify more

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .../scripts/hardware_ci/run-amd-test.sh       | 88 -------------------
 1 file changed, 88 deletions(-)

diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 64e97011768..a291f1b8c47 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -82,94 +82,6 @@ HF_MOUNT="/root/.cache/huggingface"
 commands=$@
 echo "Commands:$commands"
 
-commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"pytest -v -s basic_correctness/test_basic_correctness.py"}
-
-if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
-  commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
-fi
-
-commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"pytest -v -s compile/test_basic_correctness.py"}
-
-if [[ $commands == *"pytest -v -s lora"* ]]; then
-  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
-fi
-
-#ignore certain kernels tests
-if [[ $commands == *" kernels/core"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/core/test_fused_quant_layernorm.py \
-  --ignore=kernels/core/test_permute_cols.py"
-fi
-
-if [[ $commands == *" kernels/attention"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/attention/test_attention_selector.py \
-  --ignore=kernels/attention/test_encoder_decoder_attn.py \
-  --ignore=kernels/attention/test_flash_attn.py \
-  --ignore=kernels/attention/test_flashinfer.py \
-  --ignore=kernels/attention/test_prefix_prefill.py \
-  --ignore=kernels/attention/test_cascade_flash_attn.py \
-  --ignore=kernels/attention/test_mha_attn.py \
-  --ignore=kernels/attention/test_lightning_attn.py \
-  --ignore=kernels/attention/test_attention.py"
-fi
-
-if [[ $commands == *" kernels/quantization"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/quantization/test_int8_quant.py \
-  --ignore=kernels/quantization/test_machete_mm.py \
-  --ignore=kernels/quantization/test_block_fp8.py \
-  --ignore=kernels/quantization/test_block_int8.py \
-  --ignore=kernels/quantization/test_marlin_gemm.py \
-  --ignore=kernels/quantization/test_cutlass_scaled_mm.py \
-  --ignore=kernels/quantization/test_int8_kernel.py"
-fi
-
-if [[ $commands == *" kernels/mamba"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/mamba/test_mamba_mixer2.py \
-  --ignore=kernels/mamba/test_causal_conv1d.py \
-  --ignore=kernels/mamba/test_mamba_ssm_ssd.py"
-fi
-
-if [[ $commands == *" kernels/moe"* ]]; then
-  commands="${commands} \
-  --ignore=kernels/moe/test_moe.py \
-  --ignore=kernels/moe/test_cutlass_moe.py \
-  --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
-fi
-
-#ignore certain Entrypoints/openai tests
-if [[ $commands == *" entrypoints/openai "* ]]; then
-  commands=${commands//" entrypoints/openai "/" entrypoints/openai \
-  --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_shutdown.py \
-  --ignore=entrypoints/openai/test_completion.py \
-  --ignore=entrypoints/openai/test_sleep.py \
-  --ignore=entrypoints/openai/test_models.py \
-  --ignore=entrypoints/openai/test_lora_adapters.py \
-  --ignore=entrypoints/openai/test_return_tokens_as_ids.py \
-  --ignore=entrypoints/openai/test_root_path.py \
-  --ignore=entrypoints/openai/test_tokenization.py \
-  --ignore=entrypoints/openai/test_prompt_validation.py "}
-fi
-
-#ignore certain Entrypoints/llm tests
-if [[ $commands == *" entrypoints/llm "* ]]; then
-  commands=${commands//" entrypoints/llm "/" entrypoints/llm \
-  --ignore=entrypoints/llm/test_chat.py \
-  --ignore=entrypoints/llm/test_accuracy.py \
-  --ignore=entrypoints/llm/test_init.py \
-  --ignore=entrypoints/llm/test_prompt_validation.py "}
-fi
-
-# --ignore=entrypoints/openai/test_encoder_decoder.py \
-# --ignore=entrypoints/openai/test_embedding.py \
-# --ignore=entrypoints/openai/test_oot_registration.py
-# --ignore=entrypoints/openai/test_accuracy.py \
-# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
-
-
 PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."
 

From 5053c1bd08cb92d6fdd09565f94d682ca48bc72c Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Wed, 17 Dec 2025 16:15:34 +0000
Subject: [PATCH 22/32] keep the template small

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml             |   8 +-
 .buildkite/test-template-amd-omni.j2 | 286 +--------------------------
 2 files changed, 4 insertions(+), 290 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index fb84b57c74e..9a47f5e9b04 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -6,8 +6,6 @@ steps:
   depends_on: amd-build
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
   grade: Blocking
-  source_file_dependencies:
-  - ./
   commands:
     - export GPU_ARCHS=gfx942
     - export MIOPEN_FIND_MODE=FAST
@@ -22,8 +20,7 @@ steps:
   agent_pool: mi325_1
   depends_on: amd-build
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  source_file_dependencies:
-  - ./
+  grade: Blocking
   commands:
     - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
@@ -40,8 +37,7 @@ steps:
   agent_pool: mi325_2
   depends_on: amd-build
   mirror_hardwares: [amdexperimental, amdproduction, amdtentative]
-  source_file_dependencies:
-  - ./
+  grade: Blocking
   commands:
     - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
diff --git a/.buildkite/test-template-amd-omni.j2 b/.buildkite/test-template-amd-omni.j2
index 47a207580a4..0b6eb8f54b2 100644
--- a/.buildkite/test-template-amd-omni.j2
+++ b/.buildkite/test-template-amd-omni.j2
@@ -1,292 +1,10 @@
 {# vllm-omni customized version
    Based on: https://github.com/vllm-project/ci-infra/blob/main/buildkite/test-template-amd.j2
    Last synced: 2025-12-15
-   Modifications: docker image name, default working dir, build queue, build commands for vllm-omni
+   Modifications: Removed unused CUDA/NVIDIA logic, keeping only AMD tests
 #}
-{% set cov_enabled = (cov_enabled == "1") %}
-{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
-{% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-torch-nightly" %}
-{% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-cu118" %}
-{% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-cpu" %}
-{% if branch == "main" %}
-{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT" %}
-{% set docker_image_latest = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:latest" %}
-{% set docker_image_torch_nightly = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-torch-nightly" %}
-{% set docker_image_cu118 = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cu118" %}
-{% set docker_image_cpu = "public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT-cpu" %}
-{% endif %}
 {% set docker_image_amd = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT-rocm-omni" %}
 {% set default_working_dir = "/app/vllm-omni" %}
-{% set hf_home = "/root/.cache/huggingface" %}
-{% set hf_home_efs = "/mnt/efs/hf_cache" %}
-{% set hf_home_fsx = "/fsx/hf_cache" %}
-{% set list_file_diff = list_file_diff | split("|") %}
-
-{# Intelligent test targeting: Detect when only test files changed and collect them #}
-{%- set tests_acc = namespace(only_tests=true, any=false, changed=[]) %}
-{%- for file in list_file_diff %}
-{%- if file[:6] == 'tests/' and '/test_' in file and file[-3:] == '.py' %}
-{%- set tests_acc.any = true %}
-{%- set tests_acc.changed = tests_acc.changed + [file[6:]] %}
-{%- else %}
-{%- set tests_acc.only_tests = false %}
-{%- endif %}
-{%- endfor %}
-{%- set tests_only = (tests_acc.only_tests and tests_acc.any) %}
-{%- set changed_tests = tests_acc.changed %}
-
-{% macro add_pytest_coverage(cmd, coverage_file) %}
-{% if "pytest " in cmd %}
-COVERAGE_FILE={{ coverage_file }} {{ cmd | replace("pytest ", "pytest --cov=vllm --cov-report=xml --cov-append --durations=0 ") }} || true
-{% else %}
-{{ cmd }}
-{% endif %}
-{% endmacro %}
-
-{% macro add_docker_pytest_coverage(step, cov_enabled) %}
-{# Compute coverage file id #}
-{% set step_length = step.label | length %}
-{% set step_first = step.label | first | default("x") %}
-{% set coverage_file = ".coverage." + step_length ~ "_" ~ step_first %}
-
-{# Intelligent test targeting: Build matched test targets for this step when only tests changed #}
-{%- set match_ns = namespace(targets=[]) %}
-{%- if tests_only and step.source_file_dependencies %}
-{%- for dep in step.source_file_dependencies %}
-{%- if dep[:6] == 'tests/' %}
-{%- set dep_rel = dep[6:] %}
-{# Handle deps that already end with '/' (e.g., tests/benchmarks/) #}
-{%- if dep_rel[-1:] == '/' %}
-{%- set dep_dir_prefix = dep_rel %}
-{%- set dep_file_name = dep_rel[:-1] ~ '.py' %}
-{%- else %}
-{%- set dep_dir_prefix = dep_rel ~ '/' %}
-{%- set dep_file_name = dep_rel ~ '.py' %}
-{%- endif %}
-{%- for t in changed_tests %}
-{# Check if t starts with dep_dir_prefix (for directories) or equals dep_file_name (for files) #}
-{%- set prefix_len = dep_dir_prefix | length %}
-{%- set t_prefix = t[:prefix_len] %}
-{%- set cond1 = (t | length >= prefix_len and t_prefix == dep_dir_prefix) %}
-{%- set cond2 = (t == dep_file_name) %}
-{%- if cond1 or cond2 %}
-{%- set match_ns.targets = match_ns.targets + [t] %}
-{%- endif %}
-{%- endfor %}
-{%- endif %}
-{%- endfor %}
-{%- endif %}
-{%- set matched_targets = match_ns.targets %}
-
-{# If we have matched targets, run only those specific tests #}
-{% if matched_targets | length > 0 %}
-pytest -v -s {{ matched_targets | join(' ') }}
-{% else %}
-{# Default behavior: preserve original commands with optional coverage injection #}
-{% if cov_enabled %}
-{% set ns = namespace(has_pytest=false) %}
-{% if step.command %}
-{% if "pytest " in step.command %}{% set ns.has_pytest = true %}{% endif %}
-{{ add_pytest_coverage(step.command, coverage_file) }}
-{% else %}
-{% for cmd in step.commands %}
-{% if "pytest " in cmd %}{% set ns.has_pytest = true %}{% endif %}
-{{ add_pytest_coverage(cmd, coverage_file) }}{{ " && " if not loop.last else "" }}{% endfor %}
-{% endif %}{% if ns.has_pytest %} && curl -sSL https://raw.githubusercontent.com/vllm-project/ci-infra/{{ vllm_ci_branch | default('main') }}/buildkite/scripts/upload_codecov.sh | bash -s -- \"{{ step.label }}\"{% endif %}
-{% else %}
-{{ step.command or (step.commands | join(' && ')) | safe }}
-{% endif %}
-{% endif %}
-{% endmacro %}
-
-{% macro render_cuda_config(step, image, default_working_dir, hf_home_fsx, hf_home, branch) %}
-agents:
-  {% if step.label == "Documentation Build" %}
-  queue: small_cpu_queue_premerge
-  {% elif step.no_gpu %}
-  queue: cpu_queue_premerge_us_east_1
-  {% elif step.gpu == "a100" %}
-  queue: a100_queue
-  {% elif step.gpu == "h100" %}
-  queue: mithril-h100-pool
-  {% elif step.gpu == "h200" %}
-  queue: skylab-h200
-  {% elif step.gpu == "b200" %}
-  queue: B200
-  {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
-  queue: gpu_4_queue
-  {% else %}
-  queue: gpu_1_queue
-  {% endif %}
-
-{% if step.num_nodes >= 2 %}
-commands:
-  - ./.buildkite/scripts/run-multi-node-test.sh {{ (step.working_dir or default_working_dir) | safe }} {{ step.num_nodes }} {{ step.num_gpus }} {{ image }} {% for command in step.commands %}"{{ (command | join(' && ')) | safe }}" {% endfor %}
-{% endif %}
-
-{% if step.parallelism %}
-parallelism: {{ step.parallelism }}
-{% endif %}
-
-retry:
-  automatic:
-    - exit_status: -1
-      limit: 1
-    - exit_status: -10
-      limit: 1
-
-{% if step.num_nodes < 2 %}
-plugins:
-  {% if step.gpu != "a100" and step.gpu != "h100" and step.gpu != "h200" and step.gpu != "b200" %}
-  - docker#v5.2.0:
-      image: {{ image }}
-      always-pull: true
-      propagate-environment: true
-      {% if not step.no_gpu %}
-      gpus: all
-      {% endif %}
-      {% if step.label == "Benchmarks" or step.mount_buildkite_agent or cov_enabled %}
-      mount-buildkite-agent: true
-      {% endif %}
-      command: ["bash", "{% if fail_fast == "true" %}-xce{% else %}-xc{% endif %}", "(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ add_docker_pytest_coverage(step, cov_enabled) }}"]
-      environment:
-        - VLLM_USAGE_SOURCE=ci-test
-        - NCCL_CUMEM_HOST_ENABLE=0
-        - HF_HOME={{ hf_home_fsx }}
-        - HF_TOKEN
-        - CODECOV_TOKEN
-        {% if fail_fast == "true" %}
-        - PYTEST_ADDOPTS=-x
-        {% endif %}
-        {% if branch == "main" %}
-        - BUILDKITE_ANALYTICS_TOKEN
-        {% endif %}
-        {% if step.label == "Speculative decoding tests" %}
-        - VLLM_ATTENTION_BACKEND=XFORMERS
-        {% endif %}
-      volumes:
-        - /dev/shm:/dev/shm
-        - {{ hf_home_fsx }}:{{ hf_home_fsx }}
-  {% elif step.gpu == "h200" %}
-   - docker#v5.2.0:
-      image: {{ image }}
-      always-pull: true
-      propagate-environment: true
-      gpus: all
-      command: ["bash", "{% if fail_fast == "true" %}-xce{% else %}-xc{% endif %}", "(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ add_docker_pytest_coverage(step, cov_enabled) }}"]
-      environment:
-        - VLLM_USAGE_SOURCE=ci-test
-        - NCCL_CUMEM_HOST_ENABLE=0
-        - HF_HOME=/benchmark-hf-cache
-        - HF_TOKEN
-        - CODECOV_TOKEN
-        {% if fail_fast == "true" %}
-        - PYTEST_ADDOPTS=-x
-        {% endif %}
-        {% if branch == "main" %}
-        - BUILDKITE_ANALYTICS_TOKEN
-        {% endif %}
-      volumes:
-        - /dev/shm:/dev/shm
-        - /data/benchmark-hf-cache:/benchmark-hf-cache
-        - /data/benchmark-vllm-cache:/root/.cache/vllm
-  {% elif step.gpu == "b200" %}
-   - docker#v5.2.0:
-      image: {{ image }}
-      always-pull: true
-      propagate-environment: true
-      # gpus will be configured by BUILDKITE_PLUGIN_DOCKER_GPUS in per host environment variable.
-      # gpus: all
-      command: ["bash", "{% if fail_fast == "true" %}-xce{% else %}-xc{% endif %}", "(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} && {{ add_docker_pytest_coverage(step, cov_enabled) }}"]
-      environment:
-        - VLLM_USAGE_SOURCE=ci-test
-        - NCCL_CUMEM_HOST_ENABLE=0
-        - HF_HOME=/benchmark-hf-cache
-        - HF_TOKEN
-        - CODECOV_TOKEN
-        {% if fail_fast == "true" %}
-        - PYTEST_ADDOPTS=-x
-        {% endif %}
-        {% if branch == "main" %}
-        - BUILDKITE_ANALYTICS_TOKEN
-        {% endif %}
-      volumes:
-        - /dev/shm:/dev/shm
-        - /data/benchmark-hf-cache:/benchmark-hf-cache
-        - /data/benchmark-vllm-cache:/root/.cache/vllm
-  {% elif step.gpu == "h100" %}
-  - kubernetes:
-      podSpec:
-        containers:
-          - image: {{ image }}
-            command:
-              - bash -c "{{ '(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe) }}"
-            resources:
-              limits:
-                nvidia.com/gpu: {{ step.num_gpus or 1 }}
-            volumeMounts:
-              - name: devshm
-                mountPath: /dev/shm
-              - name: hf-cache
-                mountPath: {{ hf_home }}
-            env:
-              - name: VLLM_USAGE_SOURCE
-                value: ci-test
-              - name: NCCL_CUMEM_HOST_ENABLE
-                value: "0"
-              - name: HF_HOME
-                value: {{ hf_home }}
-        nodeSelector:
-          nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3
-        volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /mnt/hf-cache
-              type: Directory
-  {% else %}
-  - kubernetes:
-      podSpec:
-        priorityClassName: ci
-        containers:
-          - image: {{ image }}
-            command:
-              - bash -c "{{ '(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe) }}"
-            resources:
-              limits:
-                nvidia.com/gpu: {{ step.num_gpus or 1 }}
-            volumeMounts:
-              - name: devshm
-                mountPath: /dev/shm
-              - name: hf-cache
-                mountPath: {{ hf_home }}
-            env:
-              - name: VLLM_USAGE_SOURCE
-                value: ci-test
-              - name: NCCL_CUMEM_HOST_ENABLE
-                value: "0"
-              - name: HF_HOME
-                value: {{ hf_home }}
-              - name: HF_TOKEN
-                valueFrom:
-                  secretKeyRef:
-                    name: hf-token-secret
-                    key: token
-        nodeSelector:
-          nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-        volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: {{ hf_home }}
-              type: Directory
-  {% endif %}
-{% endif %}
-{% endmacro %}
 
   - group: "AMD Tests"
     depends_on: ~
@@ -322,7 +40,7 @@ plugins:
           {% else %}
           queue: amd_mi325_1
           {% endif %}
-        command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" && ")) | safe }}"
+        command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" && ")) | safe }}"
         env:
           DOCKER_BUILDKIT: "1"
         priority: 100

From c94d67ed487c9ae1f5af5e6143bd4d458e5c4d00 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Thu, 18 Dec 2025 02:03:10 +0000
Subject: [PATCH 23/32] remove unwanted print

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 tests/e2e/offline_inference/utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/e2e/offline_inference/utils.py b/tests/e2e/offline_inference/utils.py
index 82c46ff55dd..931e7b506cb 100644
--- a/tests/e2e/offline_inference/utils.py
+++ b/tests/e2e/offline_inference/utils.py
@@ -20,7 +20,6 @@
 VLLM_PATH = Path(__file__).parent.parent.parent
 """Path to root of the vLLM repository."""
 
-print(f"VLLM_PATH: {VLLM_PATH}")
 
 _P = ParamSpec("_P")
 

From 5c104bb4172a1c1221db7fe25a3b48e8529941b8 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Thu, 18 Dec 2025 02:10:37 +0000
Subject: [PATCH 24/32] remove qwen3 omni test related files for now

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .../stage_configs/rocm/qwen3_omni_ci.yaml     | 98 -------------------
 .../e2e/offline_inference/test_qwen3_omni.py  |  6 --
 2 files changed, 104 deletions(-)
 delete mode 100644 tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml

diff --git a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml b/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
deleted file mode 100644
index fb955f72478..00000000000
--- a/tests/e2e/offline_inference/stage_configs/rocm/qwen3_omni_ci.yaml
+++ /dev/null
@@ -1,98 +0,0 @@
-# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
-# Stage 0: Thinker (multimodal understanding + text generation)
-# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
-# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
-
-# The following config has been verified on 2x H100-80G GPUs.
-stage_args:
-  - stage_id: 0
-    runtime:
-      devices: "0,1"
-      max_batch_size: 1
-    engine_args:
-      model_stage: thinker
-      model_arch: Qwen3OmniMoeForConditionalGeneration
-      worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
-      scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-      gpu_memory_utilization: 0.4
-      enforce_eager: true
-      trust_remote_code: true
-      engine_output_type: latent  # Output hidden states for talker
-      distributed_executor_backend: "mp"
-      enable_prefix_caching: false
-      hf_config_name: thinker_config
-      tensor_parallel_size: 2
-      load_format: dummy
-    final_output: true
-    final_output_type: text
-    is_comprehension: true
-    default_sampling_params:
-      temperature: 0.4
-      top_p: 0.9
-      top_k: 1
-      max_tokens: 100
-      seed: 42
-      detokenize: True
-      repetition_penalty: 1.05
-
-  - stage_id: 1
-    runtime:
-       devices: "1"
-       max_batch_size: 1
-    engine_args:
-       model_stage: talker
-       model_arch: Qwen3OmniMoeForConditionalGeneration
-       worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
-       scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
-       gpu_memory_utilization: 0.4
-       enforce_eager: true
-       trust_remote_code: true
-       engine_output_type: latent  # Output codec codes for code2wav
-      #  tensor_parallel_size: 2
-       enable_prefix_caching: false
-       distributed_executor_backend: "mp"
-       hf_config_name: talker_config
-       load_format: dummy
-    engine_input_source: [0]
-    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
-    # final_output: true
-    # final_output_type: text
-    default_sampling_params:
-      temperature: 0.9
-      top_k: 50
-      max_tokens: 100
-      seed: 42
-      detokenize: False
-      repetition_penalty: 1.05
-      stop_token_ids: [2150]
-
-  - stage_id: 2
-    runtime:
-      devices: "0"
-      max_batch_size: 1
-    engine_args:
-      model_stage: code2wav
-      model_arch: Qwen3OmniMoeForConditionalGeneration
-      worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
-      scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
-      enforce_eager: true
-      trust_remote_code: true
-      enable_prefix_caching: false
-      engine_output_type: audio  # Final output: audio waveform
-      gpu_memory_utilization: 0.1
-      distributed_executor_backend: "mp"
-      max_num_batched_tokens: 1000000
-      hf_config_name: thinker_config
-      load_format: dummy
-    engine_input_source: [1]
-    custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
-    final_output: true
-    final_output_type: audio
-    default_sampling_params:
-      temperature: 0.0
-      top_p: 1.0
-      top_k: -1
-      max_tokens: 200
-      seed: 42
-      detokenize: True
-      repetition_penalty: 1.1
diff --git a/tests/e2e/offline_inference/test_qwen3_omni.py b/tests/e2e/offline_inference/test_qwen3_omni.py
index 9fa07839f0e..b43fa836174 100644
--- a/tests/e2e/offline_inference/test_qwen3_omni.py
+++ b/tests/e2e/offline_inference/test_qwen3_omni.py
@@ -10,8 +10,6 @@
 import pytest
 from vllm.assets.video import VideoAsset
 
-from vllm_omni.utils.platform_utils import is_rocm
-
 from .conftest import OmniRunner
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -21,10 +19,6 @@
 # CI stage config for 2xH100-80G GPUs
 stage_configs = [str(Path(__file__).parent / "stage_configs" / "qwen3_omni_ci.yaml")]
 
-if is_rocm():
-    # ROCm stage config optimized for MI325 GPU
-    stage_configs = [str(Path(__file__).parent / "stage_configs" / "rocm" / "qwen3_omni_ci.yaml")]
-
 # Create parameter combinations for model and stage config
 test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
 

From e6d5b320e7d6c71baf2dfd62eeae71f6e9c4b72d Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Fri, 19 Dec 2025 14:42:56 +0000
Subject: [PATCH 25/32] upgrade vllm version to 0.12.0 following main

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 docker/Dockerfile.rocm | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 1885306d3fd..7fabb9c3c68 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -1,8 +1,8 @@
-ARG BASE_IMAGE=rocm/vllm-dev:nightly_main_20251005
+ARG BASE_IMAGE=rocm/vllm-dev:nightly_main_20251205
 FROM ${BASE_IMAGE}
 
 ARG COMMON_WORKDIR=/app
-ARG VLLM_VERSION=v0.11.0
+ARG VLLM_VERSION=v0.12.0
 ARG PYTORCH_ROCM_ARCH="gfx942;gfx950"
 
 WORKDIR ${COMMON_WORKDIR}

From ef7a50d3f0a3eca1d0eb4fde58860d978f050c88 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Sat, 20 Dec 2025 03:01:33 +0000
Subject: [PATCH 26/32] fix import error ModuleNotFoundError: No module named
 'vllm.vllm_flash_attn.layers'

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 vllm_omni/diffusion/layers/rope.py | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/vllm_omni/diffusion/layers/rope.py b/vllm_omni/diffusion/layers/rope.py
index acc0158fc63..8cb1d10d110 100644
--- a/vllm_omni/diffusion/layers/rope.py
+++ b/vllm_omni/diffusion/layers/rope.py
@@ -1,7 +1,13 @@
+from importlib.util import find_spec
+
 import torch
 from einops import rearrange, repeat
+from vllm.logger import init_logger
 
 from vllm_omni.diffusion.layers.custom_op import CustomOp
+from vllm_omni.utils.platform_utils import is_rocm
+
+logger = init_logger(__name__)
 
 
 def rotate_half(x, interleaved=False):
@@ -45,6 +51,20 @@ def __init__(
         super().__init__()
         self.is_neox_style = is_neox_style
         self.interleaved = not is_neox_style
+        self.triton_rotary_emb = None
+        if is_rocm():
+            if find_spec("flash_attn") is not None:
+                from flash_attn.ops.triton.rotary import apply_rotary
+
+                self.triton_rotary_emb = apply_rotary
+            else:
+                logger.warning(
+                    "flash_attn is not installed. Falling back to PyTorch implementation for rotary embeddings."
+                )
+        else:
+            from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
+
+            self.triton_rotary_emb = apply_rotary_emb
 
     def forward_cuda(
         self,
@@ -52,14 +72,15 @@ def forward_cuda(
         cos: torch.Tensor,
         sin: torch.Tensor,
     ) -> torch.Tensor:
-        from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
+        if self.triton_rotary_emb is None:
+            return self.forward_native(x, cos, sin)
 
         if cos.dim() == 3:
             # (B, S, D/2) -> (S, D/2)
             cos = cos[0]
             sin = sin[0]
 
-        return apply_rotary_emb(
+        return self.triton_rotary_emb(
             x,
             cos,
             sin,

From cc225dd32b1572c5a1effae57cce24e6dab7659d Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Sat, 20 Dec 2025 19:46:40 +0000
Subject: [PATCH 27/32] add forward_hip instead of sharing the same path with
 cuda

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml           |  3 ++-
 vllm_omni/diffusion/layers/rope.py | 42 ++++++++++++++++++------------
 2 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 9a47f5e9b04..fa108e1534e 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -42,7 +42,8 @@ steps:
     - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_FIND_MODE=FAST
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
     - export VLLM_ROCM_USE_AITER=1
     - export VLLM_ROCM_USE_AITER_MHA=1
     - export VLLM_ROCM_USE_AITER_LINEAR=0
diff --git a/vllm_omni/diffusion/layers/rope.py b/vllm_omni/diffusion/layers/rope.py
index 8cb1d10d110..7db87416193 100644
--- a/vllm_omni/diffusion/layers/rope.py
+++ b/vllm_omni/diffusion/layers/rope.py
@@ -5,7 +5,6 @@
 from vllm.logger import init_logger
 
 from vllm_omni.diffusion.layers.custom_op import CustomOp
-from vllm_omni.utils.platform_utils import is_rocm
 
 logger = init_logger(__name__)
 
@@ -51,20 +50,11 @@ def __init__(
         super().__init__()
         self.is_neox_style = is_neox_style
         self.interleaved = not is_neox_style
-        self.triton_rotary_emb = None
-        if is_rocm():
-            if find_spec("flash_attn") is not None:
-                from flash_attn.ops.triton.rotary import apply_rotary
+        self.apply_rotary_emb_flash_attn = None
+        if find_spec("flash_attn") is not None:
+            from flash_attn.ops.triton.rotary import apply_rotary
 
-                self.triton_rotary_emb = apply_rotary
-            else:
-                logger.warning(
-                    "flash_attn is not installed. Falling back to PyTorch implementation for rotary embeddings."
-                )
-        else:
-            from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
-
-            self.triton_rotary_emb = apply_rotary_emb
+            self.apply_rotary_emb_flash_attn = apply_rotary
 
     def forward_cuda(
         self,
@@ -72,9 +62,6 @@ def forward_cuda(
         cos: torch.Tensor,
         sin: torch.Tensor,
     ) -> torch.Tensor:
-        if self.triton_rotary_emb is None:
-            return self.forward_native(x, cos, sin)
-
         if cos.dim() == 3:
             # (B, S, D/2) -> (S, D/2)
             cos = cos[0]
@@ -87,6 +74,27 @@ def forward_cuda(
             interleaved=self.interleaved,
         )
 
+    def forward_hip(self, x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+        """Apply rotary embeddings on ROCm, preferring flash_attn's Triton kernel."""
+        if self.apply_rotary_emb_flash_attn is None:
+            # flash_attn is not installed. forward_cuda imports
+            # vllm.vllm_flash_attn.layers.rotary, which was reported missing on
+            # ROCm (ModuleNotFoundError), so use the native implementation.
+            return self.forward_native(x, cos, sin)
+
+        if cos.dim() == 3:
+            # (B, S, D/2) -> (S, D/2)
+            cos = cos[0]
+            sin = sin[0]
+
+        # apply_rotary imported from flash_attn.ops.triton.rotary in __init__.
+        return self.apply_rotary_emb_flash_attn(
+            x,
+            cos,
+            sin,
+            interleaved=self.interleaved,
+        )
+
     def forward_native(
         self,
         x: torch.Tensor,

From 858f74ec7d9b33d308ce1c40d75a9db0ed6a356e Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Sat, 20 Dec 2025 19:59:49 +0000
Subject: [PATCH 28/32] revert forward_cuda

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 vllm_omni/diffusion/layers/rope.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm_omni/diffusion/layers/rope.py b/vllm_omni/diffusion/layers/rope.py
index 7db87416193..528f2425efb 100644
--- a/vllm_omni/diffusion/layers/rope.py
+++ b/vllm_omni/diffusion/layers/rope.py
@@ -62,12 +62,14 @@ def forward_cuda(
         cos: torch.Tensor,
         sin: torch.Tensor,
     ) -> torch.Tensor:
+        from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
+
         if cos.dim() == 3:
             # (B, S, D/2) -> (S, D/2)
             cos = cos[0]
             sin = sin[0]
 
-        return self.triton_rotary_emb(
+        return apply_rotary_emb(
             x,
             cos,
             sin,

From 5695e2f31acadd26a6b9019ec1521408b9872ab3 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Sun, 21 Dec 2025 01:27:05 +0000
Subject: [PATCH 29/32] add forward hip dispatching logic

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 .buildkite/test-amd.yaml                |  6 ++++--
 vllm_omni/diffusion/layers/custom_op.py | 10 ++++++++--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index fa108e1534e..57008e59f53 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -8,7 +8,8 @@ steps:
   grade: Blocking
   commands:
     - export GPU_ARCHS=gfx942
-    - export MIOPEN_FIND_MODE=FAST
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
     - export VLLM_ROCM_USE_AITER=1
     - export VLLM_ROCM_USE_AITER_MHA=1
     - export VLLM_ROCM_USE_AITER_LINEAR=0
@@ -25,7 +26,8 @@ steps:
     - export GPU_ARCHS=gfx942
     - export VLLM_LOGGING_LEVEL=DEBUG
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - export MIOPEN_FIND_MODE=FAST
+    - export MIOPEN_DEBUG_CONV_DIRECT=0
+    - export MIOPEN_DEBUG_CONV_GEMM=0
     - export VLLM_ROCM_USE_AITER=1
     - export VLLM_ROCM_USE_AITER_MHA=1
     - export VLLM_ROCM_USE_AITER_LINEAR=0
diff --git a/vllm_omni/diffusion/layers/custom_op.py b/vllm_omni/diffusion/layers/custom_op.py
index 461da0d361e..0bf5c4f60ee 100644
--- a/vllm_omni/diffusion/layers/custom_op.py
+++ b/vllm_omni/diffusion/layers/custom_op.py
@@ -3,7 +3,7 @@
 
 import torch.nn as nn
 
-from vllm_omni.utils.platform_utils import detect_device_type
+from vllm_omni.utils.platform_utils import detect_device_type, is_rocm
 
 
 class CustomOp(nn.Module):
@@ -18,7 +18,9 @@ def __init__(self) -> None:
         self._forward_method = self.dispatch_forward()
 
     def dispatch_forward(self) -> Callable:
-        if self.is_cuda:
+        if is_rocm():
+            return self.forward_hip
+        elif self.is_cuda:
             return self.forward_cuda
         else:
             return self.forward_native
@@ -36,3 +38,7 @@ def forward_native(self, *args, **kwargs):
 
     def forward_cuda(self, *args, **kwargs):
         raise NotImplementedError
+
+    def forward_hip(self, *args, **kwargs):
+        # By default, we assume that HIP ops are compatible with CUDA ops.
+        return self.forward_cuda(*args, **kwargs)

From 32233aeec5a92c2a85b34a93b72f2ecd093afb5f Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Sun, 21 Dec 2025 02:03:47 +0000
Subject: [PATCH 30/32] try to do torch sync when destructing omni runner in
 tests

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 tests/e2e/offline_inference/conftest.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/e2e/offline_inference/conftest.py b/tests/e2e/offline_inference/conftest.py
index a24c63bff7c..b9a72dd9aeb 100644
--- a/tests/e2e/offline_inference/conftest.py
+++ b/tests/e2e/offline_inference/conftest.py
@@ -7,6 +7,7 @@
 from typing import Any
 
 import pytest
+import torch
 from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
 from vllm.sampling_params import SamplingParams
 
@@ -334,6 +335,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
         del self.omni
         cleanup_dist_env_and_memory()
+        torch.cuda.synchronize()
 
     def close(self):
         """Close and cleanup the Omni instance."""

From dd3e6dbfe7f14a9829068f045706613b29f2fddd Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Sun, 21 Dec 2025 02:36:03 +0000
Subject: [PATCH 31/32] revert the create_new_process_for_each_test for
 test_qwen25omni

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 tests/e2e/offline_inference/conftest.py          | 2 --
 tests/e2e/offline_inference/test_qwen2_5_omni.py | 3 +++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/e2e/offline_inference/conftest.py b/tests/e2e/offline_inference/conftest.py
index b9a72dd9aeb..a24c63bff7c 100644
--- a/tests/e2e/offline_inference/conftest.py
+++ b/tests/e2e/offline_inference/conftest.py
@@ -7,7 +7,6 @@
 from typing import Any
 
 import pytest
-import torch
 from vllm.distributed.parallel_state import cleanup_dist_env_and_memory
 from vllm.sampling_params import SamplingParams
 
@@ -335,7 +334,6 @@ def __exit__(self, exc_type, exc_val, exc_tb):
         self.close()
         del self.omni
         cleanup_dist_env_and_memory()
-        torch.cuda.synchronize()
 
     def close(self):
         """Close and cleanup the Omni instance."""
diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py b/tests/e2e/offline_inference/test_qwen2_5_omni.py
index f5f9894b386..63eea1ba26d 100644
--- a/tests/e2e/offline_inference/test_qwen2_5_omni.py
+++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py
@@ -16,6 +16,7 @@
 from vllm_omni.utils import is_npu, is_rocm
 
 from .conftest import OmniRunner
+from .utils import create_new_process_for_each_test
 
 models = ["Qwen/Qwen2.5-Omni-3B"]
 
@@ -34,6 +35,7 @@
 
 @pytest.mark.core_model
 @pytest.mark.parametrize("test_config", test_params)
+@create_new_process_for_each_test()
 def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
     """Test processing audio, image, and video together, generating audio output."""
     model, stage_config_path = test_config
@@ -90,6 +92,7 @@ def test_mixed_modalities_to_audio(omni_runner: type[OmniRunner], test_config: t
 
 @pytest.mark.core_model
 @pytest.mark.parametrize("test_config", test_params)
+@create_new_process_for_each_test()
 def test_mixed_modalities_to_text_only(omni_runner: type[OmniRunner], test_config: tuple[str, str]) -> None:
     """Test processing audio, image, and video together, generating audio output."""
     model, stage_config_path = test_config

From fb1b1d87bd0d90b0e5f5b8807c5c1d466d41a26f Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Sun, 21 Dec 2025 03:10:43 +0000
Subject: [PATCH 32/32] fix create_new_process_for_each_test

Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
---
 tests/e2e/offline_inference/utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/offline_inference/utils.py b/tests/e2e/offline_inference/utils.py
index 931e7b506cb..c491c10b91e 100644
--- a/tests/e2e/offline_inference/utils.py
+++ b/tests/e2e/offline_inference/utils.py
@@ -195,7 +195,11 @@ def create_new_process_for_each_test(
         A decorator to run test functions in separate processes.
     """
     if method is None:
-        use_spawn = current_platform.is_rocm() or current_platform.is_xpu()
+        # TODO: Find out why `spawn` does not work correctly on ROCm:
+        # the test body never runs and the test passes immediately.
+        # For now, use `fork` on ROCm, where the tests actually run
+        # and behave correctly.
+        use_spawn = current_platform.is_xpu()
         method = "spawn" if use_spawn else "fork"
 
     assert method in ["spawn", "fork"], "Method must be either 'spawn' or 'fork'"