Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
4fa5062
reafator pipeline stage/step api
asukaqaq-s Feb 13, 2026
fcef0bb
refactor(diffusion): pass scheduler explicitly in stepwise pipeline a…
asukaqaq-s Feb 28, 2026
a7ece4a
[Refactor] Refactor Diffusion Scheduler/Executor Boundaries and Reque…
yJader Mar 3, 2026
92b2c48
[Bugfix] Handle output error in DiffusionEngine's dummy run
yJader Mar 3, 2026
727a653
Merge branch 'tmp/pipeline-rebase' into dev-exp
asukaqaq-s Mar 4, 2026
1f09e7c
Merge branch 'tmp/scheduler-rebase' into dev-exp
asukaqaq-s Mar 4, 2026
9675230
feat(diffusion): implement new scheduler architecture with RequestSch…
yJader Mar 8, 2026
d8f93b5
feat(diffusion/scheduler): add _max_batch_size (enforce 1)
yJader Mar 8, 2026
6408230
refactor runner to support stepwise
asukaqaq-s Mar 9, 2026
8c254af
support step/request execute level on executor/worker
asukaqaq-s Mar 9, 2026
140cea1
feat(scheduler): update output handling to use RunnerOutput and impro…
yJader Mar 9, 2026
3cd352a
support step execution and abort
asukaqaq-s Mar 9, 2026
7ffd42d
modify engine layer to support abort(draft)
asukaqaq-s Mar 10, 2026
2f6663c
upgrade test and fix error
asukaqaq-s Mar 10, 2026
07073b0
refactor: align RunnerOutput
yJader Mar 10, 2026
bf089fc
refactor(diffusion): unify scheduler request id naming and generation
yJader Mar 10, 2026
9f19fe4
fix: avoid update finished request in scheduler
yJader Mar 10, 2026
faaeee2
feat(diffusion): add async abort support in AsyncOmniDiffusion
asukaqaq-s Mar 10, 2026
0ba4378
refactor(diffusion): extract base scheduler and fix update_from_outpu…
yJader Mar 10, 2026
68ad5db
fix: support abort batched request
yJader Mar 10, 2026
8a14df9
fix: tests/diffusion/test_multiproc_engine_concurrency.py engine setu…
yJader Mar 10, 2026
f7d9a87
feat(diffusion): implement better abort handling in DiffusionEngine a…
yJader Mar 10, 2026
15d4979
fix(qwen-image): isolate stepwise schedule state and refactor
asukaqaq-s Mar 10, 2026
95a9933
refactor(diffusion): split scheduler output into new and cached requests
yJader Mar 11, 2026
b9d3de9
[Feat] Request cancel and engine abort for image generation
Semmer2 Mar 12, 2026
f32777b
refactor(diffusion): move request ID mapping into scheduler and simpl…
yJader Mar 13, 2026
6bdb55a
[Draft][Feat] DiffusionEngine support async infer
Semmer2 Mar 18, 2026
98a2ce4
feat(diffusion): add experimental cachepool restore for stepwise mode
asukaqaq-s Mar 18, 2026
7de0117
squash merge pr/scheduler(#1625)
yJader Mar 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
38 changes: 38 additions & 0 deletions .buildkite/release-pipeline.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Release pipeline entry point.
# One step builds and publishes the CI image tagged with the commit SHA; the
# remaining steps wait for that image and then upload the follow-up test
# pipelines.
steps:
  # Build the CI image from docker/Dockerfile.ci and push it to public ECR.
  - key: image-build
    label: ":docker: Build image"
    agents:
      queue: cpu_queue_premerge
    commands:
      - 'aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7'
      - 'docker build --progress=plain --file docker/Dockerfile.ci -t vllm-omni-ci .'
      - 'docker tag vllm-omni-ci public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT'
      - 'docker push public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT'

  # L2 Test
  - key: upload-ready-pipeline
    label: "Upload Ready Pipeline"
    depends_on: image-build
    agents:
      queue: cpu_queue_premerge
    commands:
      - 'buildkite-agent pipeline upload .buildkite/test-ready.yml'

  # L3 Test
  - key: upload-merge-pipeline
    label: "Upload Merge Pipeline"
    depends_on: image-build
    agents:
      queue: cpu_queue_premerge
    commands:
      - 'buildkite-agent pipeline upload .buildkite/test-merge.yml'

  # L4 Test
  # Uploaded only when the build environment sets NIGHTLY to "1".
  - key: upload-nightly-pipeline
    label: "Upload Nightly Pipeline"
    depends_on: image-build
    if: 'build.env("NIGHTLY") == "1"'
    agents:
      queue: cpu_queue_premerge
    commands:
      - 'buildkite-agent pipeline upload .buildkite/test-nightly.yml'
56 changes: 56 additions & 0 deletions .buildkite/scripts/hardware_ci/run-xpu-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash

# Builds the XPU Docker images for the current commit and runs the XPU-marked
# pytest suites inside a throwaway container.
#
# Required environment:
#   VLLM_VERSION      tag of the base image; also passed as a build argument.
#   BUILDKITE_COMMIT  tag of the per-commit image and part of the container name.
# Optional environment:
#   HF_CACHE          host Hugging Face cache directory
#                     (defaults to ~/.cache/huggingface).
#   VLLM_LOGGING_LEVEL, VLLM_OMNI_LOGGING_LEVEL, HF_TOKEN, ZE_AFFINITY_MASK
#                     passed to `docker run` via `-e NAME`.
set -ex

omni_source_dir=$(git rev-parse --show-toplevel)

base_image_name="xpu/vllm-omni-ci-base:${VLLM_VERSION:?VLLM_VERSION must be set}"
image_name="xpu/vllm-omni-ci:${BUILDKITE_COMMIT:?BUILDKITE_COMMIT must be set}"
# Random 10-character suffix keeps container names unique across retries of
# the same commit.
container_name="xpu_${BUILDKITE_COMMIT}_$(
  tr -dc A-Za-z0-9 </dev/urandom | head -c 10
  echo
)"

# Best-effort cleanup of the container, the per-commit image and dangling
# Docker data. Every command is allowed to fail so that cleanup never masks
# the real exit status of the script.
remove_docker_container() {
  docker rm -f "${container_name}" || true
  docker image rm -f "${image_name}" || true
  docker system prune -f || true
}
# Register the trap before any Docker work so that a failed or interrupted
# build is cleaned up too, not only a failed test run.
trap remove_docker_container EXIT

cd "${omni_source_dir}"

# Build the base image only when no image with that tag exists locally.
if [ -z "$(docker images -q "${base_image_name}")" ]; then
  docker build --target vllm-base -t "${base_image_name}" --build-arg "VLLM_VERSION=${VLLM_VERSION}" -f docker/Dockerfile.xpu .
fi

# Build the per-commit test image on top of the base image.
docker build --build-arg "VLLM_BASE=${base_image_name}" --build-arg "VLLM_VERSION=${VLLM_VERSION}" -t "${image_name}" -f docker/Dockerfile.xpu .

# Share the host Hugging Face cache with the container.
HF_CACHE="${HF_CACHE:-$(realpath ~)/.cache/huggingface}"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"

# Run both marker suites with a 30-minute limit; `-k 30` sends SIGKILL if the
# command is still alive 30 seconds after the initial timeout signal.
time timeout -k 30 30m docker run \
  --device /dev/dri:/dev/dri \
  --net=host \
  --ipc=host \
  -v /dev/dri/by-path:/dev/dri/by-path \
  -v "${HF_CACHE}:${HF_MOUNT}" \
  --security-opt seccomp=unconfined \
  --entrypoint="" \
  -e VLLM_LOGGING_LEVEL \
  -e VLLM_OMNI_LOGGING_LEVEL \
  -e HF_TOKEN \
  -e ZE_AFFINITY_MASK \
  --name "${container_name}" \
  "${image_name}" \
  bash -c '
    set -e
    echo "$ZE_AFFINITY_MASK"
    pip install tblib==3.1.0
    cd /workspace/vllm-omni
    pytest -v -s -m "core_model and xpu and B60"
    pytest -v -s -m "advanced_model and xpu and B60"
  '
69 changes: 65 additions & 4 deletions .buildkite/test-nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,72 @@ steps:
- docker#v5.2.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
always-pull: true
propagate-environment: true
shm-size: "8gb"
environment:
- "HF_HOME=/fsx/hf_cache"
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"

# Diffusion expansion tests for online serving, scheduled on the H100 queue
# with two GPUs requested for the test container.
- label: ":full_moon: Diffusion Model Test with H100"
  depends_on: upload-nightly-pipeline
  timeout_in_minutes: 60
  # NOTE(review): the NIGHTLY gate below is commented out, unlike the
  # neighbouring nightly steps — confirm whether that is intentional.
  # if: build.env("NIGHTLY") == "1"
  agents:
    queue: mithril-h100-pool
  commands:
    - 'export VLLM_WORKER_MULTIPROC_METHOD=spawn'
    - 'pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and diffusion and H100" --run-level "advanced_model"'
  plugins:
    - kubernetes:
        podSpec:
          nodeSelector:
            node.kubernetes.io/instance-type: gpu-h100-sxm
          containers:
            - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
              resources:
                limits:
                  nvidia.com/gpu: 2
              env:
                # Points HF_HOME at the mounted hf-cache volume.
                - name: HF_HOME
                  value: /root/.cache/huggingface
              volumeMounts:
                - name: devshm
                  mountPath: /dev/shm
                - name: hf-cache
                  mountPath: /root/.cache/huggingface
          volumes:
            # Memory-backed emptyDir mounted at /dev/shm.
            - name: devshm
              emptyDir:
                medium: Memory
            # Host directory mounted as the Hugging Face cache.
            - name: hf-cache
              hostPath:
                path: /mnt/hf-cache
                type: DirectoryOrCreate

# Nightly-only run of the TestQwen3TTSNoAsyncChunk class; the model is
# downloaded before pytest starts.
- label: ":full_moon: Qwen3-TTS Non-Async-Chunk E2E Test"
  depends_on: upload-nightly-pipeline
  if: 'build.env("NIGHTLY") == "1"'
  timeout_in_minutes: 30
  agents:
    queue: gpu_4_queue
  commands:
    - |
      huggingface-cli download Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice
      export VLLM_WORKER_MULTIPROC_METHOD=spawn
      pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py::TestQwen3TTSNoAsyncChunk
  plugins:
    - docker#v5.2.0:
        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
        always-pull: true
        propagate-environment: true
        shm-size: 8gb
        # HF_HOME points at the cache directory mounted from the host below.
        environment:
          - HF_HOME=/fsx/hf_cache
        volumes:
          - '/fsx/hf_cache:/fsx/hf_cache'

- label: ":full_moon: Omni Model Perf Test with H100"
- label: ":full_moon: Omni Model Perf Test & Test Case Statistics"
key: nightly-performance
timeout_in_minutes: 180
depends_on: upload-nightly-pipeline
Expand All @@ -72,6 +129,8 @@ steps:
- export BENCHMARK_DIR=tests
- pytest -s -v tests/perf/scripts/run_benchmark.py
- buildkite-agent artifact upload "tests/*.json"
- python tools/nightly/buildkite_testcase_statistics.py -o tests/buildkite_testcase_statistics.html
- buildkite-agent artifact upload "tests/*.html"
agents:
queue: "mithril-h100-pool"
plugins:
Expand Down Expand Up @@ -101,17 +160,19 @@ steps:
path: /mnt/hf-cache
type: DirectoryOrCreate

- label: ":email: Nightly Perf Collection & Email"
- label: ":email: Nightly Collection & Email"
key: nightly-perf-distribution
depends_on: nightly-performance
depends_on:
- nightly-performance
if: build.env("NIGHTLY") == "1"
commands:
- pip install openpyxl
- export DEFAULT_INPUT_DIR=tests
- export DEFAULT_OUTPUT_DIR=tests
- buildkite-agent artifact download "tests/*.json" . --step nightly-performance
- buildkite-agent artifact download "tests/*.html" . --step nightly-performance
- python tools/nightly/generate_nightly_perf_excel.py
- python tools/nightly/send_nightly_perf_email.py
- python tools/nightly/send_nightly_email.py --report-file "tests/*.xlsx, tests/*.html"
- buildkite-agent artifact upload "tests/*.xlsx"
agents:
queue: "cpu_queue_premerge"
163 changes: 117 additions & 46 deletions .buildkite/test-ready.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,7 @@ steps:
- label: "Diffusion Model CPU offloading Test"
depends_on: upload-ready-pipeline
commands:
- |
timeout 20m bash -c '
set +e
pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
EXIT1=$$?
pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
EXIT2=$$?
exit $$((EXIT1 | EXIT2))
'
- timeout 10m pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
agents:
queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
plugins:
Expand Down Expand Up @@ -214,9 +206,10 @@ steps:
commands:
- |
timeout 20m bash -c '
huggingface-cli download Qwen/Qwen3-TTS-12Hz-0.6B-CustomVoice
export VLLM_LOGGING_LEVEL=DEBUG
export VLLM_WORKER_MULTIPROC_METHOD=spawn
pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py
pytest -s -v tests/e2e/online_serving/test_qwen3_tts.py -k "not NoAsyncChunk"
'
agents:
queue: "gpu_4_queue"
Expand Down Expand Up @@ -268,39 +261,117 @@ steps:
# path: /mnt/hf-cache
# type: DirectoryOrCreate

# - label: "Bagel Text2Img Model Test with H100"
# depends_on: upload-ready-pipeline
# commands:
# - |
# timeout 30m bash -c '
# export VLLM_WORKER_MULTIPROC_METHOD=spawn
# pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
# '
# agents:
# queue: "mithril-h100-pool"
# plugins:
# - kubernetes:
# podSpec:
# containers:
# - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# resources:
# limits:
# nvidia.com/gpu: 1
# volumeMounts:
# - name: devshm
# mountPath: /dev/shm
# - name: hf-cache
# mountPath: /root/.cache/huggingface
# env:
# - name: HF_HOME
# value: /root/.cache/huggingface
# nodeSelector:
# node.kubernetes.io/instance-type: gpu-h100-sxm
# volumes:
# - name: devshm
# emptyDir:
# medium: Memory
# - name: hf-cache
# hostPath:
# path: /mnt/hf-cache
# type: DirectoryOrCreate
# Bagel text-to-image offline inference test on one H100 GPU; the shell
# wrapper limits the run to 30 minutes.
- label: "Bagel Text2Img Model Test with H100"
  depends_on: upload-ready-pipeline
  agents:
    queue: mithril-h100-pool
  commands:
    - |
      timeout 30m bash -c '
        export VLLM_WORKER_MULTIPROC_METHOD=spawn
        export VLLM_TEST_CLEAN_GPU_MEMORY=1
        pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py
      '
  plugins:
    - kubernetes:
        podSpec:
          nodeSelector:
            node.kubernetes.io/instance-type: gpu-h100-sxm
          containers:
            - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
              resources:
                limits:
                  nvidia.com/gpu: 1
              env:
                # Points HF_HOME at the mounted hf-cache volume.
                - name: HF_HOME
                  value: /root/.cache/huggingface
              volumeMounts:
                - name: devshm
                  mountPath: /dev/shm
                - name: hf-cache
                  mountPath: /root/.cache/huggingface
          volumes:
            # Memory-backed emptyDir mounted at /dev/shm.
            - name: devshm
              emptyDir:
                medium: Memory
            # Host directory mounted as the Hugging Face cache.
            - name: hf-cache
              hostPath:
                path: /mnt/hf-cache
                type: DirectoryOrCreate

# Bagel image-to-image offline inference test on one H100 GPU; the shell
# wrapper limits the run to 30 minutes.
- label: "Bagel Img2Img Model Test with H100"
  depends_on: upload-ready-pipeline
  agents:
    queue: mithril-h100-pool
  commands:
    - |
      timeout 30m bash -c '
        export VLLM_WORKER_MULTIPROC_METHOD=spawn
        export VLLM_TEST_CLEAN_GPU_MEMORY=1
        pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py
      '
  plugins:
    - kubernetes:
        podSpec:
          nodeSelector:
            node.kubernetes.io/instance-type: gpu-h100-sxm
          containers:
            - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
              resources:
                limits:
                  nvidia.com/gpu: 1
              env:
                # Points HF_HOME at the mounted hf-cache volume.
                - name: HF_HOME
                  value: /root/.cache/huggingface
              volumeMounts:
                - name: devshm
                  mountPath: /dev/shm
                - name: hf-cache
                  mountPath: /root/.cache/huggingface
          volumes:
            # Memory-backed emptyDir mounted at /dev/shm.
            - name: devshm
              emptyDir:
                medium: Memory
            # Host directory mounted as the Hugging Face cache.
            - name: hf-cache
              hostPath:
                path: /mnt/hf-cache
                type: DirectoryOrCreate

# Bagel online-serving end-to-end test on one H100 GPU; the shell wrapper
# limits the run to 40 minutes and sets VLLM_IMAGE_FETCH_TIMEOUT to 60.
- label: "Bagel Online Serving Test with H100"
  depends_on: upload-ready-pipeline
  agents:
    queue: mithril-h100-pool
  commands:
    - |
      timeout 40m bash -c '
        export VLLM_WORKER_MULTIPROC_METHOD=spawn
        export VLLM_TEST_CLEAN_GPU_MEMORY=1
        export VLLM_IMAGE_FETCH_TIMEOUT=60
        pytest -s -v tests/e2e/online_serving/test_bagel_online.py
      '
  plugins:
    - kubernetes:
        podSpec:
          nodeSelector:
            node.kubernetes.io/instance-type: gpu-h100-sxm
          containers:
            - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
              resources:
                limits:
                  nvidia.com/gpu: 1
              env:
                # Points HF_HOME at the mounted hf-cache volume.
                - name: HF_HOME
                  value: /root/.cache/huggingface
              volumeMounts:
                - name: devshm
                  mountPath: /dev/shm
                - name: hf-cache
                  mountPath: /root/.cache/huggingface
          volumes:
            # Memory-backed emptyDir mounted at /dev/shm.
            - name: devshm
              emptyDir:
                medium: Memory
            # Host directory mounted as the Hugging Face cache.
            - name: hf-cache
              hostPath:
                path: /mnt/hf-cache
                type: DirectoryOrCreate
Loading